In [1]:
import os
import torch
from transformers import pipeline, AutoTokenizer, GenerationConfig, PhiForCausalLM
from langchain.document_loaders import TextLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load and split local knowledge documents
Here we take the encyclopedia entry of 'Kuaizhou-1A' launched on January 11, 2024, as an example (https://baike.baidu.com/item/Kuaizhou-1A)

In [2]:
# Load local word vector model, using https://huggingface.co/BAAI/bge-base-zh
# model_name = "./data/BAAI_bge-base-zh"
model_name = "BAAI/bge-base-zh"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}

embedding = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
                query_instruction="Generate vector representations of text for text retrieval"
            )

In [3]:
doc_db_save_dir = './model_save/vector'

if not os.path.exists(doc_db_save_dir):

    # 1. Load local dataset from file
    loader = TextLoader("./data/Kuaizhou-1A.txt")
    documents = loader.load()

    # 2. Split documents
    text_splitter = CharacterTextSplitter(chunk_size=96, chunk_overlap=8)
    splited_documents = text_splitter.split_documents(documents)

    # 3. Vectorize and save to local directory

    db = Chroma.from_documents(splited_documents, embedding, persist_directory=doc_db_save_dir)
    db.persist()
else:
    db = Chroma(persist_directory=doc_db_save_dir,  embedding_function=embedding)

Created a chunk of size 261, which is longer than the specified 96
Created a chunk of size 963, which is longer than the specified 96
Created a chunk of size 551, which is longer than the specified 96
Created a chunk of size 499, which is longer than the specified 96
Created a chunk of size 104, which is longer than the specified 96


# Load the dialogue model and construct the dialogue prompt

In [4]:
model_id = './model_save/dpo/'

model = PhiForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

phi_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device=device)

In [5]:
question = "What is the near-Earth orbit payload capacity of Kuaizhou-1A?"

In [6]:
# Construct the prompt
template = "Please answer the following question based on the given background knowledge. If you do not know the information, directly answer 'No relevant answer found'.\nBelow is the background knowledge:\n"

similar_docs = db.similarity_search(question, k = 1)
for i, doc in enumerate(similar_docs):
    template += f"{i}. {doc.page_content}"

template += f'\nBelow is the question:\n{question}'
print(template)

Please answer the following question based on the given background knowledge. If you do not know the information, directly answer 'No relevant answer found'.
Below is the background knowledge:
0. Kuaizhou-1A:
Kuaizhou-1A (English: Kuaizhou-1A, abbreviated: KZ-1A) is a three-stage solid-fuel rocket developed by the China Aerospace Science and Industry Corporation Rocket Technology Company.
The Kuaizhou-1A rocket is about 20 meters long, weighs about 30 tons at lift-off, has a maximum fairing diameter of 1.4 meters, a payload capacity of 200 kilograms to sun-synchronous circular orbit at 700 kilometers, and a near-Earth orbit payload capacity of 300 kilograms. The rocket uses a vehicle-mounted mobile launch method, mainly targeting micro-satellite launches and networking, and is capable of launching multiple satellites with one rocket.
On January 11, 2024, at 11:52, China successfully launched the Tianxing-1 No. 02 satellite into space using the Kuaizhou-1A rocket from the Jiuquan Satell

In [7]:
prompt = f"##Question:\n{template}\n##Answer:\n"
outputs = phi_pipe(prompt, num_return_sequences=1, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id)

print(outputs[0]['generated_text'][len(prompt): ])

The near-Earth orbit payload capacity of Kuaizhou-1A is 300 kilograms.
