In [1]:
from typing import List

def split_into_chunks(doc_file: str) -> List[str]:
    with open(doc_file, 'r') as file:
        content = file.read()
    return [chunk for chunk in content.split(".")]

# 主程序部分
chunks = split_into_chunks("valid.txt")

for i, chunk in enumerate(chunks[:5]):
    print(f"[{i}] {chunk}\n")

[0]  consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology has taken a new leap in <unk> and television programmers are racing to exploit the possibilities 
 eventually viewers may grow <unk> with the technology and <unk> the cost 
 but right now programmers are figuring that viewers who are busy dialing up a range of services may put down their <unk> control <unk> and stay <unk> 
 we 've been spending a lot of time in los angeles talking to tv production people says mike parks president of call interactive which supplied technology for both abc sp

In [2]:
from sentence_transformers import SentenceTransformer
from typing import List

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunk(chunk: str) -> List[float]:
    embedding = embedding_model.encode(chunk)
    return embedding.tolist()

test_embedding = embed_chunk("test")
print(len(test_embedding))
print(test_embedding)


384
[0.011573436670005322, 0.025136202573776245, -0.03670184686779976, 0.05932488292455673, -0.0071490490809082985, -0.04119422286748886, 0.0770873948931694, 0.03744256868958473, 0.01244899071753025, -0.006117628887295723, 0.017034275457262993, -0.07701538503170013, -0.00039416426443494856, 0.027909062802791595, -0.015989158302545547, -0.06827527284622192, 0.008884645998477936, -0.020280703902244568, -0.08035995811223984, -0.013074046932160854, -0.04110001400113106, -0.025898080319166183, -0.0265386700630188, 0.03305228799581528, -0.022079195827245712, 0.021046103909611702, -0.05792200192809105, 0.03294876217842102, 0.02970738522708416, -0.06224840506911278, 0.038788024336099625, 0.03199068829417229, 0.015330815687775612, 0.0453069731593132, 0.05314944311976433, 0.013360676355659962, 0.041224926710128784, 0.028142910450696945, 0.019398434087634087, -0.0032523232512176037, -0.0036123408935964108, -0.14286024868488312, 0.0380711704492569, -0.010916205123066902, 0.02609400637447834, 0.041

In [3]:
embeddings = [embed_chunk(chunk) for chunk in chunks]
print(len(embeddings))
print(embeddings[0])

1256
[-0.07880879193544388, -0.027121834456920624, 0.0015604799846187234, -0.10914729535579681, -0.029900383204221725, 0.041020531207323074, 0.07653851807117462, 0.071550652384758, 0.07218211889266968, -0.012317556887865067, -0.11345069855451584, 0.04731909930706024, -0.005436739884316921, -0.020033372566103935, 0.015752729028463364, -0.1001499593257904, 0.099742092192173, -0.09876429289579391, -0.06106129288673401, 0.010303838178515434, 0.034038055688142776, -0.00808548741042614, -0.039469193667173386, -0.036516621708869934, 0.014806985855102539, -0.008767597377300262, -0.0428491048514843, 0.0014583187876269221, 0.051867999136447906, -0.049174726009368896, 0.02268074080348015, 0.1050114780664444, 0.12340041249990463, 0.011990927159786224, -0.0813404768705368, -0.022908460348844528, -0.031883250921964645, 0.01953643560409546, -0.04718536138534546, 0.056464143097400665, -0.012046467512845993, -0.07143338024616241, -0.06021212041378021, 0.006734916474670172, -0.009698360227048397, -0.044

In [4]:
import chromadb

chromadb_client = chromadb.EphemeralClient()
chromadb_collection = chromadb_client.get_or_create_collection(name="default")

def save_embeddings(chunks:List[str], embeddings:List[List[float]]) -> None:
    ids = [str(i) for i in range(len(chunks))]
    chromadb_collection.add(
        documents=chunks,
        embeddings=embeddings,
        ids=ids
    )

save_embeddings(chunks,embeddings)

In [5]:
def retrieve(query: str, top_k: int) -> List[str]:
    query_embedding = embed_chunk(query)
    results = chromadb_collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
        include=["documents", "distances"]  # 建议带上距离，方便观察
    )
    return results['documents'][0]

# 测试
query = "Why are audiences getting tired of interactive telephone technology?"
retrieved_chunks = retrieve(query, 5)

for i, chunk in enumerate(retrieved_chunks):
    print(f"[{i}] {chunk}\n")


[0]  consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology has taken a new leap in <unk> and television programmers are racing to exploit the possibilities 
 eventually viewers may grow <unk> with the technology and <unk> the cost 
 but right now programmers are figuring that viewers who are busy dialing up a range of services may put down their <unk> control <unk> and stay <unk> 
 we 've been spending a lot of time in los angeles talking to tv production people says mike parks president of call interactive which supplied technology for both abc sp

In [6]:
from sentence_transformers import CrossEncoder

def rerank(query:str, retrieved_chunks:List[str],top_k:int) -> List[str]:
    cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    pairs=[(query,chunk) for chunk in retrieved_chunks]
    scores=cross_encoder.predict(pairs)

    chunk_with_score_list=[(chunk,score) 
                           for chunk,score in zip(retrieved_chunks, scores)]
    chunk_with_score_list.sort(key=lambda pair:pair[1],reverse=True)
    return [chunk for chunk,_ in chunk_with_score_list][:top_k]

reranked_chunks = rerank(query, retrieved_chunks,3)

for i,chunk in enumerate(reranked_chunks):
    print(f"[{i}]{chunk}\n")

[0] consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology has taken a new leap in <unk> and television programmers are racing to exploit the possibilities 
 eventually viewers may grow <unk> with the technology and <unk> the cost 
 but right now programmers are figuring that viewers who are busy dialing up a range of services may put down their <unk> control <unk> and stay <unk> 
 we 've been spending a lot of time in los angeles talking to tv production people says mike parks president of call interactive which supplied technology for both abc spo

In [7]:
from dotenv import load_dotenv
from google import genai


load_dotenv()
google_client = genai.Client()

def generate(query:str, chunks:List[str]) ->str:
    prompt = f"""You're a knowledge assistant,please answer questions according to the user's request and the following information. 
    the user's question:{query}
    relevant information:
    {"\n\n".join(chunks)}
    Please answer according to the information mentioned above.Do not make up information."""

    print(f"{prompt}\n\n---\n")

    response = google_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )
    return response.text

answer = generate(query,reranked_chunks)
print(answer)


You're a knowledge assistant,please answer questions according to the user's request and the following information. 
    the user's question:Why are audiences getting tired of interactive telephone technology?
    relevant information:
     consumers may want to move their telephones a little closer to the tv set 
 <unk> <unk> watching abc 's monday night football can now vote during <unk> for the greatest play in N years from among four or five <unk> <unk> 
 two weeks ago viewers of several nbc <unk> consumer segments started calling a N number for advice on various <unk> issues 
 and the new syndicated reality show hard copy records viewers ' opinions for possible airing on the next day 's show 
 interactive telephone technology has taken a new leap in <unk> and television programmers are racing to exploit the possibilities 
 eventually viewers may grow <unk> with the technology and <unk> the cost 
 but right now programmers are figuring that viewers who are busy dialing up a range o