> 内容来自B站视频《B站首推！2025最新版大模型RAG入门到精通实战教程！手把手带你结合企业级项目实战完成一套完整的RAG项目！增加检索/文本向量/知识库搭建》
# 文档的加载与切割 

In [None]:
!pip install openai

In [None]:
# 安装 pdf 解析库
!pip install pdfminer.six 

In [None]:
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

In [None]:
def extract_text_from_pdf(filename,page_numbers=None,min_line_length=1):
    """Extract text from a PDF file and regroup it into paragraphs.

    Args:
        filename: PDF to read; forwarded unchanged to ``extract_pages``.
        page_numbers: Optional container of 0-based page indices to keep.
            ``None`` keeps every page.
        min_line_length: Lines shorter than this are treated as paragraph
            separators and are not included in the output.

    Returns:
        A list of paragraph strings. Lines of one paragraph are joined with a
        single space, and every paragraph starts with one leading space (kept
        so that output for ordinary lines is unchanged). A line ending in
        ``-`` is treated as a hyphenated line break: the hyphen is dropped and
        the next line is appended directly, so "multi-" + "lingual" becomes
        "multilingual". A genuine hyphen at the end of a line is removed too.
    """
    # Collect the text of every text container on the selected pages.
    chunks = []
    for i, page_layout in enumerate(extract_pages(filename)):
        # Skip pages outside the requested range, if one was given.
        if page_numbers is not None and i not in page_numbers:
            continue
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                chunks.append(element.get_text())
    # Each container's text is followed by a newline, so containers never
    # share a line; joining once avoids repeated string concatenation.
    full_text = ''.join(chunk + '\n' for chunk in chunks)

    # Regroup the lines into paragraphs, breaking on lines that are too short.
    paragraphs = []
    buffer = ''
    join_directly = False  # True when the previous kept line ended with '-'
    for text in full_text.split('\n'):
        if len(text) >= min_line_length:
            if not join_directly:
                buffer += ' '
            if text.endswith('-'):
                # Word split across lines: drop the hyphen and continue the
                # word on the next line without inserting a space.
                buffer += text[:-1]
                join_directly = True
            else:
                buffer += text
                join_directly = False
        else:
            if buffer:
                paragraphs.append(buffer)
                buffer = ''
            join_directly = False
    if buffer:
        paragraphs.append(buffer)
    return paragraphs

In [None]:
# Extract paragraphs from llama2.pdf; lines shorter than 10 characters act as paragraph breaks
paragraphs = extract_text_from_pdf("llama2.pdf",min_line_length=10)
# Preview the first four paragraphs, each followed by an extra blank line
for para in paragraphs[:4]:
    print(para + "\n")

# LLM接口封装

In [None]:
!pip install -U python-dotenv

In [None]:
from openai import OpenAI
import os

# Load environment variables from the .env file located by find_dotenv()
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv(),verbose=True) 

# Shared OpenAI client used by the helper functions in this notebook
# NOTE(review): presumably picks up its API key from the environment loaded above — confirm
client = OpenAI()

In [None]:
def get_completion(promt, model="gpt-3.5-turbo"):
    """Send one user message to the chat completions API and return the reply text.

    Args:
        promt: Text used as the content of the single user message.
        model: Name of the chat model to call.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": promt}],
        temperature=0,  # sampling randomness: larger values give more random output
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


# 构建Prompt 模板

In [None]:
def build_prompt(promt_template, **kwargs):
    """Fill a ``str.format`` style prompt template with the given keyword values.

    A value that is a list made up only of strings is joined with blank lines
    (``'\\n\\n'``) before substitution; any other value is passed to
    ``format`` unchanged.

    Args:
        promt_template: Template text containing ``{name}`` placeholders.
        **kwargs: Values for the placeholders, keyed by placeholder name.

    Returns:
        The formatted prompt string.
    """
    def _render(value):
        # Only homogeneous lists of strings are flattened into one text block.
        if isinstance(value, list) and all(isinstance(item, str) for item in value):
            return '\n\n'.join(value)
        return value

    rendered = {name: _render(value) for name, value in kwargs.items()}
    return promt_template.format(**rendered)


# Question-answering prompt with two str.format placeholders: {context} (the
# known information) and {query} (the user question). The text tells the model
# to answer "不知道" when the context lacks the answer, and to reply in Chinese.
prompt_template = """
你是一个问答机器人。
你的任务是根据下述给定的已知信息回答用户问题。
已知信息：
{context}
用户问：
{query}

如果已知信息不包含用户问题的答案，请直接说"不知道"。
请用中文回答。
"""

# 向量间的相似度计算

In [None]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [None]:
def cos_sim(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    inner = dot(a, b)
    scale = norm(a) * norm(b)
    return inner / scale

def l2(a,b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
    return norm(np.subtract(a, b))

In [None]:
# Convert text into vectors
def get_embeddings(texts,model='text-embedding-ada-002',dimensions=None):
    """Embed texts through the OpenAI embeddings endpoint.

    Args:
        texts: Value forwarded as ``input`` to ``client.embeddings.create``.
            Callers in this file pass either a list of strings or one string.
        model: Name of the embedding model.
        dimensions: Optional output size. It is ignored for
            'text-embedding-ada-002' and only forwarded when truthy.

    Returns:
        A list holding the ``embedding`` of every item in the response data.
    """
    request = {"input": texts, "model": model}
    # ada-002 never receives ``dimensions``; other models only when it is set.
    if dimensions and model != 'text-embedding-ada-002':
        request["dimensions"] = dimensions
    data = client.embeddings.create(**request).data
    return [x.embedding for x in data]

In [None]:
# Try the embedding helper on a single short text
test_query=["测试文本"]
vec = get_embeddings(test_query)[0]
# Report the vector length and its first ten components
print(f"Total dimension: {len(vec)}")
print(f"First 10 dimensions: {vec[:10]}")

In [None]:
# Compare one query against five headlines using both similarity measures
query = "国际争端"
documents = [
    "联合国就苏丹达尔富尔地区大规模暴力事件发出警告",
    "土耳其、芬兰、瑞典与北约代表将继续就瑞典“入约”问题进行谈判",
    "日本岐阜市陆上自卫队射击场内发生枪击事件 3人受伤",
    "国家游泳中心（水立方）：恢复游泳、嬉水乐园等水上项目运营",
    "我国首次在空间站开展舱外辐射生物学暴露实验",
]

query_vec = get_embeddings(query)[0]
doc_vecs = get_embeddings(documents)

# cos_sim returns a similarity, not a distance: larger means more similar
print("Cosine similarity:")
# The query against itself serves as the reference value
print(cos_sim(query_vec,query_vec))
for vec in doc_vecs:
    print(cos_sim(query_vec,vec))

# Euclidean distance: smaller means more similar
print("\nEuclidean distance:")
print(l2(query_vec,query_vec))
for vec in doc_vecs:
    print(l2(query_vec,vec))

# chroma 向量数据库

In [None]:
!pip install chromadb

In [None]:
# Re-extract paragraphs, this time only from the pages with 0-based index 2 and 3
paragraphs = extract_text_from_pdf("llama2.pdf",page_numbers=[2,3],min_line_length=10)

In [None]:
import chromadb
from chromadb.config import Settings
class MyVectorDBConnector:
    """Small wrapper around one Chroma collection with caller-supplied embeddings."""

    def __init__(self,collection_name,embedding_fn):
        """Create a fresh in-memory Chroma client and open the collection.

        Args:
            collection_name: Name of the collection to get or create.
            embedding_fn: Callable that takes a list of texts and returns one
                embedding per text; used for both indexing and querying.
        """
        # In-memory mode
        chroma_client = chromadb.Client(Settings(allow_reset=True))
        # For persistent storage use this client instead:
        #chroma_client = chromadb.PersistentClient(path="./chroma_db")
        # Clear the database so every connector starts from an empty state
        chroma_client.reset()
        self.collection = chroma_client.get_or_create_collection(name=collection_name)
        self.embedding_fn = embedding_fn

    def add_documents(self,documents):
        """Embed ``documents`` and add them to the collection.

        Ids continue from the current collection size, so repeated calls
        append new entries rather than reusing ``id0``, ``id1``, ... from an
        earlier batch. An empty ``documents`` is a no-op.
        """
        if not documents:
            return
        start = self.collection.count()
        self.collection.add(
            documents=documents,
            ids=[f"id{i}" for i in range(start, start + len(documents))],
            embeddings=self.embedding_fn(documents)
        )

    def search(self,query,top_n):
        """Query the collection with the embedding of ``query``.

        Args:
            query: Query text; embedded as a one-element batch.
            top_n: Number of results requested from the collection.

        Returns:
            The result object returned by ``collection.query``.
        """
        results = self.collection.query(
            query_embeddings=self.embedding_fn([query]),
            n_results=top_n
        )
        return results

In [None]:
# Open a collection named "demo" that embeds text with get_embeddings
vector_db = MyVectorDBConnector("demo",get_embeddings)
# Index the paragraphs extracted above
vector_db.add_documents(paragraphs)
user_query = "Llama 2有多少个参数"
# Ask for the top 2 results for the query
results = vector_db.search(user_query,2)
# Only one query was sent, so index 0 selects its list of returned documents
for para in results['documents'][0]:
    print(para,"\n")
    


# 基于向量检索的RAG

In [None]:
class RAG_Bot:
    """Question-answering bot that retrieves context from a vector store before calling an LLM."""

    def __init__(self,vector_db,llm_api,n_results=2):
        """Store the collaborators used by ``chat``.

        Args:
            vector_db: Object exposing ``search(query, top_n)``.
            llm_api: Callable that takes a prompt string and returns the reply.
            n_results: Number of search results requested per question.
        """
        self.vector_db = vector_db
        self.llm_api = llm_api
        self.n_results = n_results

    def chat(self,user_query):
        """Answer ``user_query`` using retrieved documents as context.

        Returns:
            Whatever ``llm_api`` returns for the assembled prompt.
        """
        # Step 1: retrieve candidate documents for the question
        hits = self.vector_db.search(user_query,self.n_results)
        retrieved_docs = hits['documents'][0]
        # Step 2: fill the prompt template with the documents and the question
        filled_prompt = build_prompt(prompt_template,context=retrieved_docs,query=user_query)
        # Step 3: hand the prompt to the LLM and return its reply
        return self.llm_api(filled_prompt)

In [None]:
# Create a RAG bot backed by the vector store and the chat completion helper
bot = RAG_Bot(vector_db,llm_api=get_completion)
user_query = "Llama 2有多少个参数"
# chat() searches the vector store, builds the prompt and calls the LLM
response = bot.chat(user_query)
print(response)