In [18]:
from langchain.document_loaders import DirectoryLoader
import re
import os
import openai
import pinecone

# Read the API credentials from the environment so they never appear in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): the Pinecone client is constructed later as `Pinecone()` with no
# explicit key, so it presumably reads PINECONE_API_KEY from the environment itself
# and this module attribute may be unused - confirm before removing.
pinecone.api_key= os.getenv("PINECONE_API_KEY")

# Directory that holds the source documents. Set the DOCS_DIRECTORY environment
# variable to point at another location; the original local path is the default.
directory = os.getenv("DOCS_DIRECTORY", r'C:\Users\zhuyu\OneDrive\桌面\data')

def load_docs(directory):
    """
    Load the documents found under a directory.

    A ``DirectoryLoader`` is built for ``directory`` and whatever its
    ``load()`` method returns is handed back unchanged.

    Args:
        directory (str): Path of the directory that contains the documents.

    Returns:
        The value returned by ``DirectoryLoader(directory).load()``
        (presumably a list of document objects - confirm against the loader docs).
    """
    return DirectoryLoader(directory).load()

# Load the documents stored under `directory`.
documents = load_docs(directory)

def clean_text(doc):
    """
    Strip reference markers and redundant whitespace from a document's text.

    Bracketed markers, tab characters and stray square brackets are removed,
    then every run of newlines is collapsed into a single newline.

    Args:
        doc: Object whose ``page_content`` attribute holds the text to clean.

    Returns:
        str: The cleaned text.
    """
    text = doc.page_content
    # Drop bracketed markers such as "[12]". "." does not match a newline, so a
    # marker is only removed when it opens and closes on the same line.
    text = re.sub(r'\[.*?\]', '', text)
    # Remove tabs and any unpaired brackets that the previous step left behind.
    text = re.sub(r'[\t\[\]]', '', text)
    # Collapse newline runs last: the removals above can empty a line and leave
    # adjacent newlines. One pass is enough, since no run survives it.
    text = re.sub(r'\n+', '\n', text)
    return text

# Clean every loaded document. `documents` already holds the result of
# load_docs(directory) from earlier in this cell, so the directory is not read again.
cleaned_documents = [clean_text(doc) for doc in documents]

print(cleaned_documents)

['Sam Altman\n9 min listen\n0:00\n9:20\nArticle\nTalk\nRead\nEdit\nView history\nTools From Wikipedia, the free encyclopedia Sam Altman Sam Altman CropEdit James Tamim.jpg Altman in 2019 BornApril 22, 1985 (age 38) Chicago, Illinois, U.S. EducationStanford University (dropped out) OccupationEntrepreneur Known forLoopt, Y Combinator, OpenAI TitleCEO of OpenAI LP WebsiteOfficial website Edit this at Wikidata Samuel Harris Altman (/ˈɔːltmən/ AWLT-mən; born 1985) is an American entrepreneur, investor, and programmer. He was the co-founder of Loopt and is the current CEO of OpenAI.  He was the president of Y Combinator and was briefly the CEO of Reddit.\nEarly life and education Altman was born into a Jewish family, and grew up in St. Louis, Missouri. His mother is a dermatologist. He received his first computer at the age of eight.  He attended John Burroughs School. In 2005, after one year at Stanford University studying computer science, he dropped out without earning a bachelors degree.

In [19]:
def split_docs(texts, chunk_size=600, chunk_overlap=30):
    """
    Split texts into fixed-size character chunks that overlap their neighbours.

    Each text is walked with a window of ``chunk_size`` characters that advances
    by ``chunk_size - chunk_overlap`` characters per step. The last chunk of a
    text may be shorter than ``chunk_size``.

    Args:
        texts (list of str): Texts to split.
        chunk_size (int): Maximum number of characters per chunk; must be positive.
        chunk_overlap (int): Number of characters shared by consecutive chunks;
            must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        list of str: All chunks, in input order. Empty texts contribute nothing.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # An overlap of chunk_size or more would make the window step zero or
    # negative, so the scan would never reach the end of the text.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    for text in texts:
        # Slicing clamps at the end of the string, so no explicit min() is needed.
        chunks.extend(
            text[start:start + chunk_size] for start in range(0, len(text), step)
        )
    return chunks

# Break the cleaned documents into smaller, overlapping chunks.
docs = split_docs(cleaned_documents)
print(len(docs))  # number of chunks produced

12


In [20]:
#requires for open ai embedding
import tiktoken

# Tokenizer for the cl100k_base encoding. The earlier bare
# tiktoken.encoding_for_model(...) call threw its result away, so only this
# assignment is kept.
tokenizer = tiktoken.get_encoding('cl100k_base')
# import openai
# from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.embeddings.openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used for both the chunks and the queries.
model_name = 'text-embedding-ada-002'
embeddings = OpenAIEmbeddings(model=model_name)

In [21]:
import pinecone 
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec, Index
# 初始化 pinecone
import os
from pinecone import Pinecone, ServerlessSpec, Index

# Create the Pinecone client; no credentials are passed explicitly here.
pc = Pinecone()

index_name = "langchain-chatbot-zyb"

# Create the index only when no index with this name exists yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='euclidean',
        spec=ServerlessSpec(cloud='aws', region='us-west-2'),
    )

# Handle used by the cells below for upserts and queries.
index = pc.Index(index_name)

In [22]:
# Embed every chunk with one batched call instead of one embed_query request per
# chunk; embed_documents returns one vector per input text, in the same order.
embedding_list = embeddings.embed_documents(docs)
    
def insert_vectors_to_pinecone(docs, embedding_list, batch_size=100):
    """
    Upsert document embeddings into the module-level Pinecone ``index``.

    Each vector is stored under the id ``vec<N>`` (1-based position in ``docs``)
    with the chunk text attached as ``{"text": ...}`` metadata. Records are sent
    in batches rather than one request per vector.

    Args:
        docs (list): Text chunks, one per embedding.
        embedding_list (list): Embedding vectors, aligned with ``docs``.
        batch_size (int): Maximum number of records per upsert request.
            Defaults to 100.

    Returns:
        None. The vectors are written to the Pinecone index.

    Raises:
        ValueError: If ``docs`` and ``embedding_list`` differ in length, or
            ``batch_size`` is not positive.
    """
    # zip() would silently drop the unmatched tail, so reject misaligned input.
    if len(docs) != len(embedding_list):
        raise ValueError(
            f"docs and embedding_list must have the same length "
            f"(got {len(docs)} and {len(embedding_list)})"
        )
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    records = [
        (f"vec{i+1}", vec, {"text": doc})
        for i, (doc, vec) in enumerate(zip(docs, embedding_list))
    ]
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size])

# Write the chunk embeddings, with their text as metadata, to the Pinecone index.
insert_vectors_to_pinecone(docs, embedding_list)

In [23]:
def get_similar_docs(query_vector, k=1, include_metadata=True):
    """
    Query the module-level Pinecone ``index`` for the nearest stored vectors.

    Args:
        query_vector (list): Embedding of the query.
        k (int): Number of matches to request. Defaults to 1.
        include_metadata (bool): Whether to ask for the stored metadata of
            each match. Defaults to True.

    Returns:
        The response returned by ``index.query``, unchanged.
    """
    # Forward the arguments straight to the index and hand back its response.
    return index.query(
        vector=query_vector,
        top_k=k,
        include_metadata=include_metadata,
    )

# Smoke test: embed a sample query and look up its nearest chunks.
query = "sam altman title"
query_vector = embeddings.embed_query(query)  # embed the query with the same embedding model
similar_docs = get_similar_docs(query_vector, k=2)  # fetch the 2 most similar chunks

# Print the id, score and stored text of every match.
# NOTE(review): the index is created with metric='euclidean', so presumably a
# smaller score means a closer match - confirm against the Pinecone docs.
for match in similar_docs['matches']:
    print("Match ID:", match['id'])
    print("Match Score:", match['score'])
    if 'metadata' in match and 'text' in match['metadata']:
        print("Matched Text:", match['metadata']['text'])
    else:
        print("No text metadata available for this match.")
    print()  # blank line between matches for readability

Match ID: vec1
Match Score: 0.318327665
Matched Text: Sam Altman
9 min listen
0:00
9:20
Article
Talk
Read
Edit
View history
Tools From Wikipedia, the free encyclopedia Sam Altman Sam Altman CropEdit James Tamim.jpg Altman in 2019 BornApril 22, 1985 (age 38) Chicago, Illinois, U.S. EducationStanford University (dropped out) OccupationEntrepreneur Known forLoopt, Y Combinator, OpenAI TitleCEO of OpenAI LP WebsiteOfficial website Edit this at Wikidata Samuel Harris Altman (/ˈɔːltmən/ AWLT-mən; born 1985) is an American entrepreneur, investor, and programmer. He was the co-founder of Loopt and is the current CEO of OpenAI.  He was the president of Y 

Match ID: vec5
Match Score: 0.41213274
Matched Text: research on basic income, the future of computing, education, and building new cities. 
In March 2019, YC announced Altman's transition from president of the company to a less hands-on role Chairman of the Board, in order for him to focus on OpenAI.  This decision came shortly after YC anno