In [1]:
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA #构建对话系统
from langchain.embeddings import HuggingFaceEmbeddings #矢量化
from langchain.vectorstores import Chroma #Chroma 向量数据库
from time import time

In [2]:
# 定義 Document 類型
class Document:
    
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

    def __repr__(self):
        return f"Document(page_content={self.page_content}, metadata={self.metadata})"

In [18]:
import os

# 設定文件資料夾路徑
folder_path = "faq_txt"  # 包含多個處理過的保單資料的文件夾

result = []

# 遍歷所有文件
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)

        # 讀取每個文件內容
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # 進行分割
        sections = content.split("\n\n")

        # 將每個段落嵌入並存入result列表
        for i, section in enumerate(sections, start=1):
            if section:
                # 包含文檔來源和頁碼信息
                result.append(Document(page_content=section, metadata={"source": filename, "page": i}))



Document(page_content=網路銀行買/賣外幣的累計交易額度限制是多少?,等值新臺幣五十萬元。, metadata={'source': 'faq101.txt', 'page': 3})


In [4]:
len(result)

1624

In [5]:
# 加载embedings 向量模型
# https://huggingface.co/DMetaSoul/Dmeta-embedding-zh
model_name = "DMetaSoul/Dmeta-embedding-zh"
model_kwargs = {'device': 'cuda'}
embedding = HuggingFaceEmbeddings(model_name=model_name,model_kwargs=model_kwargs)

  embedding = HuggingFaceEmbeddings(model_name=model_name,model_kwargs=model_kwargs)
  from tqdm.autonotebook import tqdm, trange


In [6]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.5.0+cu124
True


In [7]:
model_name1 = "TencentBAC/Conan-embedding-v1"
model_kwargs1 = {'device': 'cuda'}
embedding1 = HuggingFaceEmbeddings(model_name=model_name1,model_kwargs=model_kwargs1)

In [8]:
#灌库
vectordb = Chroma.from_documents(documents=result, embedding=embedding, persist_directory="faq_Dmeta2")

In [90]:
# 指定只能找最近的2個搜尋結果 單一問題召回結果
#retriever = vectordb.similarity_search_with_score(search_kwargs={'k': 10})
docs2 = vectordb.similarity_search_with_score(query='誰可以申辦玉山e指信貸',k=1) # 找出與問題最相關的解答(餘弦計算，score越低=越相似)
d = []
for i in range(len(docs2)):
    
    # 從元組中提取 Document 物件
    document = docs2[i][0]
    # 從 metadata 中獲取 source
    source = document.metadata['source']
    # 使用字串處理，移除 'faq' 和 '.txt'
    faq_number = source.replace('faq', '').replace('.txt', '')
    d.append(faq_number)
    print(f'source:{faq_number}')  # 輸出: 414

#print(d)
for i in range(len(docs2)):
    print(docs2[i])

# import json
# with open('questions_example.json','r',encoding = 'utf-8') as f:
#     f1 = json.load(f)
# with open('ground_truths_example.json','r',encoding = 'utf-8') as f2:
#     f3 = json.load(f2)
# c = 0
# for i in range(101,151):
#     d = []
#     qu = ''
#     for j in f1['questions']:
#         if j['qid'] == i:
#             qu = qu +str (j['query'])
#             docs2 = vectordb.similarity_search_with_score(query= str(j['query']) ,k=5) 
#             document = docs2[0][0]
#             # 從 metadata 中獲取 source
#             source = document.metadata['source']
#             # 使用字串處理，移除 'faq' 和 '.txt'
#             faq_number = source.replace('faq', '').replace('.txt', '')
#             d.append(faq_number)
#             #print(d[0])
#             #print(faq_number)
#     for k in f3["ground_truths"]:
#         if k['qid'] == i:
#             n = k['retrieve']
#     #print(n)
#     if str(n) == d[0]:
#         c += 1
#     else:
#         print(f"qid:{i},question{qu},Correct:{n},Right:{d[0]},")
# print(c)  
           
    

source:138
(Document(metadata={'page': 1, 'source': 'faq138.txt'}, page_content='如何申請玉山e指信貸?,可以透過以下方式申請本行個人信用貸款：\\n\\n-點選本行官網/行動銀行「信貸線上申請」服務，可快速、簡便完成貸款申請，免跑分行。\\n-點選本行官網專人與我聯絡(https://www.esunbank.com/zh-tw/personal/loan/tools/apply/personal-credit-loan)，留下聯繫資料，由專人為您服務。\\n-使用本行行動銀行APP中「貸款專區」，進行簡便、快速的線上申請。\\n-至本行全省各分行洽詢，由專人為您服務。。'), 0.3171129822731018)


In [91]:
#查看RAG準確率
import json
with open('questions_example.json','r',encoding = 'utf-8') as f:
    f1 = json.load(f)
with open('ground_truths_example.json','r',encoding = 'utf-8') as f2:
    f3 = json.load(f2)
c = 0
for i in range(101,151):
    d = []
    qu = ''
    for j in f1['questions']:
        if j['qid'] == i:
            qu = qu +str (j['query'])
            docs2 = vectordb.similarity_search_with_score(query= str(j['query']) ,k=5) 
            document = docs2[0][0]
            # 從 metadata 中獲取 source
            source = document.metadata['source']
            # 使用字串處理，移除 'faq' 和 '.txt'
            faq_number = source.replace('faq', '').replace('.txt', '')
            d.append(faq_number)
            #print(d[0])
            #print(faq_number)
    for k in f3["ground_truths"]:
        if k['qid'] == i:
            n = k['retrieve']
    #print(n)
    if str(n) == d[0]:
        c += 1
    else:
        print(f"qid:{i},question{qu},Correct:{n},Right:{d[0]},")
print(c)  

qid:111,question無卡提款服務是否可在ATM 機器上開通？,Correct:76,Right:149,
qid:121,question有哪些方法可以手動檢查WebATM元件的安裝狀態？,Correct:414,Right:448,
qid:135,question誰可以申辦玉山e指信貸,Correct:28,Right:138,
qid:138,question要怎麼開通簡訊密碼?,Correct:339,Right:463,
46


In [105]:

# import json
# with open('questions_example.json','r',encoding = 'utf-8') as f:
#     f1 = json.load(f)
# with open('ground_truths_example.json','r',encoding = 'utf-8') as f2:
#     f3 = json.load(f2)
# c = 0
# for i in range(101,151):
#     d = []
#     qu = ''
#     for j in f1['questions']:
#         if j['qid'] == i:
#             qu = qu +str (j['query'])
#             docs2 = vectordb.similarity_search_with_score(query= str(j['query']) ,k=5) 
#             document = docs2[0][0]
#             # 從 metadata 中獲取 source
#             source = document.metadata['source']
#             # 使用字串處理，移除 'faq' 和 '.txt'
#             faq_number = source.replace('faq', '').replace('.txt', '')
#             d.append(faq_number)
#             #print(d[0])
#             #print(faq_number)
#     for k in f3["ground_truths"]:
#         if k['qid'] == i:
#             n = k['retrieve']
#     #print(n)
#     if str(n) == d[0]:
#         c += 1
#     else:
#         print(f"qid:{i},question{qu},Correct:{n},Right:{d[0]},")
# print(c) 
data_json = {"ground_truths":[{"qid":i,"retrieve":"",'category':""}for i in range(1,11)]}
for m in range(0,10):
    if 2 >= m>= 0:
        k = "insurance"
    elif 6>=m >=3:
        k = 'fiance'
    else:
        k = 'faq'
    data_json['ground_truths'][m]["category"] = k
print(type(data_json))
print(data_json)
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data_json, f, ensure_ascii=False, indent=4)
    print('write ok')

<class 'dict'>
{'ground_truths': [{'qid': 1, 'retrieve': '', 'retrive': 'insurance'}, {'qid': 2, 'retrieve': '', 'retrive': 'insurance'}, {'qid': 3, 'retrieve': '', 'retrive': 'insurance'}, {'qid': 4, 'retrieve': '', 'retrive': 'fiance'}, {'qid': 5, 'retrieve': '', 'retrive': 'fiance'}, {'qid': 6, 'retrieve': '', 'retrive': 'fiance'}, {'qid': 7, 'retrieve': '', 'retrive': 'fiance'}, {'qid': 8, 'retrieve': '', 'retrive': 'faq'}, {'qid': 9, 'retrieve': '', 'retrive': 'faq'}, {'qid': 10, 'retrieve': '', 'retrive': 'faq'}]}
write ok


In [10]:
# #灌库
# vectordb2 = Chroma.from_documents(documents=result, embedding=embedding1, persist_directory="faq_Dmeta2")

In [68]:
# # 指定只能找最近的2個搜尋結果
# #retriever = vectordb.similarity_search_with_score(search_kwargs={'k': 10})
# docs2 = vectordb2.similarity_search_with_score(query='如何檢查WebATM元件的安裝狀態是否成功？',k=5) 

# for i in range(len(docs2)):
#     print(docs2[i])

NameError: name 'vectordb2' is not defined