# 安裝套件


In [None]:
# langchain 相關套件
!pip install -qU langchain-core
!pip install -qU langchain-community
!pip install -qU langchain-text-splitters
!pip install -qU langchain-qdrant # Qdrant 向量資料庫
!pip install -qU langchain-voyageai # voyageai 嵌入模型，目前中文支援度最高，要註冊 API key，免費但有限額
!pip install -qU langchain-huggingface # huggingface
!pip install -qU sentence_transformers # 透過 sentence_transformers 下載 huggingface 模型
!pip install -qU langchain_aws


In [None]:
# 嵌入模型
!pip install -qU FlagEmbedding # BGE-M3，BAAI和中國科學技術大學，是BAAI開源的模型，中文支援第三，支援密集檢索（Dense retrieval）、詞彙（稀疏）檢索（Lexical/Sparse retrieval）
!pip install -qU fastembed # Qdrant 輕量嵌入模型，僅密集檢索（Dense retrieval）
!pip install -qU langchain-google-genai

In [None]:
# 其他套件
!pip install -qU unstructured[all-docs]
!pip install -qU pypdf
!pip install -qU pymupdf
!pip install -qU pydantic
!pip install -qU lxml
!pip install -qU pillow
!pip install -qU Pillow
!pip install -qU pytesseract
!pip install -qU rapidocr-onnxruntime
!pip install -qU matplotlib
!pip install -qU tiktoken
!pip install -qU tqdm

# 模型與變數宣告

## 1.　相關變數宣告


In [None]:
import os
### Qdrant ###

# Connection settings for the Qdrant instance (placeholder values in this notebook).
qdrant_url = "..."
qdrant_api_key = "..."
# One collection name per embedding model; the commented-out "insurance_*" names are kept as alternatives.
# qdrant_collection_name_baai_bgem3 = "insurance_collection_baai_bgem3"
qdrant_collection_name_baai_bgem3 = "sa_collection_baai_bgem3"
# qdrant_collection_name_microsoft_multilingual_e5_large = "insurance_collection_microsoft_multilingual_e5_large"
qdrant_collection_name_microsoft_multilingual_e5_large = "sa_collection_microsoft_multilingual_e5_large"
# qdrant_collection_name_cohere_multilingual_v3 = "insurance_collection_cohere_multilingual_v3"
qdrant_collection_name_cohere_multilingual_v3 = "sa_collection_cohere_multilingual_v3"
# qdrant_collection_name_gemini_exp_03_07 = "insurance_collection_gemini_exp_03_07"
qdrant_collection_name_gemini_exp_03_07 = "sa_collection_gemini_exp_03_07"


### Other variables ###
# Dense vector dimension: BGE-M3, multilingual_e5_large and cohere_multilingual_v3 use a dense dimension of 1024
dense_embeddings_dim_1024 = 1024
# Dimension used for the Gemini embedding collection
dense_embeddings_dim_3072 = 3072

### AWS variables ###
# Credentials are exported through the process environment (placeholder values here).
os.environ["AWS_ACCESS_KEY_ID"] = "..."
os.environ["AWS_SECRET_ACCESS_KEY"] = "..."
os.environ["AWS_DEFAULT_REGION"]="..."

### Google variables ###
os.environ["GOOGLE_API_KEY"]= "..."



# Disabled alternative: VoyageAI dense embeddings + FastEmbed BM25 sparse embeddings.
# NOTE(review): a literal API key used to be written here; it is redacted below and should be rotated.
# import os
# from langchain_voyageai import VoyageAIEmbeddings
# from langchain_qdrant import FastEmbedSparse
#
# voyage_api_key = "..."
# os.environ["VOYAGE_API_KEY"] = voyage_api_key
#
# dense_embeddings=VoyageAIEmbeddings(model="voyage-multilingual-2", output_dimension=1024)
# sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

## 2.　Embedding 模型宣告與定義


In [None]:
from langchain.embeddings.base import Embeddings
from langchain_qdrant.sparse_embeddings import SparseEmbeddings
from langchain_qdrant.sparse_embeddings import SparseVector
from FlagEmbedding import BGEM3FlagModel

# Shared BGE-M3 model instance, used by both the dense and the sparse wrapper classes below
# Reference: https://huggingface.co/BAAI/bge-m3
bge_m3_embedding = BGEM3FlagModel('BAAI/bge-m3', use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

# Dense-vector embedding class
class BGEQdrantDenseEmbeddings(Embeddings):  # inherits langchain.embeddings.base.Embeddings
    """Dense embeddings produced by a BGE-M3 model, exposed through the LangChain interface.

    Both methods return plain Python lists of floats (not the raw output of
    ``encode``), which is the shape the ``Embeddings`` interface declares.
    """

    def __init__(self, model=None):
        """Store the model to encode with.

        Args:
            model: Object exposing ``encode(texts, return_dense=..., return_sparse=...)``.
                Defaults to the module-level ``bge_m3_embedding`` instance.
        """
        self.model = bge_m3_embedding if model is None else model

    @staticmethod
    def _as_float_list(vector):
        """Convert one vector (any iterable of numbers) into a list of Python floats."""
        return [float(value) for value in vector]

    # Document embedding
    def embed_documents(self, texts):
        """Embed a batch of texts; returns one list of floats per input text."""
        dense_vecs = self.model.encode(texts, return_dense=True, return_sparse=False)["dense_vecs"]
        return [self._as_float_list(vector) for vector in dense_vecs]

    # Query embedding
    def embed_query(self, query):
        """Embed a single query string; returns one list of floats."""
        dense_vecs = self.model.encode([query], return_dense=True, return_sparse=False)["dense_vecs"]
        return self._as_float_list(dense_vecs[0])


# Sparse-vector embedding class
class BGEQdrantSparseEmbeddings(SparseEmbeddings):  # inherits langchain_qdrant.sparse_embeddings.SparseEmbeddings
    """Sparse embeddings built from the "lexical_weights" output of the shared BGE-M3 model."""

    def __init__(self):
        """Bind the wrapper to the module-level ``bge_m3_embedding`` instance."""
        self.model = bge_m3_embedding

    def _lexical_weights(self, texts):
        """Run the model in sparse-only mode and return its "lexical_weights" entry."""
        encoded = self.model.encode(texts, return_dense=False, return_sparse=True)
        return encoded["lexical_weights"]

    @staticmethod
    def _to_sparse_vector(weights):
        """Turn one weights mapping into a SparseVector (keys cast to int, values kept as-is)."""
        token_weights = dict(weights)
        return SparseVector(
            indices=[int(token_id) for token_id in token_weights.keys()],
            values=list(token_weights.values()),
        )

    # Document embedding
    def embed_documents(self, texts):
        """Return one SparseVector per input text."""
        return [self._to_sparse_vector(weights) for weights in self._lexical_weights(texts)]

    # Query embedding
    def embed_query(self, query):
        """Return the SparseVector for a single query string."""
        vectors = [self._to_sparse_vector(weights) for weights in self._lexical_weights([query])]
        return vectors[0]

In [None]:
### Model instances ###
dense_embeddings = BGEQdrantDenseEmbeddings()  # dense-vector embedding model
sparse_embeddings = BGEQdrantSparseEmbeddings()  # sparse-vector embedding model

In [None]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Dense embedding model loaded by name through the HuggingFace integration.
embeddings_multilingual_e5_large = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

In [None]:
from langchain_aws import BedrockEmbeddings

import boto3
import certifi
# Bedrock runtime client; credentials and region come from the AWS_* environment variables set earlier.
client = boto3.client(
    "bedrock-runtime",
    verify=certifi.where()  # explicitly point TLS verification at certifi's CA bundle
)

# NOTE(review): "choere" looks like a typo for "cohere"; a later cell references this exact name,
# so any rename has to be done in both places together.
embeddings_aws_bedrock_choere = BedrockEmbeddings(client=client, model_id="cohere.embed-multilingual-v3")

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Gemini experimental embedding model; no key is passed here, GOOGLE_API_KEY is set in the environment earlier.
embeddings_gemini_exp_03_07 = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-exp-03-07")

# 向量資料庫 collection 建立

## 1.　Qdrant collection 建立

### 1.1 BAAI BGE-M3

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams


# Reference
# https://qdrant.tech/documentation/concepts/collections/
#
# Qdrant does not need a schema definition; payload fields are dynamic

# Qdrant client
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key
)

# Check whether the collection exists and delete it if so.
# WARNING: this drops every point already stored in the collection each time the cell runs.
is_col_exist = qdrant_client.collection_exists(collection_name=qdrant_collection_name_baai_bgem3)
if is_col_exist:
    qdrant_client.delete_collection(collection_name=qdrant_collection_name_baai_bgem3)

# Create the collection with one named dense vector and one named sparse vector
# Index settings reference: https://qdrant.tech/documentation/concepts/indexing/
qdrant_client.create_collection(
    collection_name=qdrant_collection_name_baai_bgem3,
    vectors_config={
        "dense_text": VectorParams(size=dense_embeddings_dim_1024, distance=Distance.EUCLID)
    },
    sparse_vectors_config={
        "sparse_text": SparseVectorParams(index=SparseIndexParams(on_disk=False), modifier="idf"),
    },
)

# Inspect the collection info
# (the commented-out variant below refers to `qdrant_collection_name`, which is not defined in this notebook)
# import yaml
# qdrant_collection_info=qdrant_client.get_collection(collection_name=qdrant_collection_name)
# print(yaml.dump(dict(qdrant_collection_info), default_flow_style=False) )
! curl -X GET {qdrant_url}/collections/{qdrant_collection_name_baai_bgem3} --header "api-key:{qdrant_api_key}" | python -m json.tool

### 1.2 Microsoft multilingual-e5-large

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams


# 參考
# https://qdrant.tech/documentation/concepts/collections/
#
# Qdrant 不用定義 schema，欄位都是動態

# Qdrant client
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key
)

# 檢查 collection 是否存在，存在則刪掉
is_col_exist = qdrant_client.collection_exists(collection_name=qdrant_collection_name_microsoft_multilingual_e5_large)
if is_col_exist:
    qdrant_client.delete_collection(collection_name=qdrant_collection_name_microsoft_multilingual_e5_large)

# 建立 collection
# index 設定參考 https://qdrant.tech/documentation/concepts/indexing/
qdrant_client.create_collection(
    collection_name=qdrant_collection_name_microsoft_multilingual_e5_large,
    vectors_config={
        "dense_text": VectorParams(size=dense_embeddings_dim_1024, distance=Distance.EUCLID)
    },
)

# 查看 collection 資訊
# import yaml
# qdrant_collection_info=qdrant_client.get_collection(collection_name=qdrant_collection_name)
# print(yaml.dump(dict(qdrant_collection_info), default_flow_style=False) )
! curl -X GET {qdrant_url}/collections/{qdrant_collection_name_multilingual_e5_large} --header "api-key:{qdrant_api_key}" | python -m json.tool

### 1.3 Cohere multilingual v3

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams


# Reference
# https://qdrant.tech/documentation/concepts/collections/
#
# Qdrant does not need a schema definition; payload fields are dynamic

# Qdrant client
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key
)

# Check whether the collection exists and delete it if so.
# WARNING: this drops every point already stored in the collection each time the cell runs.
is_col_exist = qdrant_client.collection_exists(collection_name=qdrant_collection_name_cohere_multilingual_v3)
if is_col_exist:
    qdrant_client.delete_collection(collection_name=qdrant_collection_name_cohere_multilingual_v3)

# Create the collection with a single named dense vector
# Index settings reference: https://qdrant.tech/documentation/concepts/indexing/
qdrant_client.create_collection(
    collection_name=qdrant_collection_name_cohere_multilingual_v3,
    vectors_config={
        "dense_text": VectorParams(size=dense_embeddings_dim_1024, distance=Distance.EUCLID)
    },
)

# Inspect the collection info
# (the commented-out variant below refers to `qdrant_collection_name`, which is not defined in this notebook)
# import yaml
# qdrant_collection_info=qdrant_client.get_collection(collection_name=qdrant_collection_name)
# print(yaml.dump(dict(qdrant_collection_info), default_flow_style=False) )
! curl -X GET {qdrant_url}/collections/{qdrant_collection_name_cohere_multilingual_v3} --header "api-key:{qdrant_api_key}" | python -m json.tool


# Create a payload index; needed for MatchValue filter conditions
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_cohere_multilingual_v3,
    field_name="metadata.doc_name",
    field_schema="keyword",
)

# Create a payload index; needed for MatchText filter conditions
# NOTE(review): this targets the same field as the keyword index above — confirm whether Qdrant
# keeps both index types on one field or whether this call replaces the keyword index.
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_cohere_multilingual_v3,
    field_name="metadata.doc_name",
    field_schema="text",
)

# Delete the payload index
# qdrant_client.delete_payload_index(
#     collection_name=qdrant_collection_name_cohere_multilingual_v3,
#     field_name="metadata.doc_name",
# )

### 1.4 Gemini Embedding Exp 03 07

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams


# Reference
# https://qdrant.tech/documentation/concepts/collections/
#
# Qdrant does not need a schema definition; payload fields are dynamic

# Qdrant client
qdrant_client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key
)

# Check whether the collection exists and delete it if so.
# WARNING: this drops every point already stored in the collection each time the cell runs.
is_col_exist = qdrant_client.collection_exists(collection_name=qdrant_collection_name_gemini_exp_03_07)
if is_col_exist:
    qdrant_client.delete_collection(collection_name=qdrant_collection_name_gemini_exp_03_07)

# Create the collection with a single named dense vector (3072 dimensions for this model)
# Index settings reference: https://qdrant.tech/documentation/concepts/indexing/
qdrant_client.create_collection(
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    vectors_config={
        "dense_text": VectorParams(size=dense_embeddings_dim_3072, distance=Distance.EUCLID)
    },
)

# Inspect the collection info
# (the commented-out variant below refers to `qdrant_collection_name`, which is not defined in this notebook)
# import yaml
# qdrant_collection_info=qdrant_client.get_collection(collection_name=qdrant_collection_name)
# print(yaml.dump(dict(qdrant_collection_info), default_flow_style=False) )
! curl -X GET {qdrant_url}/collections/{qdrant_collection_name_gemini_exp_03_07} --header "api-key:{qdrant_api_key}" | python -m json.tool



# Create a payload index; needed for MatchValue filter conditions
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    field_name="metadata.doc_name",
    field_schema="keyword",
)

# Create a payload index; needed for MatchText filter conditions
# NOTE(review): this targets the same field as the keyword index above — confirm whether Qdrant
# keeps both index types on one field or whether this call replaces the keyword index.
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    field_name="metadata.doc_name",
    field_schema="text",
)

# Delete the payload index
# qdrant_client.delete_payload_index(
#     collection_name=qdrant_collection_name_gemini_exp_03_07,
#     field_name="metadata.doc_name",
# )

In [None]:
# NOTE(review): this cell issues the same two create_payload_index calls on the Gemini collection
# as the end of the cell that creates that collection; it looks like a leftover and may be removable.
# Create a payload index; needed for filter conditions
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    field_name="metadata.doc_name",
    field_schema="keyword",
)

# Create a payload index; needed for MatchText filter conditions
qdrant_client.create_payload_index(
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    field_name="metadata.doc_name",
    field_schema="text",
)

# Delete the payload index
# NOTE(review): the commented-out call below names the Cohere collection although this cell
# works on the Gemini collection — check before re-enabling it.
# qdrant_client.delete_payload_index(
#     collection_name=qdrant_collection_name_cohere_multilingual_v3,
#     field_name="metadata.doc_name",
# )

# 文件嵌入

## 1.　Langchain vectorStore 建立

> langchain 的 vector store 可以用來查也可以用來 insert document

### 1.1 BAAI BGE-M3

In [None]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode

# Build the LangChain vector store on top of the existing BGE-M3 collection
vector_store_bge_m3 = QdrantVectorStore.from_existing_collection(
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name=qdrant_collection_name_baai_bgem3,
    # dense vector section (names and distance match the collection created above)
    embedding=dense_embeddings,
    vector_name="dense_text",
    distance="Euclid", # Euclidean distance (L2), Inner Product (IP), Cosine Similarity (COSINE),
    # /dense vector section
    # sparse vector section
    sparse_embedding=sparse_embeddings,
    sparse_vector_name="sparse_text",
    # /sparse vector section
    retrieval_mode=RetrievalMode.HYBRID, # hybrid retrieval; requires both dense and sparse vectors
)

# Dense vectors => mainly used where the semantics of the data matter, e.g. semantic search and recommender systems
#        https://milvus.io/docs/zh-hant/dense-vector.md
#
# Sparse vectors => an important data representation in information retrieval and natural language processing.
#        Dense vectors are popular for their strong semantic understanding,
#        but for applications that need exact keyword or phrase matching, sparse vectors usually give more precise results.
#        https://milvus.io/docs/zh-hant/sparse_vector.md
#
# Hybrid search => a search method that runs several ANN searches at once, re-ranks the result sets of those searches, and finally returns a single result set.
#        Hybrid search can improve search accuracy. It is most often used for sparse+dense vector search and multimodal search.
#        https://milvus.io/docs/zh-hant/multi-vector-search.md

### 1.2 Microsoft multilingual-e5-large

In [None]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode

# Build the LangChain vector store on top of the existing multilingual-e5-large collection (dense only)
vector_store_multilingual_e5_large = QdrantVectorStore.from_existing_collection(
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name=qdrant_collection_name_microsoft_multilingual_e5_large,
    # dense vector section (name and distance match the collection created above)
    embedding=embeddings_multilingual_e5_large,
    vector_name="dense_text",
    distance="Euclid", # Euclidean distance (L2), Inner Product (IP), Cosine Similarity (COSINE),
    # /dense vector section
)

### 1.3 Cohere multilingual v3

In [None]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode

# Build the LangChain vector store on top of the existing Cohere multilingual v3 collection (dense only)
vector_store_cohere_multilingual_v3 = QdrantVectorStore.from_existing_collection(
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name=qdrant_collection_name_cohere_multilingual_v3,
    # dense vector section (name and distance match the collection created above)
    embedding=embeddings_aws_bedrock_choere,
    vector_name="dense_text",
    distance="Euclid", # Euclidean distance (L2), Inner Product (IP), Cosine Similarity (COSINE),
    # /dense vector section
)

### 1.4 Gemini Embedding Exp 03 07

In [None]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams

# Build the LangChain vector store on top of the existing Gemini embedding collection (dense only).
# Unlike the sibling cells, the distance is passed as the Distance enum rather than the string "Euclid".
vector_store_gemini_exp_03_07 = QdrantVectorStore.from_existing_collection(
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name=qdrant_collection_name_gemini_exp_03_07,
    # dense vector section (name and distance match the collection created above)
    embedding=embeddings_gemini_exp_03_07,
    vector_name="dense_text",
    distance=Distance.EUCLID, # Euclidean distance (L2), Inner Product (IP), Cosine Similarity (COSINE),
    # /dense vector section
)

## 2.　文件嵌入

### 2.1 從文件 insert 向量

#### 2.1.1 BAAI BGE-M3

In [None]:
import os
from glob import glob
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# Embed every PDF found recursively under ./ctbc_sa_doc into the BGE-M3 store.
for file_path in tqdm(glob("./ctbc_sa_doc/**/*.pdf", recursive=True)):
    # The file name without directory and extension is stored as metadata.doc_name.
    file_full_path, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_full_path)
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Prints the document name and len(docs), labelled as the page count.
    print("文件", file_name, "頁數", len(docs))
    # Insert in batches of 5 loaded documents per add_documents call.
    for i in range(0, len(docs), 5):
        # Rebuild each Document so only page_content and doc_name are kept (loader metadata is dropped).
        langchain_doc_list = [Document(page_content = d.page_content, metadata={"doc_name": file_name}) for d in docs[i:i+5]]
        # Fresh random UUIDs are used as ids, so re-running the cell inserts new points rather than overwriting.
        uuids = [str(uuid4()) for _ in range(len(langchain_doc_list))]
        vector_store_bge_m3.add_documents(documents=langchain_doc_list, ids=uuids)

#### 2.1.2 Microsoft multilingual-e5-large

In [None]:
import os
from glob import glob
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# Embed every file matched by the glob pattern (placeholder here) into the multilingual-e5-large store.
for file_path in tqdm(glob("...", recursive=True)):
    # The file name without directory and extension is stored as metadata.doc_name.
    file_full_path, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_full_path)
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Prints the document name and len(docs), labelled as the page count.
    print("文件", file_name, "頁數", len(docs))
    # Insert in batches of 30 loaded documents per add_documents call.
    for i in range(0, len(docs), 30):
        # Rebuild each Document so only page_content and doc_name are kept (loader metadata is dropped).
        langchain_doc_list = [Document(page_content = d.page_content, metadata={"doc_name": file_name}) for d in docs[i:i+30]]
        # Fresh random UUIDs are used as ids, so re-running the cell inserts new points rather than overwriting.
        uuids = [str(uuid4()) for _ in range(len(langchain_doc_list))]
        vector_store_multilingual_e5_large.add_documents(documents=langchain_doc_list, ids=uuids)


#### 2.1.3 Cohere multilingual v3

In [None]:
# Scratch check: print the end offset (start + 2048) of each 2048-wide step over a length of 3429.
for i in range(0, 3429, 2048):
    print(i+2048)

# Scratch check of a length comparison on a 10-character string.
x="1234567890"
if len(x) >9:
    print(">9")
else:
    # NOTE(review): this branch also covers len(x) == 9, so the "<9" label is not exact.
    print("<9")

In [None]:
import os
from glob import glob
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# Embed every file matched by the glob pattern (placeholder here) into the Cohere multilingual v3 store.
for file_path in tqdm(glob("...", recursive=True)):
    # The file name without directory and extension is stored as metadata.doc_name.
    file_full_path, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_full_path)
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Prints the document name and len(docs), labelled as the page count.
    print("文件", file_name, "頁數", len(docs))

    # Process 30 loaded documents per add_documents call.
    for i in range(0, len(docs), 30):
        langchain_doc_list=[]
        for d in docs[i:i+30]:
            # Texts longer than 2048 characters are split into consecutive 2048-character slices.
            if len(d.page_content) >2048:
                # Despite its name, `chunks` holds the text length (the range stop below);
                # `chunk_size` is the number of full 2048-character chunks and is only printed.
                chunks, chunk_size = len(d.page_content), len(d.page_content)//2048
                print(len(d.page_content),chunks,chunk_size)
                # The comprehension's `i` is scoped to the comprehension and does not touch the outer loop's `i`.
                langchain_doc_list+=[ Document(page_content=d.page_content[i:i+2048] , metadata={"doc_name": file_name}) for i in range(0, chunks, 2048) ]
            else:
                
                langchain_doc_list.append(Document(page_content=d.page_content, metadata={"doc_name": file_name}))
        print("len langchain_doc_list",len(langchain_doc_list))
        # Fresh random UUIDs are used as ids, so re-running the cell inserts new points rather than overwriting.
        uuids = [str(uuid4()) for _ in range(len(langchain_doc_list))]
        vector_store_cohere_multilingual_v3.add_documents(documents=langchain_doc_list, ids=uuids)


#### 2.1.4 Gemini Embedding Exp 03 07

In [None]:
import os
import time
from glob import glob
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# PDF
# Insert data from files into the Gemini embedding store, one loaded page per request.
for file_path in tqdm(glob("..", recursive=True)):
    # The file name without directory and extension is stored as metadata.doc_name.
    file_full_path, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_full_path)
    loader = PyPDFLoader(file_path)
    docs = loader.load()

    # Prints the document name and len(docs), labelled as the page count.
    print("文件", file_name, "頁數", len(docs))

    # Iterate over ALL pages: the previous bound of len(docs)-1 silently skipped the last page
    # (and embedded nothing at all for single-page files).
    for i in range(len(docs)):
        langchain_doc_list = [
            Document(page_content=docs[i].page_content, metadata={"doc_name": file_name})
        ]
        # Fresh random UUIDs are used as ids, so re-running the cell inserts new points rather than overwriting.
        uuids = [str(uuid4()) for _ in range(len(langchain_doc_list))]
        vector_store_gemini_exp_03_07.add_documents(documents=langchain_doc_list, ids=uuids)
        # Pause 20 seconds between requests.
        time.sleep(20)
        

### 2.2 test

In [None]:
from uuid import uuid4
from langchain_core.documents import Document

# Four short test sentences, each tagged with a distinct doc_name for filter experiments.
documents = [
    Document(
        page_content="漢皇重色思傾國，御宇多年求不得。",
        metadata={"doc_name": "sentence1"},
    ),
    Document(
        page_content="楊家有女初長成，養在深閏人未識。",
        metadata={"doc_name": "sentence2"},
    ),
    Document(
        page_content="天生麗質難自棄，一朝選在君王側。",
        metadata={"doc_name": "sentence3"},
    ),
    Document(
        page_content="回眸一笑百媚生，六宮粉黛無顏色。",
        metadata={"doc_name": "sentence4"},
    )
]

# One random UUID per document.
uuids = [str(uuid4()) for _ in range(len(documents))]

# `vector_store` is not defined at this point on a fresh kernel (a variable with that name is only
# created by a later PGVector cell), so the insert names an existing store explicitly.
# NOTE(review): the BGE-M3 store is assumed because the query cells carry a commented-out
# "sentence1" filter; switch to another vector_store_* variable if a different target was intended.
vector_store_bge_m3.add_documents(documents=documents, ids=uuids)


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams


# Qdrant client (this cell uses its own literal connection settings, placeholders here)
qdrant_client = QdrantClient(
    url='...',
    api_key='...',
)

collection_name_text_embedding_004='sa-docs-collection-text-embedding-004'

# Check whether the collection exists and delete it if so.
# WARNING: this drops every point already stored in the collection each time the cell runs.
is_col_exist = qdrant_client.collection_exists(collection_name=collection_name_text_embedding_004)
if is_col_exist:
    qdrant_client.delete_collection(collection_name=collection_name_text_embedding_004)

# Create the collection with a single unnamed 768-dimensional cosine vector
# Index settings reference: https://qdrant.tech/documentation/concepts/indexing/
qdrant_client.create_collection(
    collection_name=collection_name_text_embedding_004,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)
)

# Inspect the collection that was just created.
# The previous curl line interpolated an undefined variable and pointed at a different collection;
# asking the same client guarantees the same server and the right collection name.
qdrant_client.get_collection(collection_name=collection_name_text_embedding_004)

In [None]:
import time
from glob import glob
import os
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# Google text-embedding-004 model (API key placeholder here).
embeddings_text_embedding_004 = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key='...')

# Build the LangChain vector store on top of the existing text-embedding-004 collection
vector_store_text_embedding_004 = QdrantVectorStore.from_existing_collection(
    url='...',
    api_key='api-key',
    collection_name=collection_name_text_embedding_004,
    # dense vector section
    embedding=embeddings_text_embedding_004,
    distance="Cosine",  # Euclidean distance (L2), Inner Product (IP), Cosine Similarity (COSINE),
    # /dense vector section
)

# Embed every file matched by the glob pattern (placeholder here) using the Word document loader.
for file_path in tqdm(glob("...", recursive=True)):
    # The file name without directory and extension is stored as metadata.doc_name.
    file_full_path, file_extension = os.path.splitext(file_path)
    file_name = os.path.basename(file_full_path)
    loader = UnstructuredWordDocumentLoader(file_path)
    docs = loader.load()
    print("文件", file_name, "頁數", len(docs))

    # Process 30 loaded documents per add_documents call.
    for i in range(0, len(docs), 30):
        langchain_doc_list=[]
        for d in docs[i:i+30]:
            text = d.page_content
            if len(text) >2048:
                # Split long texts into consecutive 2048-character slices.
                print("len>2048",len(text),len(text),len(text)//2048)
                # The text must go into `page_content`: that is the field the vector store embeds.
                # Previously page_content was "" and the text was passed as an unrelated `content` kwarg.
                langchain_doc_list+=[
                    Document(page_content=text[start:start+2048], metadata={"doc_name": file_name})
                    for start in range(0, len(text), 2048)
                ]
            else:
                langchain_doc_list.append(Document(page_content=text, metadata={"doc_name": file_name}))
        print("嵌入 -> len langchain_doc_list",len(langchain_doc_list))
        # Fresh random UUIDs are used as ids, so re-running the cell inserts new points rather than overwriting.
        uuids = [str(uuid4()) for _ in range(len(langchain_doc_list))]
        vector_store_text_embedding_004.add_documents(documents=langchain_doc_list, ids=uuids)
        # Pause 10 seconds between batches.
        time.sleep(10)

In [None]:
%pip install -qU langchain_postgres
%pip install "psycopg[binary]"

In [None]:
from langchain_postgres import PGVector
import time
from glob import glob
import os
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from uuid import uuid4
from langchain_core.documents import Document

# Google text-embedding-004 model (API key placeholder here).
embeddings_text_embedding_004 = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key='...')

# See docker command above to launch a postgres instance with pgvector enabled.
# NOTE(review): no docker command appears in this notebook — add it or drop the reference.
connection = "..."  # Uses psycopg3!
collection_name = "my_docs"

# PGVector-backed store; from here on `vector_store` refers to this Postgres store.
vector_store = PGVector(
    embeddings=embeddings_text_embedding_004,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

from langchain_core.documents import Document

# Ten sample documents; each carries an integer id plus location/topic metadata.
docs = [
    Document(
        page_content="there are cats in the pond",
        metadata={"id": 1, "location": "pond", "topic": "animals"},
    ),
    Document(
        page_content="ducks are also found in the pond",
        metadata={"id": 2, "location": "pond", "topic": "animals"},
    ),
    Document(
        page_content="fresh apples are available at the market",
        metadata={"id": 3, "location": "market", "topic": "food"},
    ),
    Document(
        page_content="the market also sells fresh oranges",
        metadata={"id": 4, "location": "market", "topic": "food"},
    ),
    Document(
        page_content="the new art exhibit is fascinating",
        metadata={"id": 5, "location": "museum", "topic": "art"},
    ),
    Document(
        page_content="a sculpture exhibit is also at the museum",
        metadata={"id": 6, "location": "museum", "topic": "art"},
    ),
    Document(
        page_content="a new coffee shop opened on Main Street",
        metadata={"id": 7, "location": "Main Street", "topic": "food"},
    ),
    Document(
        page_content="the book club meets at the library",
        metadata={"id": 8, "location": "library", "topic": "reading"},
    ),
    Document(
        page_content="the library hosts a weekly story time for kids",
        metadata={"id": 9, "location": "library", "topic": "reading"},
    ),
    Document(
        page_content="a cooking class for beginners is offered at the community center",
        metadata={"id": 10, "location": "community center", "topic": "classes"},
    ),
]

# The metadata "id" values are reused as the store ids, so the ids are the same on every run.
vector_store.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

# 查詢測試

> metadata 是 json 物件，可以用裡面的屬性作 filter

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

# Qdrant filter reference: https://qdrant.tech/documentation/concepts/filtering/
# Query the BGE-M3 store; the `should` list is empty because its only condition is commented out.
results = vector_store_bge_m3.similarity_search(
    "台外幣歷史明細",
    k=10, # maximum number of results returned
    filter=Filter(
        should=[
            # FieldCondition(
            #     key="metadata.doc_name",
            #     match=MatchValue(
            #         value="sentence1"
            #     ),
            # ),
        ]
    ),
)
# Print each hit's doc_name followed by its text, with newlines shown as <br>.
for res in results:
    # print(f"* {res.page_content} [{res.metadata}]")
    print(res.metadata["doc_name"],res.page_content.replace("\n","<br>"))

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

# Qdrant filter reference: https://qdrant.tech/documentation/concepts/filtering/
# Query the multilingual-e5-large store; the `should` list is empty because its only condition is commented out.
results = vector_store_multilingual_e5_large.similarity_search(
    "高齡保險",
    k=10, # maximum number of results returned
    filter=Filter(
        should=[
            # FieldCondition(
            #     key="metadata.doc_name",
            #     match=MatchValue(
            #         value="sentence1"
            #     ),
            # ),
        ]
    ),
)
# Print each hit's doc_name followed by its text, with newlines shown as <br>.
for res in results:
    # print(f"* {res.page_content} [{res.metadata}]")
    print(res.metadata["doc_name"],res.page_content.replace("\n","<br>"))

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchText

# Qdrant filter reference: https://qdrant.tech/documentation/concepts/filtering/
# Query the Cohere store with a MatchText condition on metadata.doc_name
# (query text and match text are placeholders here).
results = vector_store_cohere_multilingual_v3.similarity_search(
    "...",
    k=10, # maximum number of results returned
    filter=Filter(
        should=[
            FieldCondition(
                key="metadata.doc_name",
                match=MatchText(
                    text="..."
                ),
            ),
        ]
    ),
)
# Print each hit's doc_name followed by its text, with newlines shown as <br>.
for res in results:
    # print(f"* {res.page_content} [{res.metadata}]")
    print(res.metadata["doc_name"],res.page_content.replace("\n","<br>"))

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchText

# Qdrant filter reference: https://qdrant.tech/documentation/concepts/filtering/
# Query the Gemini store with a MatchText condition on metadata.doc_name
# (query text and match text are placeholders here).
results = vector_store_gemini_exp_03_07.similarity_search(
    "...",
    k=10, # maximum number of results returned
    filter=Filter(
        should=[
            FieldCondition(
                key="metadata.doc_name",
                match=MatchText(
                    text="..."
                ),
            ),
        ]
    ),
)
# Print each hit's doc_name followed by its text, with newlines shown as <br>.
for res in results:
    # print(f"* {res.page_content} [{res.metadata}]")
    print(res.metadata["doc_name"],res.page_content.replace("\n","<br>"))