In [None]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_ollama import OllamaLLM
import faiss
# 保存文档和嵌入
import pickle

In [None]:
# All three prompts share the same background sentence (human, PBMC, a T-cell
# sub-type as the target) and the same closing request; only the list of
# differential genes changes, so they are built from one template.
_PROMPT_TEMPLATE = (
    "\n"
    "背景信息： 用户提供物种为人类，组织类型为PBMC，目标细胞类型为“小类细胞类型（对应大类细胞类型为T细胞）”。\n"
    "差异基因： {genes}\n"
    "请给出预测的细胞类型\n"
)

# Full ten-gene list.
question5 = _PROMPT_TEMPLATE.format(
    genes="IL7R, TMSB10, CD4, ITGB1, LTB, TRAC, AQP3, LDHB, IL32, MAL"
)

# Only the first four genes of question5.
question6 = _PROMPT_TEMPLATE.format(genes="IL7R, TMSB10, CD4, ITGB1")

# The four genes of question6 followed by HBD, HBM, AHSP and ALAS2.
question7 = _PROMPT_TEMPLATE.format(
    genes="IL7R, TMSB10, CD4, ITGB1, HBD, HBM, AHSP, ALAS2"
)

In [None]:
# Ollama-backed LLM using the model registered as "CellType", with
# temperature set to 1.
# (To compare against another model, use e.g. OllamaLLM(model="llama3.1").)
llm = OllamaLLM(
    temperature=1,
    model="CellType",
)

# Ask the model directly, before any retrieval chain is built, so the answer
# reflects the model alone.
response = llm.invoke(question5)
print(response)

In [None]:
# Load the marker reference PDF into LangChain documents; `docs` is what the
# text splitter consumes next.
loader = UnstructuredFileLoader(
    "azimuth_cellmarker_v2.pdf",
)
docs = loader.load()

In [None]:

# Split the loaded documents on newlines into chunks of size 500 with an
# overlap of 200 between neighbouring chunks (sizes are measured by the
# splitter's default length function).
text_splitter = CharacterTextSplitter(chunk_overlap=200, chunk_size=500, separator="\n")
texts = text_splitter.split_documents(docs)

In [None]:
# Sentence-embedding model used to index and query the document chunks.
# HuggingFaceEmbeddings takes the download/cache location as `cache_folder`;
# it rejects unknown keyword arguments, so `cache_dir` fails validation.
# No model name is given, so the library's default embedding model is used.
embeddings = HuggingFaceEmbeddings(cache_folder="./embedding")

In [None]:
# Build an in-memory FAISS index over the chunks, embedding each one with
# `embeddings`. Requires the FAISS package: pip install faiss-cpu
db = FAISS.from_documents(documents=texts, embedding=embeddings)

In [None]:
# Retriever over the FAISS index, configured with k=2 (number of chunks
# requested per query).
retriever = db.as_retriever(
    search_kwargs={"k": 2},
)

In [None]:
# Persist the split documents and the embeddings object so a later session
# can reload them instead of re-running the loading and splitting steps.
for pkl_path, payload in (
    ('single_cell_markers_documents.pkl', texts),
    ('single_cell_markers_embeddings.pkl', embeddings),
):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(payload, fh)

In [None]:
# Reload the split documents and the embeddings object from disk.
# The file names must be the ones the save step writes
# ('single_cell_markers_*.pkl'); otherwise this cell fails with
# FileNotFoundError or silently picks up unrelated files.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickles that were produced locally by this notebook.
with open('single_cell_markers_documents.pkl', 'rb') as f:
    texts = pickle.load(f)

with open('single_cell_markers_embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)

# Build a fresh FAISS index from the reloaded documents. The vectors
# themselves are not stored in the pickles, so the chunks are embedded again
# here.
db = FAISS.from_documents(texts, embeddings)

# Retriever with default search settings.
# NOTE(review): the retriever created right after the first indexing step
# passes search_kwargs={"k": 2}; this one does not. Confirm which is intended.
retriever = db.as_retriever()

In [None]:
# Retrieval-augmented QA chain: chunks returned by `retriever` are handed to
# `llm` together with the query. No chain_type is given, so the library
# default applies.
chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

In [None]:
# Smoke test of the chain: ask it to summarise the indexed document.
question = "请总结一下文档"
result = chain.invoke(
    {"query": question},
)

# The chain returns a mapping; the generated answer is under "result".
print(result["result"])

In [None]:
# Run the retrieval chain on question5 (the full ten-gene prompt) and print
# the generated answer.
result = chain.invoke(
    {"query": question5},
)
print(result["result"])

In [None]:
# Run the retrieval chain on question6 (the four-gene prompt) and print the
# generated answer.
result = chain.invoke(
    {"query": question6},
)
print(result["result"])

In [None]:
# Run the retrieval chain on question7 (four genes plus HBD, HBM, AHSP,
# ALAS2) and print the generated answer.
result = chain.invoke(
    {"query": question7},
)
print(result["result"])