In [2]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.callbacks import get_openai_callback, openai_info
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

In [3]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment, then
# read the OpenAI credentials from it.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
# An unset or empty base URL is normalised to None.
base_url = os.environ.get("OPENAI_BASE_URL") or None

In [4]:
# Chat model client, configured with the key and endpoint read from the environment.
llm = ChatOpenAI(openai_api_key=api_key, base_url=base_url)

In [5]:
# Embedding client, configured with the same key and endpoint as the chat model.
embeddings = OpenAIEmbeddings(openai_api_key=api_key, base_url=base_url)

In [7]:
# Directory in which the Chroma store is persisted.
CHROMA_DIR = "database"

# Texts used only to seed a brand-new store.
test_chunks = ["Initialize a Chroma Database.", "Hello World!"]

# Build the path with os.path.join: a hard-coded "\\" separator only matches
# on Windows, so elsewhere the check was always False and the seed texts were
# added to the store again on every run.
if os.path.exists(os.path.join(CHROMA_DIR, "chroma.sqlite3")):
    # A persisted store already exists: reopen it.
    stored_vectors = Chroma(
        embedding_function=embeddings,
        persist_directory=CHROMA_DIR,
    )
else:
    # No persisted store yet: create one from the seed texts.
    stored_vectors = Chroma.from_texts(
        texts=test_chunks,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )

In [8]:
# Prompt text: the retrieved documents fill {context}, the user question fills {input}.
prompt_template = (
    "Answer the following question based on the provided knowledge: \n"
    "You will give 100 dollars tips if you give reliable answer\n"
    "<knowledge>\n"
    "{context}\n"
    "</knowledge>\n"
    "Question: {input}"
)

retrieval_prompt = ChatPromptTemplate.from_template(prompt_template)

# Chain that passes the documents and the question to the LLM through the prompt.
documents_chain = create_stuff_documents_chain(llm, retrieval_prompt)

# Full pipeline: retrieve from the vector store, then answer with documents_chain.
retrieval_chain = create_retrieval_chain(
    retriever=stored_vectors.as_retriever(),
    combine_docs_chain=documents_chain,
)

In [14]:
def get_answer(question):
    """
    Run the retrieval chain on a question and return its answer.

    Args:
        question (str): The question, passed to the chain under the "input" key.

    Returns:
        The value stored under the "answer" key of the chain's response.
    """
    return retrieval_chain.invoke({"input": question})["answer"]

In [10]:
# Install the .docx text extractor; %pip installs into the environment of the
# running kernel. NOTE(review): the version is not pinned, consider pinning it.
%pip install docx2txt
# Imported in this cell, right after the install line above.
import docx2txt

Note: you may need to restart the kernel to use updated packages.


In [11]:
def add_to_vectorstore(
    corpus_data, metadata=None, chunk_size=1500, overlap=100, batch_size=10
):
    """
    Split a text corpus into overlapping chunks and add them to the vector store.

    Args:
        corpus_data (str): The text content to be vectorized and added.
        metadata (dict | list[dict] | None): Metadata for the stored chunks.
            A single dict is attached (as a copy) to every chunk. A list is
            treated as one entry per chunk, in chunk order, and is sliced so
            each batch receives only its own entries.
        chunk_size (int): Maximum number of characters per chunk.
        overlap (int): Number of characters shared by consecutive chunks.
        batch_size (int): Number of chunks sent per ``add_texts`` call.

    Raises:
        ValueError: If ``chunk_size`` or ``batch_size`` is not positive, or if
            ``overlap`` is negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size would make the stride zero or negative, which
    # either crashes range() or silently produces no chunks at all.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    corpus_length = len(corpus_data)
    print(f"Processing Text Corpus File with {corpus_length} Characters...")
    if corpus_length == 0:
        # Nothing to split or store.
        return

    stride = chunk_size - overlap
    # A chunk starting at or beyond (corpus_length - overlap) would contain
    # only text already covered by the previous chunk, so stop before it.
    # max(..., 1) keeps the single chunk of a corpus shorter than the overlap.
    start_limit = max(corpus_length - overlap, 1)
    chunks = [
        corpus_data[start : start + chunk_size]
        for start in range(0, start_limit, stride)
    ]
    num_chunks = len(chunks)

    for i in range(0, num_chunks, batch_size):
        chunk_subset = chunks[i : i + batch_size]

        # add_texts receives one metadata entry per text in the batch.
        if metadata is None:
            batch_metadatas = None
        elif isinstance(metadata, dict):
            batch_metadatas = [dict(metadata) for _ in chunk_subset]
        else:
            batch_metadatas = list(metadata[i : i + batch_size]) or None

        stored_vectors.add_texts(
            texts=chunk_subset,
            metadatas=batch_metadatas,
        )
        print(f"Processed {i + len(chunk_subset)}/{num_chunks} Items in Corpus!")

In [12]:
def vecterize_corpus(file_path, file_type):
    """
    Vectorizes the corpus data from the specified file.

    Only ".docx" files are handled. For any other type nothing is added to the
    vector store and a "skipped" message is printed instead of the success
    message.

    Args:
        file_path (str): The path to the file.
        file_type (str): The file extension including the dot (e.g., ".docx").
            It is compared case-insensitively.
    """
    # Handle the file content according to its type.
    normalized_type = file_type.lower() if isinstance(file_type, str) else file_type

    if normalized_type not in (".docx",):
        # Do not report success for a file that was never processed.
        print(f"File '{file_path}' is skipped: unsupported file type '{file_type}'.")
        return

    text_content = docx2txt.process(file_path)
    add_to_vectorstore(corpus_data=text_content)

    print(f"File '{file_path}' is vecterizied.")

In [None]:
# Ingest the DDE white paper. The path is built with os.path.join so it also
# resolves on systems that do not use "\\" as the separator.
# NOTE(review): add_to_vectorstore shows no de-duplication, so re-running this
# cell presumably stores the document's chunks again - confirm before re-running.
vecterize_corpus(os.path.join('docs', 'DDE白皮书（英文）.docx'), '.docx')

In [15]:
# Re-create the retrieval chain from the current vector store before querying.
retrieval_chain = create_retrieval_chain(
    stored_vectors.as_retriever(),
    documents_chain,
)

question1 = "What is the The overall MISSION of DDE?"
print(f"Question: {question1}")
answer1 = get_answer(question1)
print(f"Answer: {answer1}")

Question: What is the The overall MISSION of DDE?
Answer: The overall mission of DDE is to transform Earth science by harmonizing global geoscience data, sharing global geoscience knowledge, developing and disseminating advanced methods to analyze and visualize data, and fostering a deep-time data-driven research paradigm.
