In [19]:
from langchain_community.llms import Ollama
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [20]:
# Instantiate the Ollama LLM wrapper that the retrieval chain below is built on.
llm = Ollama(
    model="gemma:2b-instruct-q4_0",
    temperature=0.7,
)

In [21]:
# Read the source PDF from disk; `documents` holds whatever the loader returns
# and is the input to the text splitter in the following cell.
loader = UnstructuredFileLoader("Data/goog-10-k-2023 (1).pdf")
documents = loader.load()

In [22]:
# Break the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_chunks = text_splitter.split_documents(documents)

In [23]:
# Embedding model for the vector index; no model name is passed, so the
# class's own default is used.
embeddings = HuggingFaceEmbeddings()
# Build a FAISS vector store from the chunks using that embedding model.
knowledge_base = FAISS.from_documents(text_chunks, embeddings)



In [24]:
# Retrieval chain: wires the LLM to a retriever over the FAISS store.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=knowledge_base.as_retriever(),
)

In [25]:
# First smoke-test question; the chain is invoked with it under the "query" key.
question = "What is this documentation?"
response = qa_chain.invoke({"query": question})


In [26]:
# Print only the value stored under the 'result' key of the response.
print(response['result'])

The context describes a table of contents for an annual report on Form 10-K for Alphabet Inc. It provides information about the company's compensation policies, including director and executive compensation.

The information required by each item will be included in the corresponding captions in the table of contents.


In [27]:
# Second smoke-test question; rebinds `question` and `response` from the earlier cell.
question = "What is the company name here?"
response = qa_chain.invoke({"query": question})


In [28]:
# Print only the value stored under the 'result' key of the response.
print(response['result'])

The company name in the context is Alphabet Inc.


## Data Ingestion

In [29]:
import os
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from io import BytesIO

def save_uploadedfile(uploadedfile):
    """Persist an uploaded file under ``tempDir`` and return the saved path.

    Args:
        uploadedfile: Upload object exposing a ``name`` attribute (the
            client-supplied file name) and a ``getbuffer()`` method that
            returns the file's bytes.

    Returns:
        The path of the written file, ``tempDir/<base name>``.

    Raises:
        ValueError: If the supplied name has no usable file-name component.
    """
    # The name is supplied by whoever uploaded the file, so keep only its final
    # path component; otherwise a name like "../x" would be written outside
    # tempDir.
    file_name = os.path.basename(uploadedfile.name)
    if file_name in ("", ".", ".."):
        raise ValueError(f"Invalid upload file name: {uploadedfile.name!r}")

    # exist_ok=True replaces the separate exists() check, which could race with
    # another caller creating the directory in between.
    os.makedirs("tempDir", exist_ok=True)
    file_path = os.path.join("tempDir", file_name)
    with open(file_path, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return file_path

def process_files(files):
    """Save each uploaded file to disk and load it into documents.

    Every item in ``files`` is written out with ``save_uploadedfile`` and the
    saved path is then loaded with ``UnstructuredFileLoader``; the per-file
    results are concatenated in input order.

    Args:
        files: Iterable of uploaded-file objects accepted by
            ``save_uploadedfile``.

    Returns:
        A flat list of everything the loaders returned.
    """
    return [
        doc
        for uploaded in files
        for doc in UnstructuredFileLoader(save_uploadedfile(uploaded)).load()
    ]

def split_documents(documents):
    """Split documents into overlapping chunks.

    Args:
        documents: Documents to split (in this notebook, the output of a
            loader's ``load()``).

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``
        for a splitter configured with ``chunk_size=1000`` and
        ``chunk_overlap=200``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.split_documents(documents)


## Embeddings

In [4]:
from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS, chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings


# Shared embedding model: create_embeddings() and load_embeddings() below both
# read this module-level name. No model name is passed, so the class default applies.
# NOTE(review): this rebinds the `embeddings` global assigned in an earlier cell,
# and this cell's execution count is out of sequence with its neighbours --
# confirm the notebook still works under Restart & Run All.
embeddings = HuggingFaceEmbeddings()

def create_embeddings(documents):
    """Build a FAISS vector store from ``documents``.

    Vectors are computed with the module-level ``embeddings`` object.

    Args:
        documents: Documents (for example the chunks from ``split_documents``)
            to index.

    Returns:
        The store created by ``FAISS.from_documents``.
    """
    # Disabled alternatives that used to sit here as commented-out code:
    # OllamaEmbeddings(model="gemma:2b-instruct-q4_0") as the embedder, and a
    # Chroma store in place of FAISS.
    return FAISS.from_documents(documents, embeddings)

def save_embeddings(knowledge_base, file_path):
    """Write a vector store to disk.

    Args:
        knowledge_base: Store object exposing a ``save_local`` method.
        file_path: Destination handed straight through to ``save_local``.

    Returns:
        None; whatever ``save_local`` returns is discarded.
    """
    knowledge_base.save_local(file_path)
    return None

def load_embeddings(file_path):
    """Load a FAISS vector store previously saved at ``file_path``.

    The module-level ``embeddings`` object is passed to ``FAISS.load_local``
    alongside the path.

    Args:
        file_path: Location to load the store from.

    Returns:
        The store returned by ``FAISS.load_local``.
    """
    # NOTE(review): allow_dangerous_deserialization=True opts in to unsafe
    # deserialization of the stored index (per the flag's name). Only point
    # this at index directories this application wrote itself -- confirm that
    # no caller passes a user-supplied path.
    store = FAISS.load_local(
        file_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    return store


  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


## QA

In [30]:
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

def create_qa_chain(knowledge_base, model="gemma:2b-instruct-q4_0", temperature=0.7):
    """Build a RetrievalQA chain over a vector store.

    Args:
        knowledge_base: Vector store exposing ``as_retriever()``.
        model: Name of the Ollama model to use. Defaults to the value that was
            previously hard-coded here, so existing one-argument calls behave
            the same.
        temperature: Sampling temperature passed to ``Ollama``. Defaults to the
            previously hard-coded 0.7.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.
    """
    llm = Ollama(model=model, temperature=temperature)
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=knowledge_base.as_retriever(),
    )

def ask_question(qa_chain, question):
    """Run a question through a QA chain and return the answer text.

    Args:
        qa_chain: Object with an ``invoke`` method that accepts
            ``{"query": ...}`` and returns a mapping containing ``"result"``.
        question: The question to ask.

    Returns:
        The value stored under ``"result"`` in the chain's response.
    """
    return qa_chain.invoke({"query": question})["result"]


### Test

In [32]:
# End-to-end check of the helpers defined above: load -> split -> embed -> chain -> ask.
loader = UnstructuredFileLoader("Data/goog-10-k-2023 (1).pdf")
documents = loader.load()

text_split = split_documents(documents)
knowledge_base_1 = create_embeddings(text_split)
qa_chain = create_qa_chain(knowledge_base_1)

prompt = "What is the company name?"
response = ask_question(qa_chain, prompt)


In [34]:
# `response` here is the value ask_question() returned in the previous cell.
print(response)

The company name is Alphabet Inc.

The passage does not explicitly mention the company name, so I cannot answer this question from the provided context.
