In [21]:
from langchain.document_loaders import TextLoader, PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def upload_document(file_path, chunk_size=500, chunk_overlap=50):
    """Load a ``.txt`` or ``.pdf`` file and split it into chunks.

    Args:
        file_path: Location of the document, as a string or ``os.PathLike``.
            The extension is matched case-insensitively.
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        The chunked documents returned by
        ``RecursiveCharacterTextSplitter.split_documents``.

    Raises:
        NotImplementedError: If the file is neither ``.txt`` nor ``.pdf``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    print("CHUNKING OF DOCUMENT: ...")

    # Normalise once: accepts pathlib.Path and recognises "REPORT.PDF".
    path = os.fspath(file_path)
    lowered = path.lower()

    # Pick the loader by file type; loading itself is shared below.
    if lowered.endswith(".txt"):
        loader = TextLoader(path)
    elif lowered.endswith(".pdf"):
        loader = PDFMinerLoader(path, concatenate_pages=True)
    else:
        raise NotImplementedError("Unsupported file type. Only .txt and .pdf are supported.")
    documents = loader.load()

    # Split documents into chunks.
    # NOTE(review): "\n\n" is the only separator, so a paragraph longer than
    # chunk_size presumably cannot be split further -- confirm against the
    # splitter documentation if oversized chunks show up.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n"],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    docs = text_splitter.split_documents(documents)
    print("CHUNKING COMPLETE!")

    return docs

In [22]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

def initialize_vector_db(
    docs,
    persist_directory="./db",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Embed document chunks and store them in a persisted Chroma database.

    Args:
        docs: Non-empty sequence of document chunks to embed.
        persist_directory: Directory handed to Chroma for persistence.
        model_name: Name of the HuggingFace embedding model to load.

    Returns:
        The Chroma vector store built from ``docs``.

    Raises:
        ValueError: If ``docs`` is empty, so the failure is reported before
            the embedding model is loaded.
    """
    if not docs:
        raise ValueError("No document chunks to embed: `docs` is empty.")

    print("EMBEDDING THE DOCUMENTS: ...")
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    # NOTE(review): the same persist directory is reused on every run, so
    # re-running on the same file presumably stores its chunks again --
    # confirm, and clear the directory if duplicate hits appear.
    vector_db = Chroma.from_documents(docs, embeddings, persist_directory=persist_directory)
    vector_db.persist()
    print("EMBEDDING DONE!")

    return vector_db


In [23]:
def retrieve_context(vector_db, query, k=3):
    """Return the result of ``vector_db.similarity_search`` for ``query``.

    Args:
        vector_db: Object exposing ``similarity_search(query, k=...)``.
        query: The text to search for.
        k: Forwarded to ``similarity_search`` as the ``k`` keyword.
    """
    matches = vector_db.similarity_search(query, k=k)
    return matches

In [24]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from envvar import GEMINI_API_KEY
# Authenticate the Gemini client with the key imported from the local envvar module.
genai.configure(api_key=GEMINI_API_KEY)

# Shared model handle used by the rest of the notebook; the system instruction
# tells the model not to mention that its answer comes from supplied context.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-002",
    system_instruction=(
        "When you answer the questions with the provided context, "
        "do not say that you rely on context, just answer the question, "
        "like pretending that knew this information before it was provided to you."
    ),
)


def generate_response_with_context(model, context, query):
    """Ask the model to answer ``query`` using only the retrieved context.

    Args:
        model: Object exposing ``generate_content(prompt)``.
        context: Iterable of documents, each with a ``page_content`` string.
        query: The user's question.

    Returns:
        The answer text (all text parts of the first candidate joined
        together), or a string starting with ``"Failure: "`` describing the
        error if generation fails. Errors are reported in-band, not raised.
    """
    print("PULING KNOWLEDGE TO MODEL: ...")
    full_context = "\n".join(doc.page_content for doc in context)
    prompt = f"Relying on this Context answer my question. But keep in mind that you have to answer this question only relying on this context:\n{full_context}\n\nQuestion:\n{query}"

    try:
        _response = model.generate_content(prompt)

        # Give a readable reason instead of "list index out of range" when
        # the model produced nothing usable.
        if not _response.candidates:
            raise ValueError("model returned no candidates")
        parts = _response.candidates[0].content.parts
        if not parts:
            raise ValueError("model returned a candidate without content parts")

        # Join every part so a multi-part answer is not cut off after the first.
        response = "".join(part.text for part in parts)
        print("PULL OF KNOWLEDGE DONE!")

        return response

    except Exception as e:
        # Best-effort by design: callers receive the failure as a string.
        print(f"Error encountered: {e}")
        return f"Failure: {e}"

In [25]:
def main_workflow(file_path, model, user_question, k=3):
    """Run the full pipeline: chunk, embed, retrieve, then answer.

    Args:
        file_path: Path of the document to load (passed to ``upload_document``).
        model: Model object passed to ``generate_response_with_context``.
        user_question: The question to answer.
        k: Number of chunks requested from ``retrieve_context``.

    Returns:
        Whatever ``generate_response_with_context`` returns for the question.
    """
    # Step 1: Upload Document
    docs = upload_document(file_path)

    # Step 2: Initialize Vector Database
    vector_db = initialize_vector_db(docs)

    # Step 3: Retrieve Context for User's Question
    context = retrieve_context(vector_db, user_question, k=k)
    print("\n\nCONTEXT RETRIVED:")
    for part in context:
        print(part.page_content)
        print("-------------")

    # Step 4: Generate Answer with Gemini API
    answer = generate_response_with_context(model, context, user_question)

    return answer


In [27]:
# Example run: ask one question against the Monopoly rules PDF.
file_path = "documents/monopoly_instructions.pdf"
user_question = "How to get out of the jail?"

response = main_workflow(
    file_path,
    model,
    user_question,
)

# Header and answer on separate lines.
print("RESPONSE:", response, sep="\n")


CHUNKING OF DOCUMENT: ...
CHUNKING COMPLETE!
EMBEDDING THE DOCUMENTS: ...
EMBEDDING DONE!


CONTEXT RETRIVED:
To get out of Jail you may:
-------------
To get out of Jail you may:
-------------
use a "Get Out Of Jail Free" card if you have one, or
-------------
PULING KNOWLEDGE TO MODEL: ...
PULL OF KNOWLEDGE DONE!
RESPONSE:
You can get out of jail by using a "Get Out Of Jail Free" card, if you have one.

