In [1]:
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [21]:
def load_and_process_data(file_path, chunk_size=125, chunk_overlap=25):
    """Load records from a JSON file and split them into overlapping chunks.

    Every item yielded by the jq expression ``.[]`` becomes one document.
    The document body is read from the record's ``text`` key, and the
    record's ``title`` is copied into the document metadata (an empty
    string is stored when the record has no ``title``).

    Args:
        file_path: Path of the JSON file handed to ``JSONLoader``.
        chunk_size: Upper bound on the size of each chunk, as measured by
            the splitter's length function. Defaults to 125, the value that
            used to be hard-coded here.
        chunk_overlap: Amount of content shared between consecutive chunks.
            Defaults to 25, the value that used to be hard-coded here.

    Returns:
        The chunked documents returned by ``split_documents``.
    """

    def metadata_func(record: dict, metadata: dict) -> dict:
        # Carry the title through so answers can cite their source later.
        metadata["title"] = record.get("title", "")
        return metadata

    loader = JSONLoader(
        file_path=file_path,
        jq_schema='.[]',
        content_key='text',
        metadata_func=metadata_func,
    )
    documents = loader.load()

    # Split documents into chunks; sizes are tunable by the caller so
    # retrieval granularity can be adjusted without editing this function.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [22]:
# Load the dataset (path is relative to the notebook's working directory)
# and keep the resulting chunks for indexing.
texts = load_and_process_data(file_path='dataset/dataset.json')

In [23]:
def create_vector_store(texts):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        texts: Documents to embed, passed to ``FAISS.from_documents``.

    Returns:
        The FAISS store built from ``texts`` using the
        ``sentence-transformers/all-mpnet-base-v2`` embedding model.
    """
    encoder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return FAISS.from_documents(texts, encoder)


In [24]:
# Build the vector index once; later cells reuse `db` for retrieval.
db = create_vector_store(texts=texts)

In [25]:
# Retrieval sanity check: ask the vector store for the single best match
# (k=1) to a sample question before wiring up the LLM.
# NOTE(review): `results` is neither displayed nor used in the visible cells.
query = "What is mendel personal background?"
results = db.similarity_search(query, k=1)

In [26]:
import getpass
import os

import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI

# SECURITY: an API key used to be hard-coded in this cell. Any key that was
# ever committed or shared in this notebook must be treated as leaked and
# revoked in the Google console.
#
# The key is now taken from the GOOGLE_API_KEY environment variable. Export it
# before starting Jupyter to avoid the prompt; otherwise it is requested
# interactively (input is not echoed and is never stored in the notebook).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [30]:
def initialize_qa_chain(
    db,
    model="gemini-2.0-flash",
    temperature=0.5,
    max_output_tokens=320,
    k=9,
):
    """Build a retrieval-augmented QA chain on top of a vector store.

    Args:
        db: Vector store whose ``as_retriever`` supplies context documents.
        model: Name of the chat model passed to ``ChatGoogleGenerativeAI``.
        temperature: Sampling temperature passed to the chat model.
        max_output_tokens: Cap on the length of the generated answer.
            Raise it if answers come back cut off.
        k: Number of documents the retriever fetches per query.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` configured with
        ``return_source_documents=True``, so each result also carries the
        retrieved documents under ``"source_documents"``.

    All keyword defaults match the values that were previously hard-coded,
    so ``initialize_qa_chain(db)`` behaves as before.
    """
    llm = ChatGoogleGenerativeAI(
        model=model,
        temperature=temperature,
        max_output_tokens=max_output_tokens,
    )

    retriever = db.as_retriever(search_kwargs={"k": k})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )


In [31]:
# Assemble the QA chain over the index built above.
qa_chain = initialize_qa_chain(db=db)

In [32]:
# Ask one question end to end and report the answer with its sources.
query = "Tell me something about mendel background"
result = qa_chain.invoke({"query": query})

print("\nQuestion:", query)
print("Answer:", result["result"])

# One line per retrieved chunk; titles may repeat when several chunks
# come from the same record.
print("\nSources:")
for source_doc in result["source_documents"]:
    title = source_doc.metadata['title']
    print(f"- {title}")


Question: Tell me something about mendel background
Answer: Mendel, whose full name is Chewang Gyalpu Bhutia, is an AI enthusiast and full-stack software developer from India. He is currently pursuing his Master of Computer Applications (MCA) from Indira Gandhi National Open University (IGNOU). He is proficient in programming languages like Python, JavaScript, and Java. Beyond technology, Mendel enjoys reading the Bhagavad Gita, studying philosophical texts, and exploring topics like personal. He has gained valuable industry experience through internships and freelance roles, including a role at Ensemble Control Inc. One of Mendel’s signature projects is a stock price prediction system using yfinance data and Prophet for forecasting. Mendel’s long-term goal is to build cutting-edge, AI-powered systems that solve real-world financial and analytical problems, and he is actively learning about quantitative finance and aims to become a quant trading developer. In addition to coding, Mende