In [2]:
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [3]:
def load_and_process_data(file_path, chunk_size=125, chunk_overlap=25):
    """Load records from a JSON file and split them into overlapping chunks.

    The file is read with ``JSONLoader`` using the jq schema ``.[]`` and the
    ``text`` key as document content. Each record's ``title`` (or ``""`` when
    the key is absent) is copied into the document metadata so it can be
    reported alongside retrieved chunks.

    Args:
        file_path: Path to the JSON dataset.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 125, the value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 25, the previously hard-coded value.

    Returns:
        The chunked documents returned by ``split_documents``.

    Note:
        With the default size, chunks can end mid-sentence; pass a larger
        ``chunk_size`` to keep more context in each chunk.
    """
    def metadata_func(record: dict, metadata: dict) -> dict:
        # Carry the record title into the metadata; default to "" if missing.
        metadata["title"] = record.get("title", "")
        return metadata

    loader = JSONLoader(
        file_path=file_path,
        jq_schema='.[]',
        content_key='text',
        metadata_func=metadata_func
    )
    documents = loader.load()

    # Split documents into chunks using the caller-supplied sizing.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

In [4]:
# Chunk the bundled dataset; the path is relative to the notebook's working directory.
texts = load_and_process_data(file_path='dataset/dataset.json')

In [5]:
def create_vector_store(texts, model_name="sentence-transformers/all-mpnet-base-v2"):
    """Embed document chunks and index them in a FAISS vector store.

    Args:
        texts: Non-empty sequence of document chunks to index.
        model_name: Name of the embedding model passed to
            ``HuggingFaceEmbeddings``. Defaults to the model that used to be
            hard-coded here.

    Returns:
        The vector store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    # Fail early with a clear message, before the embedding model is loaded,
    # instead of surfacing an error from inside the index construction.
    if not texts:
        raise ValueError("texts must contain at least one document to index")

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(texts, embeddings)


In [6]:
# Embed every chunk and build the similarity index used by the cells below.
db = create_vector_store(texts=texts)

  embeddings = HuggingFaceEmbeddings(


In [7]:
# Check retrieval on its own before wiring up the LLM:
# ask the index for only the single closest chunk.
query = "What is mendel personal background?"
results = db.similarity_search(query=query, k=1)
results

[Document(id='637eecf3-02ba-481b-8efa-d6025d5d8f72', metadata={'source': '/home/mendel/Desktop/Mendel/ArtificialIntelligence/RagApplication/dataset/dataset.json', 'seq_num': 1, 'title': 'Personal Background'}, page_content='Mendel, whose full name is Chewang Gyalpu Bhutia, is an AI enthusiast and full-stack software developer from India. He is')]

In [8]:
# One checkpoint name is used for both the seq2seq model and its tokenizer.
# `model` and `tokenizer` are read as globals by initialize_qa_chain.
model_name = "google/flan-t5-base"  # or "google/flan-t5-small" for lighter version
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def initialize_qa_chain(db, k=9, max_length=384):
    """Build a ``RetrievalQA`` chain on top of a vector store.

    The chain wraps a ``text2text-generation`` pipeline created from the
    notebook-level ``model`` and ``tokenizer`` globals, so the cell that
    loads them must have been run before this function is called.

    Args:
        db: Vector store exposing ``as_retriever``.
        k: Number of chunks the retriever is asked to fetch for each query.
            Defaults to 9, the value that used to be hard-coded here.
        max_length: ``max_length`` passed to the generation pipeline.
            Defaults to 384, the previously hard-coded value.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` that also returns the
        source documents it retrieved.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=max_length,
        # NOTE(review): sampling is not enabled here, so this temperature
        # presumably has no effect on decoding; confirm before tuning it.
        temperature=1.0
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    retriever = db.as_retriever(search_kwargs={"k": k})
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )

In [10]:
# Wire the retriever and the local seq2seq model into a single QA chain.
qa_chain = initialize_qa_chain(db=db)

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)


In [11]:
# Run one open-ended question through the full retrieval + generation chain.
query = "Tell me something about mendel"
chain_input = {"query": query}
result = qa_chain.invoke(chain_input)

# Show the question, the generated answer, and the title of every retrieved chunk.
print("\nQuestion:", query)
answer = result["result"]
print("Answer:", answer)
print("\nSources:")
for doc in result["source_documents"]:
    title = doc.metadata['title']
    print(f"- {title}")


Question: Tell me something about mendel
Answer: Mendel, whose full name is Chewang Gyalpu Bhutia, is an AI enthusiast and full-stack software developer from India. He is Beyond technology, Mendel enjoys reading the Bhagavad Gita, studying philosophical texts, and exploring topics like personal Mendel is actively learning about quantitative finance and aims to become a quant trading developer. He has created Mendel is currently pursuing his Master of Computer Applications (MCA) from Indira Gandhi National Open University (IGNOU), He is proficient in programming languages like Python, JavaScript, and Java. Mendel works fluently with full-stack In addition to coding, Mendel has excellent communication and technical writing skills. He has prepared complete Mendel has gained valuable industry experience through internships and freelance roles. At Ensemble Control Inc., he worked Mendel’s long-term goal is to build cutting-edge, AI-powered systems that solve real-world financial and analyt