In [4]:
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_ollama import ChatOllama,OllamaEmbeddings
from langchain.vectorstores.faiss import FAISS
from IPython.display import display,Markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch

In [None]:
# Read the Excel workbook into LangChain Document objects.
loader = UnstructuredExcelLoader(file_path="updated_sample_file.xlsx")
data = loader.load()

# Preview at most the first five loaded documents as the cell output.
data[:5]

[Document(metadata={'source': 'updated_sample_file.xlsx'}, page_content='first_name last_name Gender Country Age Date Id 1.0 Dulce Abril Female United States 32 2017-10-15 00:00:00 1562 2.0 Mara Hashimoto Female Great Britain 25 2016-08-16 00:00:00 1582 3.0 Philip Gent Male France 36 2015-05-21 00:00:00 2587 4.0 Kathleen Hanner Female United States 25 2017-10-15 00:00:00 3549 5.0 Nereida Magwood Female United States 58 2016-08-16 00:00:00 2468 6.0 Gaston Brumm Male United States 24 2015-05-21 00:00:00 2554 7.0 Etta Hurn Female Great Britain 56 2017-10-15 00:00:00 3598 8.0 Earlean Melgar Female United States 27 2016-08-16 00:00:00 2456 9.0 Vincenza Weiland Female United States 40 2015-05-21 00:00:00 6548 10.0 Fallon Winward Female Great Britain 28 2016-08-16 00:00:00 5486 11.0 Arcelia Bouska Female Great Britain 39 2015-05-21 00:00:00 1258 12.0 Franklyn Unknow Male France 38 2017-10-15 00:00:00 2579 13.0 Sherron Ascencio Female Great Britain 32 2016-08-16 00:00:00 3256 14.0 Marcel Zabri

In [5]:
# Configure the splitter: chunks of at most 10000 units with an overlap of 100
# between neighbouring chunks (units are whatever the splitter's length
# function counts; presumably characters by default).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=100)

# Break the loaded documents into chunks for embedding.
chunks = text_splitter.split_documents(data)

# Show the first chunk as the cell output.
chunks[0]

Document(metadata={'source': 'updated_sample_file.xlsx'}, page_content='first_name last_name Gender Country Age Date Id 1.0 Dulce Abril Female United States 32 2017-10-15 00:00:00 1562 2.0 Mara Hashimoto Female Great Britain 25 2016-08-16 00:00:00 1582 3.0 Philip Gent Male France 36 2015-05-21 00:00:00 2587 4.0 Kathleen Hanner Female United States 25 2017-10-15 00:00:00 3549 5.0 Nereida Magwood Female United States 58 2016-08-16 00:00:00 2468 6.0 Gaston Brumm Male United States 24 2015-05-21 00:00:00 2554 7.0 Etta Hurn Female Great Britain 56 2017-10-15 00:00:00 3598 8.0 Earlean Melgar Female United States 27 2016-08-16 00:00:00 2456 9.0 Vincenza Weiland Female United States 40 2015-05-21 00:00:00 6548 10.0 Fallon Winward Female Great Britain 28 2016-08-16 00:00:00 5486 11.0 Arcelia Bouska Female Great Britain 39 2015-05-21 00:00:00 1258 12.0 Franklyn Unknow Male France 38 2017-10-15 00:00:00 2579 13.0 Sherron Ascencio Female Great Britain 32 2016-08-16 00:00:00 3256 14.0 Marcel Zabris

In [None]:
# Create embeddings with Ollama models.
embeddings = OllamaEmbeddings(
    model="llama3.2",
    num_gpu=1,
)

# Embed every chunk and build a FAISS vector store from the results.
db_faiss = FAISS.from_documents(chunks, embeddings)


In [None]:
# Basic retrieval: look up the 3 stored chunks most similar to the question.
query = "What is Kathleen Hanner's gender?"
output_retrieval = db_faiss.similarity_search(query, k=3)

# Merge the retrieved chunk texts, one per line, into a single context string.
output_retrieval_merged = '\n'.join(doc.page_content for doc in output_retrieval)

# Render the merged context as Markdown in the cell output.
display(Markdown(output_retrieval_merged))

In [None]:
# Custom prompt for the RAG system: the retrieved context and the user's
# question are interpolated into one instruction string. The adjacent string
# literals below are concatenated into a single prompt.
prompt = (
    f"Based on the context {output_retrieval_merged}, "
    f"Answer the following question: {query}."
    "If you do not possess the information on the answer, "
    "respond with I dont know."
)

In [6]:
# Set up the Ollama chat model; temperature=0 is passed to the model.
ollama_llm = ChatOllama(
    model="llama3.2",
    num_gpu=1,
    temperature=0,
)

# The actual call is left disabled, so this cell only constructs the client.
# response = ollama_llm.invoke(prompt)