### Package Installation

In [1]:
!pip install faiss-cpu langchain langchain-community langchain-google-genai pandas python-dotenv



In [23]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from pathlib import Path
from langchain.chat_models import init_chat_model
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from google.colab import userdata

# Copy the key stored under the Colab secret name GEMINI_API_KEY into the GOOGLE_API_KEY
# environment variable (presumably the one the google_genai provider reads -- confirm).
os.environ["GOOGLE_API_KEY"] = userdata.get("GEMINI_API_KEY")

# Chat model used later to build the question-answering chain.
llm = init_chat_model(model="gemini-2.5-flash-lite", model_provider="google_genai")

### Download the CSV File

In [24]:
os.makedirs('data', exist_ok=True)
!wget -O data/customers-100.csv https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/customers-100.csv

--2025-09-11 17:41:37--  https://raw.githubusercontent.com/NirDiamant/RAG_TECHNIQUES/main/data/customers-100.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17160 (17K) [text/plain]
Saving to: ‘data/customers-100.csv’


2025-09-11 17:41:37 (10.5 MB/s) - ‘data/customers-100.csv’ saved [17160/17160]



In [25]:
import pandas as pd

# Path of the CSV file; the loader cell further down reuses this variable.
file_path = 'data/customers-100.csv'
data = pd.read_csv(file_path)

# Display the first rows as a quick look at the columns and values.
data.head()

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


### Load and Process CSV Data

In [26]:
# Read the CSV file into LangChain Document objects.
# CSVLoader produces one Document per CSV row. `load()` is used instead of
# `load_and_split()`: the latter additionally runs a default text splitter over the
# loaded rows (so a long row could be cut into several chunks) and is marked as
# deprecated in the LangChain loader base class.
# `docs` is therefore a list with one Document per row, used in the following cells
# to create embeddings and fill the vector store.
loader = CSVLoader(file_path=file_path)
docs = loader.load()

### Initiate the Vector Store

In [27]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding model handed to the vector store as its embedding function.
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Embed a throwaway string once; the length of the returned vector is the
# dimension the flat L2 index is created with.
probe_vector = embeddings.embed_query(" ")
index = faiss.IndexFlatL2(len(probe_vector))

# Start from an empty store: a fresh in-memory docstore and an empty
# index-position -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=dict(),
)

### Add the split CSV data to the vector store

In [28]:
# Add every loaded document to the vector store. The call returns the list of IDs
# assigned to the stored documents, which is what this cell displays as its output.
vector_store.add_documents(docs)

['f2bde449-83f5-4b05-a53c-f1d82325180d',
 'a4d9e127-a452-4662-b483-eea1bea49298',
 '147f4de1-6233-4844-9a1a-30c4eb862741',
 '8b25939d-d874-4204-990a-780712ff278b',
 'ab995cd1-71b2-4eba-b999-9889b32afe15',
 '7f698904-55e4-4191-840b-5e2decd08a73',
 '381af6c6-d400-466c-a889-bd7b859b9ed4',
 '3557359e-528e-4a1c-90a3-c681d3df1c92',
 '0108f141-a1db-495d-9397-2ea590f5ee23',
 '6f351d02-87cf-4cdb-bb92-1ad9b3df784a',
 '42228f07-e2c4-4021-9490-7a7f6febd27d',
 'ff6c3642-18f2-492d-8786-22df1c68a84b',
 'dbdbabf1-020b-4412-b38c-b04cee41fb71',
 'a9a4e496-d489-42f1-8c7d-517311a36932',
 'c296a00a-8047-4a7f-823e-306777b15207',
 '8c5f4102-f8f3-4e52-9edc-70981e15d0ba',
 '2af2d8fa-8d91-42f9-bfc1-09c6ea9aae3f',
 '560acee8-a2ac-42e5-b72b-46d25abc966f',
 '09de7572-cf3a-4a10-846e-aa3aa9007a49',
 '918db85a-3d1b-44a0-adca-4986a0cce583',
 'b5511ea4-2c91-4f58-bf7c-e8781255f41b',
 '3b7d0956-e467-4311-8c32-3db4fccac459',
 'effbfe2c-4e8d-41b9-9d9a-d82cbc286cf6',
 'a2b85320-0912-4547-9423-e1ea5b923c38',
 '3fcb4ac8-9ed0-

### Create the retrieval chain

In [29]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Expose the vector store through the retriever interface (default settings).
retriever = vector_store.as_retriever()

# System message: answering instructions, a blank line, then the `{context}`
# placeholder that the documents chain fills with the retrieved documents.
system_prompt = "".join([
    "You are an assistant for question-answering tasks. ",
    "Use the following pieces of retrieved context to answer ",
    "the question. If you don't know the answer, say that you ",
    "don't know. Use three sentences maximum and keep the ",
    "answer concise.",
    "\n\n",
    "{context}",
])

# Two-message chat prompt: the system instructions plus the user's question (`{input}`).
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# First build the chain that answers from supplied documents, then put the
# retriever in front of it so documents are fetched for each question.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

### Query the RAG bot with a question based on the CSV data

In [30]:
# Ask a question about a customer from the CSV and display only the answer text
# from the chain's result dictionary.
question = "which company does sheryl Baxter work for?"
answer = rag_chain.invoke({"input": question})
answer["answer"]

'Sheryl Baxter works for Rasmussen Group.'