# Data Loader

In [1]:
import pandas as pd
# Load the CSV into a DataFrame purely to eyeball its columns; the RAG
# pipeline below re-reads the same file through LangChain's CSVLoader.
df=pd.read_csv("/content/social_ads.csv")
# Show the first ten rows as the cell output.
df.head(10)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0
5,27,58000,0
6,27,84000,0
7,32,150000,1
8,25,33000,0
9,35,65000,0


In [2]:
!pip install langchain langchain_experimental



In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [4]:
# Point LangChain's CSVLoader at the same file that was previewed with pandas.
loader = CSVLoader(file_path="/content/social_ads.csv")

# load() returns a list of Document objects; judging by the outputs further
# down, each CSV row becomes one Document made of "column: value" lines.
data = loader.load()

# Performing RAG

In [5]:
!pip install langchain_community
!pip install langchain



In [6]:
# Inspect how a single CSV row is rendered as text inside a Document.
print(data[0].page_content)

Age: 19
EstimatedSalary: 19000
Purchased: 0


In [7]:
# Pull the raw text out of every loaded Document, giving a plain list of strings
docs = list(map(lambda row_doc: row_doc.page_content, data))

In [8]:
# Sanity check: how many row texts were collected.
len(docs)

400

# Chunking the Dataset

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive character splitter configured with a chunk size of 1000 and an
# overlap of 100 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Split the loaded Documents and display the first resulting chunk.
final_documents = text_splitter.split_documents(data)
final_documents[0]

Document(page_content='Age: 19\nEstimatedSalary: 19000\nPurchased: 0', metadata={'source': '/content/social_ads.csv', 'row': 0})

# Creating Embeddings (any two of the given models can be used)

In [10]:
!pip install sentence-transformers



In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings


import torch  # imported here because this cell runs before the notebook's later torch import

# Embedding model used to vectorise each chunk.
# Alternative model: "intfloat/multilingual-e5-large-instruct"
model_name = "sentence-transformers/all-mpnet-base-v2"
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA instead of failing.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Embeddings are returned without normalisation.
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
import  numpy as np
# Embed the first chunk once and reuse the vector for both prints, so the
# embedding model is not run twice for the same text.
sample_embedding = np.array(hf.embed_query(final_documents[0].page_content))
print(sample_embedding)
print(sample_embedding.shape)

[ 2.44784970e-02  6.13148510e-02  1.03584782e-03  5.50826900e-02
 -2.79420521e-03  1.05631007e-02 -6.02074107e-03  6.08242750e-02
 -3.96785922e-02  2.23864820e-02 -4.02066074e-02  1.14196436e-02
  2.20116526e-02  7.68420175e-02 -3.66670899e-02  3.35766636e-02
  2.66746525e-02 -3.64874192e-02  5.53462394e-02 -3.10288239e-02
 -4.00896072e-02  5.06504588e-02 -5.39047681e-02  2.86887400e-02
 -2.21612155e-02  4.14957013e-03 -5.31985331e-03 -4.57253866e-02
 -1.30391726e-02  2.34159902e-02  2.88971886e-02 -1.28383832e-02
 -2.24524215e-02  1.76495779e-02  1.68600832e-06 -6.07749410e-02
  4.86377580e-03  8.90150294e-03 -3.50683331e-02  9.24707763e-03
  5.36268465e-02  9.73811653e-03  2.48925248e-03 -7.27044512e-03
 -3.58472653e-02 -4.39757928e-02  1.44400811e-02  5.73673435e-02
 -2.95620207e-02 -1.69384535e-02 -1.24622108e-02 -1.53567595e-03
  2.05020253e-02  4.35290206e-03  3.58732231e-02 -5.57376482e-02
 -3.34319174e-02  4.03650291e-02 -3.90722454e-02 -8.48989747e-03
 -7.67062325e-03  1.89236

# Vector Store

In [13]:
!pip install chromadb



In [14]:
from langchain_community.vectorstores import Chroma

# Embed every chunk with `hf` and index the resulting vectors in Chroma.
vectorstore = Chroma.from_documents(documents=final_documents, embedding=hf)

In [15]:
# Smoke-test the store: fetch the chunks most similar to a free-text question.
query = "what are the features in the csv?"
retireved_results=vectorstore.similarity_search(query)
# Show only the top hit.
# NOTE(review): the variable name is misspelled ("retireved"); it is left
# untouched here because other cells may reference it.
print(retireved_results[0])

page_content='Age: 42\nEstimatedSalary: 149000\nPurchased: 1' metadata={'row': 240, 'source': '/content/social_ads.csv'}


In [16]:
# Wrap the vector store as a similarity retriever that returns 3 chunks per
# query; the QA chain further down uses it to fetch its context.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriever)

tags=['Chroma', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x78444fdcb190> search_kwargs={'k': 3}


# Using an LLM for the Query

In [18]:
# Disabled upgrade of the Hub client, kept for reference:
#!pip install --upgrade huggingface_hub

# Interactive login: prompts for a Hugging Face access token and stores it
# locally (see the output below). Presumably needed so the Mistral weights can
# be downloaded later — TODO confirm the model requires authentication.
# NOTE(review): the prompt blocks "Run All" until a token is typed in.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [19]:
!pip install -q -U langchain transformers bitsandbytes accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
!pip uninstall transformers
!pip install transformers

Found existing installation: transformers 4.41.0
Uninstalling transformers-4.41.0:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.41.0.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.41.0
Collecting transformers
  Using cached transformers-4.41.0-py3-none-any.whl (9.1 MB)
Installing collected packages: transformers
Successfully installed transformers-4.41.0


In [17]:
import torch
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [18]:
# 4-bit quantisation settings passed to the model loader in the next cell.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",             # NF4 quantisation data type
    bnb_4bit_use_double_quant=True,        # enable nested (double) quantisation
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

In [19]:
# Load the Mistral instruct checkpoint with the quantisation settings defined
# in the previous cell; device placement is left to device_map="auto".
model_4bit = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    quantization_config=quantization_config,
    device_map="auto",
)

# Tokenizer of the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [20]:
# Text-generation pipeline around the quantised model and its tokenizer.
# NOTE(review): passing `max_length` is what triggers the truncation warning
# shown when the LLM is first invoked below.
pipeline_inst = pipeline(
    task="text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    # Generation settings: sampling enabled, one sequence per prompt.
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    max_length=2500,
    use_cache=True,
    # The tokenizer's EOS token doubles as the padding token.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [21]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be passed as the `llm` of a chain.
llm = HuggingFacePipeline(pipeline=pipeline_inst)

  warn_deprecated(


In [22]:
# Baseline: ask the question without any retrieved context. The output below
# shows the model answering with column names that are not in the CSV.
llm.invoke(query)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


'what are the features in the csv? How are the columns named?\n\nComment: The csv is a comma delimited list of 11 values. The columns are named "Name","Age","Occupation","Gender","Salary","Country","State","City","Zip","Phone","Email".\n\nComment: @user2004273 - I see. Then the answer below should work fine.\n\n## Answer (3)\n\nYou can do this with the following:\n\n```\nimport pandas as pd\n\nwith open(\'file.csv\') as csv:\n    data = pd.read_csv(csv)\n\n# get the first row\nrow = data.head()\n\n# print the Name column\nprint(row["Name"])\n\n# print the Age column\nprint(row["Age"])\n\n# print the Salary column\nprint(row["Salary"])\n```\n\n## Answer (1)\n\nYou can read the CSV file and store it in a pandas dataframe. Here is a sample code:\n\n```\nimport pandas as pd\n\n#read the CSV file\ndata = pd.read_csv(\'file.csv\')\n\n#print the name of first row\nprint(data.iloc[0][\'Name\'] )\n\n#print the age of first row\nprint(data.iloc[0][\'Age\'] )\n\n#print the salary of first row\npr

# Output

In [40]:
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub
from langchain_core.prompts import PromptTemplate

# Prompt skeleton: {context} receives the retrieved rows and {question} the
# user's question before the text is sent to the LLM.
prompt_template = """
Use the following piece of context extracted from a CSV file to answer the question asked.
Please try to provide appropriate answer only based on the context and make sure they are correct.

CSV Context:
{context}

Question:
{question}

Helpful Answer:
"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# "stuff" chain: the retrieved chunks are placed together into the prompt's
# {context} slot (visible in the printed output). The source documents are
# returned alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

# Run the chain on the first question and print the generated text.
query1 = "Describe the csv data?"
result = retrievalQA.invoke({"query": query1})
print(result["result"])


Use the following piece of context extracted from a CSV file to answer the question asked.
Please try to provide appropriate answer only based on the context and make sure they are correct.

CSV Context:
Age: 42
EstimatedSalary: 149000
Purchased: 1

Age: 40
EstimatedSalary: 71000
Purchased: 1

Age: 39
EstimatedSalary: 71000
Purchased: 0

Question:
Describe the csv data?

Helpful Answer:
The provided csv data includes information about individuals' ages, their estimated salaries, and whether or not they have purchased something. Specifically, the data set contains three rows of information about three different people. The first row indicates that the individual is 42 years old and has an estimated salary of 149,000. This person has made a purchase, indicated by the value of 1 in the 'Purchased' column. The second row shows that the individual is 40 years old, has an estimated salary of 71,000, and has made a purchase, as indicated by the value of 1 in the 'Purchased' column. The third

In [42]:
# Reuse the `retrievalQA` chain built in the previous cell rather than
# re-importing and rebuilding an identical prompt and chain; only the question
# changes here.
#
# NOTE: the retriever hands just the 3 most similar rows to the LLM, so the
# "mean" in the answer is the mean of those 3 rows, not of the whole CSV.
# For a dataset-wide statistic use pandas, e.g. df["EstimatedSalary"].mean().
query1 = "calculate the mean of estimatedsalary?"
result = retrievalQA.invoke({"query": query1})
print(result['result'])


Use the following piece of context extracted from a CSV file to answer the question asked.
Please try to provide appropriate answer only based on the context and make sure they are correct.

CSV Context:
Age: 20
EstimatedSalary: 82000
Purchased: 0

Age: 20
EstimatedSalary: 82000
Purchased: 0

Age: 19
EstimatedSalary: 19000
Purchased: 0

Question:
calculate the mean of estimatedsalary?

Helpful Answer:
The mean of estimatedsalary for the given context is calculated by summing up all the values of estimatedsalary (82000 + 82000 + 19000 = 183000) and then dividing this value by the number of data points available (which is 3). Therefore, the mean of estimatedsalary is 183000/3 = 61000.
