<a href="https://colab.research.google.com/github/AdityaIITDelhi/PIAI22JUN3001.github.io/blob/main/Chaabi_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **BigBasket Product Query Engine  Notebook 1**
**It contains**
1. smooth data transformation
2. extracted important meta data,  encoding done using sentence encoder and stored in dict object along with meta data  
3. returns space.pkl which will use to store context in vector db(qdrant) file containg vector representation of context and important meta data

In [4]:
!pip install -q qdrant-client
!pip install -q sentence-transformers
!pip install -q torch
!pip install -q requests  # Adding requests library for URL fetching

import torch
import pandas as pd
import sys
import json
import warnings
import requests  # Importing requests library for fetching data
from io import StringIO
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm.auto import tqdm
from typing import List

# URL for the CSV data
CSV_URL = 'https://chaabiv2.s3.ap-south-1.amazonaws.com/hiring/bigBasketProducts.csv'

# Fetching data from the URL
response = requests.get(CSV_URL)

if response.status_code == 200:
    # Read data into a pandas DataFrame
    csv_data = StringIO(response.text)
    df = pd.read_csv(csv_data).drop(columns=["index"])
    print("Data fetched successfully.")
    print(df.head())  # Printing the first few rows of the DataFrame to verify the data

    # Checking for GPU availability for torch
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        retriever = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)

Data fetched successfully.
                                             product                category  \
0             Garlic Oil - Vegetarian Capsule 500 mg        Beauty & Hygiene   
1                              Water Bottle - Orange  Kitchen, Garden & Pets   
2                     Brass Angle Deep - Plain, No.2    Cleaning & Household   
3  Cereal Flip Lid Container/Storage Jar - Assort...    Cleaning & Household   
4                 Creme Soft Soap - For Hands & Body        Beauty & Hygiene   

            sub_category              brand  sale_price  market_price  \
0              Hair Care  Sri Sri Ayurveda        220.0         220.0   
1  Storage & Accessories         Mastercook       180.0         180.0   
2            Pooja Needs                Trm       119.0         250.0   
3   Bins & Bathroom Ware             Nakoda       149.0         176.0   
4       Bath & Hand Wash              Nivea       162.0         162.0   

                       type  rating  \
0          Hai

.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
class make_embedding_ds(torch.utils.data.IterableDataset):
    def __init__(self, csv = df):
        """
        will compute embedding using sentence encoder

        """
        super(make_embedding_ds).__init__()
        self.csv = csv
        self.total_row = csv.shape[0]
        self.col = csv.columns.to_list()

    def __iter__(self):
        for row_no in range(self.total_row):

            #testing: make comment it later



            row = self.csv.iloc[row_no].to_dict()
            product_name = row[self.col[0]]
            story = f"{row[ self.col[0] ]} is of category {row[self.col[1]]} and sub category is {row[self.col[2]]}. {row[self.col[0]]} is type {row[self.col[6]]}. brand of {row[self.col[0]]} is {row[self.col[3]]}, with rating {row[self.col[7]]}. sale price of {row[self.col[0]]} is {row[self.col[4]]} with market price {row[self.col[5]]}, description of {row[self.col[0]]} is {row[self.col[8]]}"
            emb = retriever.encode(story)


            yield row_no,product_name,story,emb


ds = make_embedding_ds(df)


dataloader = torch.utils.data.DataLoader(ds, num_workers=1,batch_size=128)


In [6]:
json_emb = {
    "payload":[],
    "emb": []
}
batch_no = 1
for row,product_name,story,emb in dataloader:

    _batch_len = len(row)



    print(f"batch no-{batch_no} competed")
    batch_no += 1

    for index in range(_batch_len):

        temp = {}
        temp["id"] = row[index].item()
        temp["product"] = product_name[index]
        temp["story"] = story[index]

        json_emb["payload"].append(temp)
        json_emb["emb"].append(emb[index].numpy().tolist())




#     json_emb["payload"].append(batch[0])
#     json_emb["emb"].append(batch[1][0])




batch no-1 competed
batch no-2 competed
batch no-3 competed
batch no-4 competed
batch no-5 competed
batch no-6 competed
batch no-7 competed
batch no-8 competed
batch no-9 competed
batch no-10 competed
batch no-11 competed
batch no-12 competed
batch no-13 competed
batch no-14 competed
batch no-15 competed
batch no-16 competed
batch no-17 competed
batch no-18 competed
batch no-19 competed
batch no-20 competed
batch no-21 competed
batch no-22 competed
batch no-23 competed
batch no-24 competed
batch no-25 competed
batch no-26 competed
batch no-27 competed
batch no-28 competed
batch no-29 competed
batch no-30 competed
batch no-31 competed
batch no-32 competed
batch no-33 competed
batch no-34 competed
batch no-35 competed
batch no-36 competed
batch no-37 competed
batch no-38 competed
batch no-39 competed
batch no-40 competed
batch no-41 competed
batch no-42 competed
batch no-43 competed
batch no-44 competed
batch no-45 competed
batch no-46 competed
batch no-47 competed
batch no-48 competed
b

In [7]:
sys.getsizeof(json_emb)/1024

0.2265625

In [8]:
import pickle

# Deserialize the data from the pickle file
with open('space.pkl', 'wb') as file:
    pickle.dump(json_emb, file)

In [15]:
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data).drop(columns=["index"])
print("Data fetched successfully.")
print(df.head())

VECTOR_SPACE_PATH = "/content/space.pkl"
with open(VECTOR_SPACE_PATH, 'rb') as file:
  vs = pickle.load(file)

client = QdrantClient(":memory:")
collection_name = "qdrant-space"


total_records = len(vs["payload"])  # Total records data
_payload = vs["payload"]
_emb = vs["emb"]
ids = list(range(0, total_records))

batch_size = 2  # Specify batch size according to your RAM and compute, higher batch size = more RAM usage
client.recreate_collection(
    collection_name="qdrant-space",
    vectors_config=models.VectorParams(
        size=384,
        distance=models.Distance.COSINE,
    ),
)

client.upsert(
    collection_name=collection_name,
    points=models.Batch(ids=ids, vectors=_emb, payloads=_payload),
)

collection_vector_count = client.get_collection(collection_name=collection_name).vectors_count
print(f"Vector count in collection: {collection_vector_count}")

# Dump Qdrant space
with open('qdrant_space_client.pkl', 'wb') as file:
  pickle.dump(client, file)

if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  retriever = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)

    # Dump Sentence Transformer model
with open('encodermodel.pkl', 'wb') as file:
  pickle.dump(retriever, file)

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

    # Load the reader model into a question-answering pipeline
reader = pipeline("question-answering", model=model_name, tokenizer=model_name)

    # Dump BERT model
with open('bert-question-answering.pkl', 'wb') as file:
  pickle.dump(reader, file)


Data fetched successfully.
                                             product                category  \
0             Garlic Oil - Vegetarian Capsule 500 mg        Beauty & Hygiene   
1                              Water Bottle - Orange  Kitchen, Garden & Pets   
2                     Brass Angle Deep - Plain, No.2    Cleaning & Household   
3  Cereal Flip Lid Container/Storage Jar - Assort...    Cleaning & Household   
4                 Creme Soft Soap - For Hands & Body        Beauty & Hygiene   

            sub_category              brand  sale_price  market_price  \
0              Hair Care  Sri Sri Ayurveda        220.0         220.0   
1  Storage & Accessories         Mastercook       180.0         180.0   
2            Pooja Needs                Trm       119.0         250.0   
3   Bins & Bathroom Ware             Nakoda       149.0         176.0   
4       Bath & Hand Wash              Nivea       162.0         162.0   

                       type  rating  \
0          Hai

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
with open('qdrant_space_client.pkl', 'wb') as file:
        pickle.dump(client, file)

if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  retriever = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1", device=device)

    # Dump Sentence Transformer model
with open('encodermodel.pkl', 'wb') as file:
  pickle.dump(retriever, file)

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

    # Load the reader model into a question-answering pipeline
reader = pipeline("question-answering", model=model_name, tokenizer=model_name)

    # Dump BERT model
with open('bert-question-answering.pkl', 'wb') as file:
  pickle.dump(reader, file)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
!pip install -q qdrant-client
!pip install -q sentence-transformers

import pickle
from sentence_transformers import SentenceTransformer

# Replace these paths with your actual file paths
BERT_MODEL_PATH = "/content/bert-question-answering.pkl"
ENC_PATH = "/content/encodermodel.pkl"
QDRANT_PATH = "/content/qdrant_space_client.pkl"

# Load the saved models and Qdrant client
with open(BERT_MODEL_PATH, 'rb') as file:
    bert = pickle.load(file)

with open(QDRANT_PATH, 'rb') as file:
    qdrant_client = pickle.load(file)

with open(ENC_PATH, 'rb') as file:
    st_encoder = pickle.load(file)

collection_name = "qdrant-space"

def find_close_contexts(question: str, top_k: int) -> list:
    """
    Returns contexts close to the query.

    Args:
        question (str): Query string.
        top_k (int): Number of top results to fetch.

    Returns:
        List: Contexts close to the query.
    """
    try:
        encoded_query = st_encoder.encode(question).tolist()
        result = qdrant_client.search(
            collection_name=collection_name,
            query_vector=encoded_query,
            limit=top_k,
        )

        context = [
            [context.payload["product"], context.payload["story"]] for context in result
        ]
        return context
    except Exception as e:
        print({e})

def tell_me(question: str, context: list):
    """
    Extracts the answer from the context for a given question.

    Args:
        question (str): Question string.
        context (list): List of context strings.
    """
    results = []
    for c in context:
        answer = bert(question=question, context=c[1])
        answer["product"] = c[0]
        results.append(answer)
        print()

    sorted_result = sorted(results, key=lambda x: x["score"], reverse=True)
    for i in range(len(sorted_result)):
        _out = sorted_result[i]["answer"]
        _prod = sorted_result[i]["product"]
        _sco = sorted_result[i]["score"]
        print(f"QUERY INPUT: {question}")
        print(f"OUTPUT: {_out} \nPREDICTION SCORE {_sco}\n\nReferred Product: {_prod}\n\n")
        return question, _out, _sco, _prod

queries = [
    "What are the recommended products for vegetable cleaning?",
    "How is the rating of the product Vegetable & Fruit Wash with 100% Natural Action?",
    "Which beauty product is most popular?",
    "What is the price of Dove soap?",
    "Which beauty product is most favored?",
    "Suggest one tea product."
]

with open("result.txt", "w") as result:
    for q in queries:
        c = find_close_contexts(q, top_k=1)
        _ques, _out, _sco, _prod = tell_me(q, c)
        result.write(
            f"QUERY INPUT: {_ques}\nOUTPUT: {_out} \nPREDICTION SCORE {_sco}\n\nReferred Product: {_prod}\n\n"
        )



QUERY INPUT: What are the recommended products for vegetable cleaning?
OUTPUT: 100% Natural Action and Neem & Citrus Fruit extracts 
PREDICTION SCORE 0.003462602384388447

Referred Product: Vegetable & Fruit Wash Spray with 100% Natural Action



QUERY INPUT: How is the rating of the product Vegetable & Fruit Wash with 100% Natural Action?
OUTPUT: 4.3 
PREDICTION SCORE 0.9656943678855896

Referred Product: Vegetable & Fruit Wash with 100% Natural Action



QUERY INPUT: Which beauty product is most popular?
OUTPUT: Brightening Beauty Pack 
PREDICTION SCORE 0.23984676599502563

Referred Product: Brightening Beauty Pack



QUERY INPUT: What is the price of Dove soap?
OUTPUT: 699.0 
PREDICTION SCORE 0.6672144532203674

Referred Product: Lavender Soap Spa Set



QUERY INPUT: Which beauty product is most favored?
OUTPUT: Deodorant Body Spray 
PREDICTION SCORE 0.14602375030517578

Referred Product: Deodorant Body Spray - Be Delicious Woman EDT



QUERY INPUT: Suggest one tea product.
OUTPUT: