## Dowaload the dataset from kaggle

In [3]:
!pip install kaggle



In [4]:
from google.colab import files

uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d surajjha101/bigbasket-entire-product-list-28k-datapoints

Downloading bigbasket-entire-product-list-28k-datapoints.zip to /content
  0% 0.00/6.04M [00:00<?, ?B/s]
100% 6.04M/6.04M [00:00<00:00, 70.8MB/s]


In [7]:
!unzip -q /content/bigbasket-entire-product-list-28k-datapoints.zip -d /content/

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing

## Find out what is our data

In [2]:
df = pd.read_csv('/content/BigBasket Products.csv')

In [3]:
df.shape

(27555, 10)

In [4]:
df.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

In [5]:
df.head(5)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


## Handle missing values

In [6]:
df.isnull().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [7]:
# Calculate the average rating for each product
average_ratings = df.groupby('product')['rating'].mean()

# Fill missing values with the average rating for each product
df['rating'] = df.apply(lambda row: average_ratings[row['product']] if pd.isnull(row['rating']) else row['rating'], axis=1)

# Fill remaining missing values with 2
df['rating'].fillna(2, inplace=True)

In [8]:
df['rating'].isnull().sum()

0

In [9]:
df = df.dropna(subset=['brand', 'product', 'description'])

In [10]:
df.isnull().sum()

index           0
product         0
category        0
sub_category    0
brand           0
sale_price      0
market_price    0
type            0
rating          0
description     0
dtype: int64

In [11]:
df.shape

(27439, 10)

In [12]:
df.duplicated().sum()

0

In [13]:
df.columns

Index(['index', 'product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

## Text preprocessing

In [14]:
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [15]:
# Define a regex pattern to match URLs
url_pattern = re.compile(r'https?://\S+')

# Define a function to remove URLs from text
def remove_urls(text):
    return url_pattern.sub('', text)

# Apply the function to the 'text' column and create a new column 'clean_text'
df['description'] = df['description'].apply(remove_urls)

In [16]:
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)

In [17]:
df[['description', 'category', 'type', 'brand', 'sub_category', 'product']] = df[['description', 'category', 'type', 'brand', 'sub_category', 'product']].replace(to_replace=r'\d', value='', regex=True)

In [18]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
df['description'] = df['description'].apply(word_tokenize)

In [20]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
stop_words = set(stopwords.words('english'))
df['description'] = df['description'].apply(lambda x: [word for word in x if word not in stop_words])

In [22]:
df['description'] = df['description'].apply(lambda x: [word for word in x if word.isalnum()])

In [23]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# define function to lemmatize tokens
def lemmatize_tokens(tokens):
    # convert POS tag to WordNet format
    def get_wordnet_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    # lemmatize tokens
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]

    # return lemmatized tokens as a list
    return lemmas

# apply lemmatization function to column of dataframe
df['description'] = df['description'].apply(lemmatize_tokens)

In [25]:
df['description'] = df['description'].apply(lambda x: ' '.join(x))

# Feature extraction

In [26]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'])

# # Convert the TF-IDF matrix to a DataFrame for better handling
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# # Concatenate the TF-IDF features with the original DataFrame
# df = pd.concat([df, tfidf_df], axis=1)

In [27]:
tfidf_matrix_sparse = tfidf_matrix.tocsr()  # Convert to Compressed Sparse Row format

In [28]:
from sklearn.decomposition import TruncatedSVD

n_components = 5000  # Adjust as needed
svd = TruncatedSVD(n_components=n_components)

tfidf_matrix_svd = svd.fit_transform(tfidf_matrix_sparse)

In [29]:
from sklearn.model_selection import train_test_split

# Assuming tfidf_matrix_svd is your feature matrix
X_train, X_test = train_test_split(tfidf_matrix_svd, test_size=0.2, random_state=42)

In [30]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Calculate cosine similarity on the training set
# cosine_similarities = cosine_similarity(X_train)

# # Function to get top N recommendations for a given item
# def get_top_recommendations(item_index, similarity_matrix, n=5):
#     similar_items = list(enumerate(similarity_matrix[item_index]))
#     similar_items_sorted = sorted(similar_items, key=lambda x: x[1], reverse=True)
#     top_recommendations = similar_items_sorted[1:n+1]  # Exclude the item itself
#     return top_recommendations

# # Example: Get top 5 recommendations for the first item in the test set
# item_index = 0  # Adjust as needed
# top_recommendations = get_top_recommendations(item_index, cosine_similarities, n=5)
# print(top_recommendations)

In [31]:
# # Example: Evaluate precision and recall for the top 5 recommendations on the test set
# def evaluate_recommendations(X_test, similarity_matrix, n=5):
#     total_precision = 0
#     total_recall = 0

#     for item_index in range(len(X_test)):
#         top_recommendations = get_top_recommendations(item_index, similarity_matrix, n)
#         relevant_items = set(X_test[item_index].indices)  # Indices of nonzero elements in the TF-IDF matrix
#         recommended_items = set(rec[0] for rec in top_recommendations)

#         # Calculate precision and recall
#         precision = len(recommended_items.intersection(relevant_items)) / len(recommended_items) if recommended_items else 0
#         recall = len(recommended_items.intersection(relevant_items)) / len(relevant_items) if relevant_items else 0

#         total_precision += precision
#         total_recall += recall

#     avg_precision = total_precision / len(X_test)
#     avg_recall = total_recall / len(X_test)

#     return avg_precision, avg_recall

# # Example: Evaluate precision and recall for the top 5 recommendations on the test set
# precision, recall = evaluate_recommendations(X_test, cosine_similarities, n=5)
# print(f'Precision: {precision}, Recall: {recall}')


In [32]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity on the training set
cosine_similarities = cosine_similarity(X_train)

# Function to get top N recommendations for a given item
def get_top_recommendations(item_index, similarity_matrix, n=5):
    similar_items = list(enumerate(similarity_matrix[item_index]))
    similar_items_sorted = sorted(similar_items, key=lambda x: x[1], reverse=True)
    top_recommendations = similar_items_sorted[1:n+1]  # Exclude the item itself
    return top_recommendations

# Function to convert sparse matrix to set of indices
def sparse_to_set(sparse_matrix):
    if sparse_matrix.getnnz() == 0:
        return set()
    return set(sparse_matrix.indices)

# Example: Evaluate precision and recall for the top 5 recommendations on the test set
def evaluate_recommendations(X_test, similarity_matrix, n=5):
    total_precision = 0
    total_recall = 0

    for item_index in range(len(X_test)):
        top_recommendations = get_top_recommendations(item_index, similarity_matrix, n)

        if isinstance(X_test, np.ndarray):
            relevant_items = set(np.nonzero(X_test[item_index])[0])  # Indices of nonzero elements in the array
        else:  # Assuming it's a sparse matrix
            relevant_items = sparse_to_set(X_test[item_index])

        recommended_items = set(rec[0] for rec in top_recommendations)

        # Calculate precision and recall
        precision = len(recommended_items.intersection(relevant_items)) / len(recommended_items) if recommended_items else 0
        recall = len(recommended_items.intersection(relevant_items)) / len(relevant_items) if relevant_items else 0

        total_precision += precision
        total_recall += recall

    avg_precision = total_precision / len(X_test)
    avg_recall = total_recall / len(X_test)

    return avg_precision, avg_recall

# Example: Evaluate precision and recall for the top 5 recommendations on the test set
precision, recall = evaluate_recommendations(X_test, cosine_similarities, n=5)
print(f'Precision: {precision}, Recall: {recall}')


Precision: 0.26286443148689204, Recall: 0.0002628644314868653
