#                               Importing Necessary Libraries

In [3]:
#Basic Libraries
import numpy as np
import pandas as pd

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Text Handling Libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [4]:
df = pd.read_csv('/content/BigBasket Products.csv')

In [None]:
df.head()

# Data Loading and Cleaning

In [6]:
df.isnull().sum()

index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [7]:
df.shape[0]

27555

In [8]:
#calculating parcentage of null value in each columun
print('Parcentage of null values column wise')
print('-'*60)
for col in df.columns:
    null_count=df[col].isnull().sum()
    total_count=df.shape[0]
    print("{}:{:.2f}".format(col,null_count/total_count*100))
    

Parcentage of null values column wise
------------------------------------------------------------
index:0.00
product:0.00
category:0.00
sub_category:0.00
brand:0.00
sale_price:0.00
market_price:0.00
type:0.00
rating:31.30
description:0.42


In [9]:
#calculating total null data
print('Parcentage of null data from total data:')
null_count=df.isnull().sum().sum()
total_count=np.product(df.shape)
print('{:.2f}'.format(null_count/total_count*100))



Parcentage of null data from total data:
3.17


So overall 3% data is missing but 31% of ratings are missing. Since we are going to create a recommender system, let's drop the null values as their will still be over 69% data for recommendation purposes which is enough for us.

In [10]:
df=df.dropna()

In [11]:
df.isnull().sum()

index           0
product         0
category        0
sub_category    0
brand           0
sale_price      0
market_price    0
type            0
rating          0
description     0
dtype: int64

In [12]:
df.shape

(18840, 10)

So even after dropping null data, 18000+ products are available for recommendation.
Let's recommend now!!

# Exploratory Data Analysis

In [13]:
df.head(2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."


In [14]:
def bar_plot(enter_column):
    counts=df[enter_column].value_counts()
    category_count=pd.DataFrame({'Category':counts.index,'Counts':counts.values})[:10]
    return px.bar(data_frame=category_count,
       x='Category',
       y='Counts',
       color='Counts',
       color_continuous_scale='blues',
       text_auto=True,
       title='Count of Items in each '+ enter_column)

In [15]:
bar_plot('product')


In [16]:
bar_plot('brand')

In [17]:
bar_plot('category')

In [18]:
bar_plot('sub_category')

In [19]:
bar_plot('type')

# Demographic Filter Recommendor


Demographic Filtering is like recommeding items based on a feature. Like the top 10 rated items or the top 10 items in a particular category.

In [20]:
def sort_recommendor(col):
    
    """"A recommendor based on sorting products on the column passed.
    Arguments to be passed:
    
    col: The Feature to be used for recommendation.
    sort_type: True for Ascending Order"""
    
    rated_recommend=df.copy()
    
    if rated_recommend[col].dtype=='O':
        col='rating'
    rated_recommend=rated_recommend.sort_values(by=col,ascending=True)
    
    return rated_recommend[['product','brand','sale_price','rating']].head(10)
    

In [21]:
sort_recommendor('sale_price')

Unnamed: 0,product,brand,sale_price,rating
21312,Serum,Livon,3.0,2.5
18290,Sugar Coated Chocolate,Cadbury Gems,5.0,4.2
21228,Dish Shine Bar,Exo,5.0,4.2
14538,Cadbury Perk - Chocolate Bar,Cadbury,5.0,4.2
19538,Layer Cake - Chocolate,Winkies,5.0,4.2
2978,Sugar Free Chewing Gum - Mixed Fruit,Orbit,5.0,4.2
15926,Dreams Cup Cake - Choco,Elite,5.0,3.9
6014,Good Day Butter Cookies,Britannia,5.0,4.1
27413,Layer Cake - Orange,Winkies,5.0,4.1
11306,Happy Happy Choco-Chip Cookies,Parle,5.0,4.2


Notice that our top product has rating of 2.5 which is quite bad so let's filter down by setting a threshold rating.

In [22]:
df.rating.mean()

3.9430626326963902

So the average rating of products is 3.94 Let's use 3.5 as the threshold.

In [23]:
df.copy().loc[df.rating >= 3.5]

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.00,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.00,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.00,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...
6,7,Multani Mati,Beauty & Hygiene,Skin Care,Satinance,58.00,58.0,Face Care,3.6,Satinance multani matti is an excellent skin t...
7,8,Hand Sanitizer - 70% Alcohol Base,Beauty & Hygiene,Bath & Hand Wash,Bionova,250.00,250.0,Hand Wash & Sanitizers,4.0,70%Alcohol based is gentle of hand leaves skin...
...,...,...,...,...,...,...,...,...,...,...
27550,27551,"Wottagirl! Perfume Spray - Heaven, Classic",Beauty & Hygiene,Fragrances & Deos,Layerr,199.20,249.0,Perfume,3.9,Layerr brings you Wottagirl Classic fragrant b...
27551,27552,Rosemary,Gourmet & World Food,Cooking & Baking Needs,Puramate,67.50,75.0,"Herbs, Seasonings & Rubs",4.0,Puramate rosemary is enough to transform a dis...
27552,27553,Peri-Peri Sweet Potato Chips,Gourmet & World Food,"Snacks, Dry Fruits, Nuts",FabBox,200.00,200.0,Nachos & Chips,3.8,We have taken the richness of Sweet Potatoes (...
27553,27554,Green Tea - Pure Original,Beverages,Tea,Tetley,396.00,495.0,Tea Bags,4.2,"Tetley Green Tea with its refreshing pure, ori..."


In [24]:
def sort_recommendor(col='rating',sort_type = False):
    
    rated_recommend = df.copy().loc[df.rating >= 3.5]
    if rated_recommend[col].dtype == 'O':
        col='rating'
    rated_recommend = rated_recommend.sort_values(by=col,ascending = sort_type)
    return rated_recommend[['product','brand','sale_price','rating']].head(10)

In [25]:
sort_recommendor('sale_price')

Unnamed: 0,product,brand,sale_price,rating
1766,Good Girl Eau De Parfum For Women,Carolina Herrera,6660.0,5.0
12192,Man In Black Eau De Parfum,Bvlgari,6100.0,4.0
21637,"Casserole Set - Die-Cast, Induction Base, Granite",Wonderchef,6000.0,5.0
17767,Eau De Toilette For Men,Bentley,4905.0,5.0
13559,"Dog Food - Super Premium, Focus, Puppy, Limite...",Drools,4675.0,4.6
27538,Quista Pro Advanced Whey Protein Formula forti...,Himalaya,4500.0,4.0
2365,Pro Expert Nutrition Large Breed Puppy (3-18 M...,Pedigree,4480.0,5.0
6451,Extra Virgin Olive Oil,Pietro Coricelli,4400.0,4.7
22714,Olive Oil - Refined Pomace Mild,Basso,4399.0,5.0
17076,Olive Pomace Oil,Abbies,4399.0,4.5


Notice that the 2.5 rated product is not recommended now!! This was our first recommendor.
Quite easy yet effective and used a lot !!

# Content Based Recommendor


Let's try using other features such as Category, Sub Category, Brand, Type and Description for much better Recommendation.
We will be using NLP here to extract useful info from the features especially Description.

In [26]:
tfidf=TfidfVectorizer(stop_words='english')
tfidf_matrix=tfidf.fit_transform(df['description'])
tfidf_matrix.shape

(18840, 23342)

Now to compute the similarity score, let's use Linear_Kernel.
Linear Kernel which Calculates the Dot Product of the tfidf_matrix and returns an aggreate value depicting the Similarity score.


So we will be recommending items based on similarity score.

But our problem is that we will be getting back the similarity scores so we will be sorting the scores.

Now we need a reverse-map to get the title and that is what indices is for

In [30]:
indices=pd.Series(df.index, index=df['product']).drop_duplicates()
indices['Water Bottle - Orange']

1

In [32]:

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df.index, index=df['product']).drop_duplicates()

def get_recommendations_1(title, cosine_sim=cosine_sim):
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['product'].iloc[movie_indices]

In [33]:
get_recommendations_1('Cadbury Perk - Chocolate Bar')

17385                                Cashew Nuts - Salted
23126                        Nutrione - Baked Cashew Nuts
11962    Signature Roasted & Salted Cashew/Godambi - W240
23600                                             Cashews
11947                               Sunflower Seeds - Raw
8765                                     Chilli Nut Chaat
1986                         Whole Cashew/Godambi - Jumbo
2907                                      Cashew - Salted
21538            Salted Party Mix - Premium International
25887                               Broken Cashew/Godambi
Name: product, dtype: object


Our search was chocolate yet we got Cashews and Nuts recommended.


We need to optimize this based on category, sub_category and brand.


In [34]:
df.head(2)

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."



Notice that a product can be in multiple catergories and sub_categories and they are separated with a &.


Let's split them into a list for futher processes.

In [35]:
df1=df.copy()



In [36]:

lambda a:a.strip()#space removel
lambda a:list(map(lambda a:a.strip(),re.split('&|, |\*|\n')))

<function __main__.<lambda>(a)>

In [37]:
get_list=lambda a:list(map(lambda a:a.strip(),re.split('&|, |\*|\n',a)))

In [38]:
for i in ['category','sub_category','type']:
  df1[i]=df1[i].apply(get_list)

In [39]:
def cleaner(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

To avoid duplicacy, we will be converting everything to lowercase and also removing spaces between words.

This will ensure that our recommendor doesn't consider Chocolate of Cholocate IceCream and Chocolate Bar as the same.

We will now be joining the values of category, sub_category, type and brand

In [40]:
for i in ['category', 'sub_category','brand', 'type']:
  df1[i]=df1[i].apply(cleaner)

In [41]:
def couple(x):
    return ' '.join(x['category']) + ' ' + ' '.join(x['sub_category']) + ' '+x['brand']+' ' +' '.join( x['type'])
df1['tag'] = df1.apply(couple, axis=1)

In [42]:
df1=df1.drop(columns=['category','sub_category','brand','type'])

We need to Count the String Vectors and then compute the Cosine Similarity Score.



In [43]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df1['tag'])

We need to Count the String Vectors and then compute the Cosine Similarity Score.


In [44]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)


In [45]:
df1.head(2)

Unnamed: 0,index,product,sale_price,market_price,rating,description,tag
0,1,Garlic Oil - Vegetarian Capsule 500 mg,220.0,220.0,4.1,This Product contains Garlic Oil that is known...,beauty hygiene haircare srisriayurveda hairoil...
1,2,Water Bottle - Orange,180.0,180.0,2.3,"Each product is microwave safe (without lid), ...",kitchen garden pets storage accessories master...


In [58]:
df1= df1.reset_index()
indices = pd.Series(df1.index, index=df1['product'])

In [59]:
def get_recommendations_2(title, cosine_sim=cosine_sim):
    idx = indices[title]

    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]

    return df1['product'].iloc[movie_indices]

In [61]:
 get_recommendations_2('Water Bottle - Orange')

7721     Rectangular Plastic Container - With Lid, Mult...
7945                                Jar - With Lid, Yellow
18072     Round & Flat Storage Container - With lid, Green
4214     Premium Rectangular Plastic Container With Lid...
6506     Premium Round Plastic Container With Lid - Yellow
9546     Premium Rectangular Plastic Container With Lid...
13238    Premium Round & Flat Storage Container With Li...
16581      Premium Round Plastic Container With Lid - Blue
17810    Premium Round Plastic Container With Lid - Mul...
17816      Premium Round Plastic Container With Lid - Pink
Name: product, dtype: object


Comparing Old and New Recommedations


In [62]:
old_rec = get_recommendations_1('Water Bottle - Orange').values
new_rec = get_recommendations_2('Water Bottle - Orange', cosine_sim2).values

pd.DataFrame({'Old Recommendor': old_rec,'New Recommendor':new_rec})

Unnamed: 0,Old Recommendor,New Recommendor
0,"Rectangular Plastic Container - With Lid, Mult...",Glass Water Bottle - Aquaria Organic Purple
1,"Jar - With Lid, Yellow",Glass Water Bottle With Round Base - Transpare...
2,"Round & Flat Storage Container - With lid, Green",H2O Unbreakable Water Bottle - Pink
3,Premium Rectangular Plastic Container With Lid...,Water Bottle H2O Purple
4,Premium Round Plastic Container With Lid - Yellow,H2O Unbreakable Water Bottle - Green
5,Premium Rectangular Plastic Container With Lid...,Regel Tritan Plastic Sports Water Bottle - Black
6,Premium Round & Flat Storage Container With Li...,Apsara 1 Water Bottle - Assorted Colour
7,Premium Round Plastic Container With Lid - Blue,"Glass Water Bottle With Round Base - Yellow, B..."
8,Premium Round Plastic Container With Lid - Mul...,Trendy Stainless Steel Bottle With Steel Cap -...
9,Premium Round Plastic Container With Lid - Pink,"Penta Plastic Pet Water Bottle - Violet, Wide ..."



Our new recommendation are much better compared to the old ones.