# BOOK RECOMMENDATION (COLLABORATIVE FILTERING)

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
groc = pd.read_csv("groceries_data.csv")


In [3]:
groc.head()

Unnamed: 0,Person,Item,Ratings
0,1,citrus fruit,9.0
1,1,semi-finished bread,7.0
2,1,margarine,8.0
3,1,ready soups,7.0
4,2,tropical fruit,9.0


In [5]:
print("groc.shape:",groc.shape)


groc.shape: (43367, 3)


In [6]:
groc.isnull().sum()

Person       0
Item         0
Ratings    114
dtype: int64

In [9]:
groc.fillna(0,inplace=True)
groc.isnull().sum()

Person     0
Item       0
Ratings    0
dtype: int64

In [10]:
groc.duplicated().sum()

0

## Popularity Based Recommendation

In [11]:
avg_rating_df=pd.DataFrame(groc.groupby('Item').mean()['Ratings'])
avg_rating_df.rename(columns={'Ratings':'Avg_Rating'},inplace=True)
avg_rating_df

Unnamed: 0_level_0,Avg_Rating
Item,Unnamed: 1_level_1
Instant food products,7.594937
UHT-milk,7.993921
abrasive cleaner,7.000000
artif. sweetener,7.437500
baby cosmetics,8.000000
...,...
white bread,7.995169
white wine,7.935829
whole milk,7.998408
yogurt,7.996356


In [12]:
num_ratings_df=pd.DataFrame(groc.groupby('Item').count()['Ratings'].reset_index())
num_ratings_df.rename(columns={'Ratings':'Num_Rating'},inplace=True)
num_ratings_df

Unnamed: 0,Item,Num_Rating
0,Instant food products,79
1,UHT-milk,329
2,abrasive cleaner,35
3,artif. sweetener,32
4,baby cosmetics,6
...,...,...
164,white bread,414
165,white wine,187
166,whole milk,2513
167,yogurt,1372


In [13]:
popular_df=num_ratings_df.merge(avg_rating_df,on='Item')
popular_df

Unnamed: 0,Item,Num_Rating,Avg_Rating
0,Instant food products,79,7.594937
1,UHT-milk,329,7.993921
2,abrasive cleaner,35,7.000000
3,artif. sweetener,32,7.437500
4,baby cosmetics,6,8.000000
...,...,...,...
164,white bread,414,7.995169
165,white wine,187,7.935829
166,whole milk,2513,7.998408
167,yogurt,1372,7.996356


In [15]:
popular_df=popular_df[popular_df['Num_Rating']>=250].sort_values('Avg_Rating',ascending=False)
popular_df=popular_df.head(50)
print(popular_df)
print(popular_df.shape)


                      Item  Num_Rating  Avg_Rating
31            citrus fruit         814    9.000000
10                 berries         327    9.000000
158         tropical fruit        1032    9.000000
124        root vegetables        1072    9.000000
110              pip fruit         744    9.000000
103       other vegetables        1903    9.000000
100                 onions         305    9.000000
65   fruit/vegetable juice         711    8.036568
35                  coffee         571    8.001751
166             whole milk        2513    7.998408
162     whipped/sour cream         705    7.997163
167                 yogurt        1372    7.996356
164            white bread         414    7.995169
1                 UHT-milk         329    7.993921
123             rolls/buns        1809    7.993919
16                  butter         545    7.990826
17             butter milk         275    7.989091
147    specialty chocolate         299    7.969900
40           cream cheese      

In [16]:
groc

Unnamed: 0,Person,Item,Ratings
0,1,citrus fruit,9.0
1,1,semi-finished bread,7.0
2,1,margarine,8.0
3,1,ready soups,7.0
4,2,tropical fruit,9.0
...,...,...,...
43362,9835,chicken,7.0
43363,9835,tropical fruit,9.0
43364,9835,other vegetables,9.0
43365,9835,vinegar,6.0


In [17]:
popular_df=popular_df.merge(groc,on='Item')
popular_df

Unnamed: 0,Item,Num_Rating,Avg_Rating,Person,Ratings
0,citrus fruit,814,9.0,1,9.0
1,citrus fruit,814,9.0,12,9.0
2,citrus fruit,814,9.0,40,9.0
3,citrus fruit,814,9.0,73,9.0
4,citrus fruit,814,9.0,77,9.0
...,...,...,...,...,...
32128,brown bread,638,6.0,9756,6.0
32129,brown bread,638,6.0,9767,6.0
32130,brown bread,638,6.0,9792,6.0
32131,brown bread,638,6.0,9801,6.0


In [18]:

popular_df=popular_df.drop_duplicates('Item')

print(popular_df.shape)

popular_df

(50, 5)


Unnamed: 0,Item,Num_Rating,Avg_Rating,Person,Ratings
0,citrus fruit,814,9.0,1,9.0
814,berries,327,9.0,34,9.0
1141,tropical fruit,1032,9.0,2,9.0
2173,root vegetables,1072,9.0,25,9.0
3245,pip fruit,744,9.0,4,9.0
3989,other vegetables,1903,9.0,5,9.0
5892,onions,305,9.0,83,9.0
6197,fruit/vegetable juice,711,8.036568,16,8.0
6908,coffee,571,8.001751,2,9.0
7479,whole milk,2513,7.998408,3,9.0


In [19]:
pop_df=popular_df[['Item','Num_Rating','Avg_Rating']]
pop_df

Unnamed: 0,Item,Num_Rating,Avg_Rating
0,citrus fruit,814,9.0
814,berries,327,9.0
1141,tropical fruit,1032,9.0
2173,root vegetables,1072,9.0
3245,pip fruit,744,9.0
3989,other vegetables,1903,9.0
5892,onions,305,9.0
6197,fruit/vegetable juice,711,8.036568
6908,coffee,571,8.001751
7479,whole milk,2513,7.998408


## Collaborative Filtering

In [20]:
groc

Unnamed: 0,Person,Item,Ratings
0,1,citrus fruit,9.0
1,1,semi-finished bread,7.0
2,1,margarine,8.0
3,1,ready soups,7.0
4,2,tropical fruit,9.0
...,...,...,...
43362,9835,chicken,7.0
43363,9835,tropical fruit,9.0
43364,9835,other vegetables,9.0
43365,9835,vinegar,6.0


In [22]:
groc.groupby('Person').count()['Ratings']

Person
1        4
2        3
3        1
4        4
5        4
        ..
9831    17
9832     1
9833    10
9834     4
9835     5
Name: Ratings, Length: 9835, dtype: int64

In [27]:
groc.groupby('Person').count()['Ratings']>15

Person
1       False
2       False
3       False
4       False
5       False
        ...  
9831     True
9832    False
9833    False
9834    False
9835    False
Name: Ratings, Length: 9835, dtype: bool

In [26]:
x=groc.groupby('Person').count()['Ratings']>15
print(x[x])
print(x[x].index)

Person
186     True
240     True
518     True
782     True
949     True
        ... 
9510    True
9524    True
9595    True
9793    True
9831    True
Name: Ratings, Length: 141, dtype: bool
Int64Index([ 186,  240,  518,  782,  949,  980,  981,  990,  991,  997,
            ...
            8958, 9002, 9092, 9116, 9236, 9510, 9524, 9595, 9793, 9831],
           dtype='int64', name='Person', length=141)


In [28]:
users_who_rated=x[x].index
#ratings_with_name['User-ID'].isin(users_who_rated)
filtered_ratings=groc[groc['Person'].isin(users_who_rated)]
filtered_ratings

Unnamed: 0,Person,Item,Ratings
700,186,frankfurter,8.0
701,186,ham,7.0
702,186,tropical fruit,9.0
703,186,pip fruit,9.0
704,186,root vegetables,9.0
...,...,...,...
43342,9831,red/blush wine,8.0
43343,9831,salty snack,7.0
43344,9831,chocolate,7.0
43345,9831,hygiene articles,7.0


In [30]:
y=filtered_ratings.groupby('Item').count()['Ratings']>=7
popular_groc=y[y].index
popular_groc

Index(['Instant food products', 'UHT-milk', 'abrasive cleaner',
       'baking powder', 'beef', 'berries', 'beverages', 'bottled beer',
       'bottled water', 'brown bread', 'butter', 'butter milk', 'cake bar',
       'candy', 'canned beer', 'canned fish', 'canned vegetables', 'cat food',
       'cereals', 'chewing gum', 'chicken', 'chocolate',
       'chocolate marshmallow', 'citrus fruit', 'coffee', 'cream cheese ',
       'curd', 'dessert', 'detergent', 'dish cleaner', 'dog food',
       'domestic eggs', 'female sanitary products', 'flour', 'frankfurter',
       'frozen dessert', 'frozen fish', 'frozen meals',
       'frozen potato products', 'frozen vegetables', 'fruit/vegetable juice',
       'grapes', 'ham', 'hamburger meat', 'hard cheese', 'herbs',
       'house keeping products', 'hygiene articles', 'ice cream', 'jam',
       'kitchen towels', 'long life bakery product', 'margarine', 'mayonnaise',
       'meat', 'misc. beverages', 'mustard', 'napkins', 'newspapers', 'oil',
   

In [31]:
final_ratings=filtered_ratings[filtered_ratings['Item'].isin(popular_groc)]
final_ratings

Unnamed: 0,Person,Item,Ratings
700,186,frankfurter,8.0
701,186,ham,7.0
702,186,tropical fruit,9.0
703,186,pip fruit,9.0
704,186,root vegetables,9.0
...,...,...,...
43341,9831,coffee,8.0
43343,9831,salty snack,7.0
43344,9831,chocolate,7.0
43345,9831,hygiene articles,7.0


In [32]:
final_ratings.drop_duplicates()

Unnamed: 0,Person,Item,Ratings
700,186,frankfurter,8.0
701,186,ham,7.0
702,186,tropical fruit,9.0
703,186,pip fruit,9.0
704,186,root vegetables,9.0
...,...,...,...
43341,9831,coffee,8.0
43343,9831,salty snack,7.0
43344,9831,chocolate,7.0
43345,9831,hygiene articles,7.0


In [33]:
pt=final_ratings.pivot_table(index='Item',columns='Person',values='Ratings')
pt

Person,186,240,518,782,949,980,981,990,991,997,...,8958,9002,9092,9116,9236,9510,9524,9595,9793,9831
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Instant food products,,,,,,,,,,,...,,,,,,,,,,
UHT-milk,,,,,,,8.0,,,,...,8.0,,,,,,,,,
abrasive cleaner,,,,,,,,,,,...,,,,,,,,,,
baking powder,,,,,6.0,,,6.0,,,...,,6.0,,,,,,,,
beef,,,8.0,,,,,,,,...,,,,,8.0,,,7.0,,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
waffles,6.0,,,,,,,6.0,,,...,,,,,6.0,6.0,,,,
whipped/sour cream,8.0,8.0,,,8.0,8.0,,8.0,8.0,8.0,...,8.0,,,,,,,8.0,8.0,8.0
white bread,8.0,,,,,,,8.0,,8.0,...,,,,8.0,,8.0,,,,
whole milk,8.0,8.0,8.0,8.0,8.0,8.0,,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,,,,8.0,8.0,8.0


In [34]:
pt.fillna(0,inplace=True)
pt

Person,186,240,518,782,949,980,981,990,991,997,...,8958,9002,9092,9116,9236,9510,9524,9595,9793,9831
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Instant food products,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UHT-milk,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abrasive cleaner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
baking powder,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0,0.0,0.0,...,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beef,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,7.0,0.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
waffles,6.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0
whipped/sour cream,8.0,8.0,0.0,0.0,8.0,8.0,0.0,8.0,8.0,8.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,8.0
white bread,8.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,8.0,...,0.0,0.0,0.0,8.0,0.0,8.0,0.0,0.0,0.0,0.0
whole milk,8.0,8.0,8.0,8.0,8.0,8.0,0.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,0.0,0.0,0.0,8.0,8.0,8.0


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
similarity_scores=cosine_similarity(pt)
similarity_scores

array([[1.        , 0.        , 0.        , ..., 0.        , 0.14976057,
        0.21181177],
       [0.        , 1.        , 0.        , ..., 0.10497278, 0.06154575,
        0.17180203],
       [0.        , 0.        , 1.        , ..., 0.06579517, 0.15430335,
        0.12921915],
       ...,
       [0.        , 0.10497278, 0.06579517, ..., 1.        , 0.42640143,
        0.43643578],
       [0.14976057, 0.06154575, 0.15430335, ..., 0.42640143, 1.        ,
        0.65133895],
       [0.21181177, 0.17180203, 0.12921915, ..., 0.43643578, 0.65133895,
        1.        ]])

In [37]:
similarity_scores.shape

(94, 94)

In [38]:
#fetch index from grocsery
#np.where(pt.index=='tropical fruit')
print(np.where(pt.index=='tropical fruit')[0][0])



87


In [39]:
pt.index[87]

'tropical fruit'

In [40]:
#finding euclidean distance for 1984 book
#similarity_scores[0]

#list(enumerate(similarity_scores[0]))
similar_items=sorted(list(enumerate(similarity_scores[87])),key=lambda x:x[1],reverse=True)[1:6]
similar_items
#coz 0 is tropical fruit itself

[(92, 0.6374909222302114),
 (61, 0.6277576648248527),
 (93, 0.5909368402852787),
 (72, 0.5352643613280608),
 (65, 0.5197011503876874)]

In [39]:

'''
def recommend_books(book_name):
    x=np.where(pt.index==book_name)[0][0]
    print(x)
    similar_items=sorted(list(enumerate(similarity_scores[x])),key=lambda x:x[1],reverse=True)[1:6]
    
    for i in similar_items:
        print(pt.index[i[0]])   
    return 
'''

'\ndef recommend_books(book_name):\n    x=np.where(pt.index==book_name)[0][0]\n    print(x)\n    similar_items=sorted(list(enumerate(similarity_scores[x])),key=lambda x:x[1],reverse=True)[1:6]\n    \n    for i in similar_items:\n        print(pt.index[i[0]])   \n    return \n'

In [41]:
def recommend_books(groc_name):
    x=np.where(pt.index==groc_name)[0][0]
    print(x)
    similar_items=sorted(list(enumerate(similarity_scores[x])),key=lambda x:x[1],reverse=True)[1:6]
    
    data=[]
    for i in similar_items:
        item=[]
        temp_df=groc[groc['Item']==pt.index[i[0]]]   
        #print(temp_df.drop_duplicates('Book-Title')['Book-Author'])
        item.append(list(temp_df.drop_duplicates('Item')['Item'].values))

        data.append(item)
        
        
    return data

In [48]:
 recommend_books('tropical fruit')

87


[[['whole milk']],
 [['other vegetables']],
 [['yogurt']],
 [['root vegetables']],
 [['pip fruit']]]

In [49]:
pickle.dump(popular_df,open('popular_groc.pkl','wb'))

In [50]:
pickle.dump(pt,open('groc_pt.pkl','wb'))
pickle.dump(groc,open('groc.pkl','wb'))
pickle.dump(similarity_scores,open('similarity_scores.pkl','wb'))