In [55]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
books = pd.read_csv("book.csv",encoding='latin1')
books

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [57]:
books['Book.Rating'].unique()

array([ 5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

In [58]:
books.shape

(10000, 4)

In [59]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   10000 non-null  int64 
 1   User.ID      10000 non-null  int64 
 2   Book.Title   10000 non-null  object
 3   Book.Rating  10000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 312.6+ KB


In [60]:
books.duplicated().sum()

0

In [61]:
books.describe()

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Rating
count,10000.0,10000.0,10000.0
mean,5000.5,95321.2498,7.5663
std,2886.89568,117645.703609,1.82152
min,1.0,8.0,1.0
25%,2500.75,2103.0,7.0
50%,5000.5,3757.0,8.0
75%,7500.25,162052.0,9.0
max,10000.0,278854.0,10.0


In [62]:
books = books.iloc[:,1:]
books.head(10)


Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
5,276744,The Kitchen God's Wife,7
6,276745,What If?: The World's Foremost Military Histor...,10
7,276747,PLEADING GUILTY,9
8,276747,Under the Black Flag: The Romance and the Real...,9
9,276747,Where You'll Find Me: And Other Stories,8


In [70]:
books = books.rename({'User.ID':'userID','Book.Title':'title','Book.Rating':'rating'},axis=1)
books.head(10)

Unnamed: 0,userID,title,rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
5,276744,The Kitchen God's Wife,7
6,276745,What If?: The World's Foremost Military Histor...,10
7,276747,PLEADING GUILTY,9
8,276747,Under the Black Flag: The Romance and the Real...,9
9,276747,Where You'll Find Me: And Other Stories,8


In [72]:
len(books)

10000

In [73]:
len(books.userID.unique())

2182

In [74]:
len(books.title.unique())

9659

In [76]:
print(books.userID.nunique(),books.title.nunique())

2182 9659


In [77]:
books.title.unique()

array(['Classical Mythology', 'Clara Callan', 'Decision in Normandy', ...,
       'How to Flirt: A Practical Guide', 'Twilight',
       'Kids Say the Darndest Things'], dtype=object)

In [78]:
books.userID.unique()

array([276726, 276729, 276736, ..., 162113, 162121, 162129], dtype=int64)

In [79]:
books.rating.unique()

array([ 5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

### Popularity Based

In [81]:
rating_count = books.groupby("title").count()['rating'].reset_index()
rating_count.rename(columns={'rating':'rating_count'},inplace=True)
rating_count.head()

Unnamed: 0,title,rating_count
0,"Jason, Madison &amp",1
1,Other Stories;Merril;1985;McClelland &amp,1
2,Repairing PC Drives &amp,1
3,'48,1
4,'O Au No Keia: Voices from Hawai'I's Mahu and ...,1


In [86]:
Avgrating_count = books.groupby('title').mean()['rating'].reset_index()
Avgrating_count.rename(columns={'rating':'Avg_rating'},inplace = True)
Avgrating_count.head()

Unnamed: 0,title,Avg_rating
0,"Jason, Madison &amp",8.0
1,Other Stories;Merril;1985;McClelland &amp,6.0
2,Repairing PC Drives &amp,4.0
3,'48,7.0
4,'O Au No Keia: Voices from Hawai'I's Mahu and ...,8.0


In [94]:
popular_df = rating_count.merge(Avgrating_count,on='title')
popular_df

Unnamed: 0,title,rating_count,Avg_rating
0,"Jason, Madison &amp",1,8.0
1,Other Stories;Merril;1985;McClelland &amp,1,6.0
2,Repairing PC Drives &amp,1,4.0
3,'48,1,7.0
4,'O Au No Keia: Voices from Hawai'I's Mahu and ...,1,8.0
...,...,...,...
9654,"\Surely You're Joking, Mr. Feynman!\: Adventur...",1,8.0
9655,"\Well, there's your problem\: Cartoons",1,9.0
9656,iI Paradiso Degli Orchi,1,7.0
9657,stardust,1,5.0


In [96]:
popular_df = popular_df.sort_values('Avg_rating',ascending=False).head(10)
popular_df

Unnamed: 0,title,rating_count,Avg_rating
2157,Dr Mommy (From Here To Maternity) (Silhouette ...,1,10.0
8082,The Pearl,1,10.0
5535,Pride and Prejudice (Everyman Paperback Classics),1,10.0
1184,CYCLOPS,1,10.0
2630,Fiesta! A Celebration of Latin Hospitality,1,10.0
2629,Fiddleback: A Novel,1,10.0
3277,Hollywood Speaks: Deafness and the Film Entert...,1,10.0
1205,Calliope's Sisters: A Comparative Study of Phi...,1,10.0
8074,The Passion of Molly T,1,10.0
3278,Holy Fire: A Novel (Bantam Spectra Book),1,10.0


### Collaborative System

In [99]:
final_df = books.pivot_table(index="title",columns="userID",values="rating")
final_df.fillna(0,inplace=True)
final_df

userID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Jason, Madison &amp",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Other Stories;Merril;1985;McClelland &amp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Repairing PC Drives &amp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Well, there's your problem\: Cartoons",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iI Paradiso Degli Orchi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
stardust,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [104]:
similarity_score = cosine_similarity(final_df)
similarity_score.shape

(9659, 9659)

In [108]:
similarity_score

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [109]:
np.where(final_df.index=="Classical Mythology")[0][0]

1443

In [110]:
def recommend(book):
    book_index = np.where(final_df.index==book)[0][0]
    distance = similarity_score[book_index]
    book_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x:x[1])[1:6]
    
    for i in book_list:
        print(final_df.index[i[0]])

In [111]:
recommend('Classical Mythology')

 Jason, Madison &amp
 Other Stories;Merril;1985;McClelland &amp
 Repairing PC Drives &amp
'48
'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities


In [112]:
recommend('The Pearl')

Brighton Beach Memoirs
Child of the Silent Night
Dolly Hit Me Back (Family Circus Series)
Sarah, Plain and Tall (Sarah, Plain and Tall)
The Pearl
