##  Building a recommender system using cosine similarity scores for the books data


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the ratings workbook; the relative path is resolved against the working directory.
BOOK_FILE = 'book.xls'
book = pd.read_excel(BOOK_FILE)

### Renaming the columns to make them easier to understand

In [3]:
# Give the sheet's headers shorter names. 'userid' is the sheet's leading
# unnamed column, while 'ID' is the column the sheet calls 'User.ID'.
short_names = {
    'Unnamed: 0': 'userid',
    'User.ID': 'ID',
    'Book.Title': 'Title',
    'Book.Rating': 'Rating',
}
book1 = book.rename(columns=short_names)
book1

Unnamed: 0,userid,ID,Title,Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [4]:
# Show the ratings ordered by the 'userid' column (book1 itself is not modified).
book1.sort_values(by='userid')

Unnamed: 0,userid,ID,Title,Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [5]:
# Same ordered view as the previous cell: rows sorted on 'userid', book1 left untouched.
book1.sort_values(by=['userid'])

Unnamed: 0,userid,ID,Title,Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [6]:
# Number of distinct values in 'userid' (a missing value, if present, counts as one value).
book1['userid'].nunique(dropna=False)

10000

In [7]:
# Number of distinct values in 'ID' (a missing value, if present, counts as one value).
book1['ID'].nunique(dropna=False)

2182

In [8]:
# How many rows carry each rating value, listed in ascending order of the rating.
rating_counts = book1['Rating'].value_counts()
rating_counts.sort_index()

1       43
2       63
3      146
4      237
5     1007
6      920
7     2076
8     2283
9     1493
10    1732
Name: Rating, dtype: int64

In [9]:
# Number of rows (i.e. ratings) recorded for each title, most frequently rated first.
# This is a count of ratings per title, not a sum of the rating values.
title_counts = book1['Title'].value_counts()
title_counts

Fahrenheit 451                                              5
Charlie and the Chocolate Factory                           4
The Subtle Knife (His Dark Materials, Book 2)               4
Vanished                                                    4
Ender's Game (Ender Wiggins Saga (Paperback))               4
                                                           ..
Murder on St. Mark's Place (Gaslight Mysteries)             1
State of Grace                                              1
Valsalva's Maneuver: Mots Justes and Indispensable Terms    1
I love you, I hate you                                      1
Kids Say the Darndest Things                                1
Name: Title, Length: 9659, dtype: int64

In [10]:
# Reshape the long ratings table into a wide matrix: one row per 'userid', one
# column per 'Title', cells holding 'Rating' (NaN where no rating exists).
# NOTE(review): 'userid' is the renamed 'Unnamed: 0' column and has 10000 distinct
# values (one per rating row), whereas 'ID' (the renamed 'User.ID') has only 2182.
# With 'userid' as the index, every row of this matrix holds a single rating.
# If similarity between readers is the goal, 'ID' is presumably the intended
# index -- confirm, and note that later cells label and filter by 'userid'.
book1_df= book1.pivot(index='userid', columns='Title', values='Rating')
book1_df
# rows are 'userid' values, columns are titles; each cell is the rating recorded for that pair

Title,Classical Mythology,Clara Callan,Decision in Normandy,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,The Mummies of Urumchi,The Kitchen God's Wife,What If?: The World's Foremost Military Historians Imagine What Might Have Been,PLEADING GUILTY,Under the Black Flag: The Romance and the Reality of Life Among the Pirates,Where You'll Find Me: And Other Stories,...,The Cloister Walk,Open Water,The Evolution of Jane,AT PARADISE GATE,I Should Have Stayed Home: The Worst Trips of the Great Writers (Travel Literature Series),American Fried: Adventures of a Happy Eater.,Cannibal In Manhattan,How to Flirt: A Practical Guide,Twilight,Kids Say the Darndest Things
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,3.0,,,,,,,,,...,,,,,,,,,,
3,,,6.0,,,,,,,,...,,,,,,,,,,
4,,,,8.0,,,,,,,...,,,,,,,,,,
5,,,,,6.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,,,,,,,,,,,...,,,,,,7.0,,,,
9997,,,,,,,,,,,...,,,,,,,9.0,,,
9998,,,,,,,,,,,...,,,,,,,,7.0,,
9999,,,,,,,,,,,...,,,,,,,,,8.0,


In [None]:
# Replace NaN (no rating recorded) with 0 so that every cell is numeric before
# the pairwise distance computation, then display the filled matrix.
book1_df = book1_df.fillna(value=0)
book1_df

In [None]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [None]:
# Cosine similarity between every pair of rows of the rating matrix.
# pairwise_distances is asked for the cosine *distance*, so it is subtracted from 1.
row_vectors = book1_df.values
user_sim = np.subtract(1, pairwise_distances(row_vectors, metric='cosine'))
user_sim

In [None]:
# Wrap the similarity array in a DataFrame (default positional labels at this point).
user_sim_df = pd.DataFrame(data=user_sim)
user_sim_df

In [None]:
# Label both axes of the similarity matrix with the user ids.
# The labels are taken from the rating matrix itself, because row i of
# `user_sim` was computed from row i of `book1_df`. `book1.userid.unique()`
# yields ids in order of first appearance in `book1`, which matches the rating
# matrix's row order only when the ids already appear in that same order.
user_labels = book1_df.index.values
user_sim_df.index = user_labels
user_sim_df.columns = user_labels
# Preview the top-left 10 x 10 corner of the labelled matrix.
user_sim_df.iloc[0:10, 0:10]

In [None]:
# Zero the diagonal so that a user is never reported as their own best match.
np.fill_diagonal(user_sim, 0)
# Rebuild the labelled frame from the zeroed array rather than assuming that
# `user_sim_df` still shares memory with `user_sim`: whether the DataFrame
# constructor copies a NumPy array depends on the pandas version and its
# copy-on-write setting. With a copy, the self-similarity on the diagonal
# would survive and every user's best match would be themselves.
user_sim_df = pd.DataFrame(user_sim, index=user_sim_df.index, columns=user_sim_df.columns)
# For each of the first 10 users, the label of the column holding the highest
# similarity, i.e. the most similar other user.
user_sim_df.idxmax(axis=1).iloc[0:10]

In [None]:
# Pull the rating rows for a handful of userids so their titles and ratings can be compared.
users_of_interest = [6, 1, 10, 4]
book1[book1['userid'].isin(users_of_interest)]

In [None]:
# There is not much similarity between the users, so we cannot conclude much from this matrix.
# However, we can say that several users are similar to userid 6, and we can recommend a few titles to them based on the ratings given by the users they are similar to.