### Importing the Libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

### Importing the Data:

In [193]:
# Read the raw ratings file; the file is not UTF-8, so the encoding is given explicitly.
data = pd.read_csv(
    "book.csv",
    encoding="ISO-8859-1",
)
data

Unnamed: 0.1,Unnamed: 0,User.ID,Book.Title,Book.Rating
0,1,276726,Classical Mythology,5
1,2,276729,Clara Callan,3
2,3,276729,Decision in Normandy,6
3,4,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,5,276737,The Mummies of Urumchi,6
...,...,...,...,...
9995,9996,162121,American Fried: Adventures of a Happy Eater.,7
9996,9997,162121,Cannibal In Manhattan,9
9997,9998,162121,How to Flirt: A Practical Guide,7
9998,9999,162121,Twilight,8


In [194]:
# Keep only the three rating columns, selected by name.
# A positional slice (`iloc[:, 1:]`) removes one more column every time the cell
# is re-run; selecting by name gives the same result on every run.
data = data[["User.ID", "Book.Title", "Book.Rating"]]

In [195]:
# Display the ratings table after the column clean-up.
data

Unnamed: 0,User.ID,Book.Title,Book.Rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
...,...,...,...
9995,162121,American Fried: Adventures of a Happy Eater.,7
9996,162121,Cannibal In Manhattan,9
9997,162121,How to Flirt: A Practical Guide,7
9998,162121,Twilight,8


### EDA:

In [112]:
# Number of missing values per column.
data.isna().sum()

User.ID        0
Book.Title     0
Book.Rating    0
dtype: int64

In [4]:
# Number of distinct users (missing values, if any, counted as one value).
data["User.ID"].nunique(dropna=False)

2182

In [5]:
# Number of distinct book titles (missing values, if any, counted as one value).
data["Book.Title"].nunique(dropna=False)

9659

### Making a Pivot table of each user with the different books they rated:

In [196]:
# One row per user, one column per book title, cell = that user's rating.
# Repeated (user, title) pairs are averaged; pairs with no rating are stored as 0.
df = data.pivot_table(
    index="User.ID",
    columns="Book.Title",
    values="Book.Rating",
    aggfunc="mean",
    fill_value=0,
)

In [197]:
# Display the user x book rating matrix.
df

Book.Title,"Jason, Madison &amp",Other Stories;Merril;1985;McClelland &amp,Repairing PC Drives &amp,'48,'O Au No Keia: Voices from Hawai'I's Mahu and Transgender Communities,...AND THE HORSE HE RODE IN ON : THE PEOPLE V. KENNETH STARR,01-01-00: A Novel of the Millennium,"1,401 More Things That P*Ss Me Off",10 Commandments Of Dating,"100 Great Fantasy Short, Short Stories",...,Zora Hurston and the Chinaberry Tree (Reading Rainbow Book),\Even Monkeys Fall from Trees\ and Other Japanese Proverbs,\I Won't Learn from You\: And Other Thoughts on Creative Maladjustment,"\More More More,\ Said the Baby",\O\ Is for Outlaw,"\Surely You're Joking, Mr. Feynman!\: Adventures of a Curious Character","\Well, there's your problem\: Cartoons",iI Paradiso Degli Orchi,stardust,Ã?Â?bermorgen.
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278851,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,0
278852,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Building the Recommendation using Cosine similarity:

In [198]:
# Pairwise cosine similarity between user rating vectors (similarity = 1 - cosine distance).
similar_users = 1.0 - pairwise_distances(df.to_numpy(), metric="cosine")
similar_users

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [199]:
# Wrap the similarity matrix in a DataFrame (positional 0..n-1 labels for now).
# NOTE(review): pandas may wrap the array without copying it, in which case in-place
# changes to `similar_users` show through in this frame; under Copy-on-Write the array
# is copied and they do not. Do not rely on either behaviour.
sim_users_df = pd.DataFrame(similar_users)
sim_users_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2172,2173,2174,2175,2176,2177,2178,2179,2180,2181
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [200]:
# Label both axes with the user ids so that rows and columns can be looked up by user.
sim_users_df.index = sim_users_df.columns = df.index
sim_users_df

User.ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [201]:
# Fill the diagonal with 0 so that the model doesn't match each user with themselves.
np.fill_diagonal(similar_users, 0)
# Apply the same change to the DataFrame explicitly instead of counting on it sharing
# memory with `similar_users`: pandas may copy the array when the frame is built
# (it does under Copy-on-Write), and the frame would then keep its 1.0 diagonal.
sim_users_df = sim_users_df.mask(np.eye(len(sim_users_df), dtype=bool), 0.0)
sim_users_df

User.ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278852,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [202]:
# Turn every exact 0 into NaN so that the next step can skip users with no overlap.
sim_users_df = sim_users_df.mask(sim_users_df == 0)
sim_users_df

User.ID,8,9,10,12,14,16,17,19,22,26,...,278831,278832,278836,278843,278844,278846,278849,278851,278852,278854
User.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278846,,,,,,,,,,,...,,,,,,,,,,
278849,,,,,,,,,,,...,,,,,,,,,,
278851,,,,,,,,,,,...,,,,,,,,,,
278852,,,,,,,,,,,...,,,,,,,,,,


In [203]:
# For every user, get the id of the user with the maximum similarity.
# Many rows contain only NaN (no overlap with anyone). `idxmax` on an all-NaN row is
# deprecated in recent pandas and is slated to raise, so it is computed only on rows
# that have at least one value; the result is then reindexed back to all users,
# which leaves NaN for users without any neighbour.
similarity = pd.DataFrame(
    sim_users_df.dropna(how="all")
    .idxmax(axis=1)
    .reindex(sim_users_df.index)
)
similarity

Unnamed: 0_level_0,0
User.ID,Unnamed: 1_level_1
8,
9,
10,
12,
14,
...,...
278846,
278849,
278851,278202.0
278852,


### Users with Maximum similarity:

In [204]:
# Keep only the users for whom a most-similar user was found.
similarity.dropna(axis=0, how="any")

Unnamed: 0_level_0,0
User.ID,Unnamed: 1_level_1
19,278418.0
51,3757.0
53,1996.0
82,882.0
83,276861.0
...,...
278694,161831.0
278715,277710.0
278843,277959.0
278844,276813.0


### Checking if our model is working correctly:

In [225]:
# All ratings given by the two users that were paired as most similar.
data[data["User.ID"].isin([278851, 278202])]

Unnamed: 0,User.ID,Book.Title,Book.Rating
1706,278202,The Florabama Ladies' Auxiliary &amp; Sewing C...,3
1707,278202,Howl and Other Poems (Pocket Poets),2
1708,278202,"Frankenstein: Complete, Authoritative Text Wit...",4
1709,278202,Short Stories of Ernest Hemingway (A Scribner ...,5
1710,278202,Back When We Were Grownups : A Novel (Ballanti...,3
1711,278202,To Kill a Mockingbird,9
1712,278202,Birds Bees And Babies 1994,3
1713,278202,The Cat Who Played Brahms,10
1714,278202,Black Notice,3
1715,278202,Irish Eyes: A Nuala Anne McGrail Novel (Nuala ...,5


##### Observation: Both users that the model paired as similar have rated the book "Black Notice". This shared title is what links them, so the model is relating users through the ratings they gave.

In [210]:
# Side-by-side view of both users' ratings, aligned on book title
# (an outer join keeps the books that only one of them rated).
user_1 = data.loc[data["User.ID"] == 278851]
user_2 = data.loc[data["User.ID"] == 278202.0]
user_1.merge(user_2, how="outer", on="Book.Title")

Unnamed: 0,User.ID_x,Book.Title,Book.Rating_x,User.ID_y,Book.Rating_y
0,278851.0,So You Want to Be a Stay-At-Home Mom,5.0,,
1,278851.0,Black Notice,5.0,278202.0,3.0
2,278851.0,Mystic River,5.0,,
3,278851.0,Hitched,7.0,,
4,278851.0,Fantastic Imagination,5.0,,
5,278851.0,Six of Swords,5.0,,
6,278851.0,Paradise Wild,7.0,,
7,278851.0,iI Paradiso Degli Orchi,7.0,,
8,278851.0,The Celestine Prophecy : An Experiential Guide,5.0,,
9,278851.0,Huis Clos Suivi de Les Mouches (Folio Ser. No....,7.0,,


###### We can now recommend to each of these two users the books that only the other one has read. The same can be done for all 368 pairs of similar users found above.

### Checking the similarity of the book based on the rating:

In [262]:
# Function to find similarity with other books based on ratings
def book_similarity(data, book_title, top_n=5):
    """Print the books whose ratings correlate most strongly with ``book_title``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rating matrix with one column per book title.
    book_title : str
        Column of ``data`` to compare every other column against.
    top_n : int, default 5
        Maximum number of books to list.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    KeyError
        If ``book_title`` is not a column of ``data``.
    """
    rating = data[book_title]
    # Correlation of every column with the chosen book; undefined (NaN) values are dropped.
    corr_df = data.corrwith(rating).to_frame(name='Correlation').dropna()
    # Keep positively correlated books only, and never list the book itself.
    is_candidate = (corr_df['Correlation'] > 0) & (corr_df.index != book_title)
    top_matches = corr_df[is_candidate].sort_values(by='Correlation', ascending=False).head(top_n)
    print(f'---Top {top_n} similar books for book "{book_title}" are:')
    print(top_matches)
    print("____________________\n")

In [245]:
# The 10 titles that received the largest number of ratings.
data.loc[:, "Book.Title"].value_counts().iloc[:10]

Fahrenheit 451                                     5
Ender's Game (Ender Wiggins Saga (Paperback))      4
The Subtle Knife (His Dark Materials, Book 2)      4
Charlie and the Chocolate Factory                  4
The Amber Spyglass (His Dark Materials, Book 3)    4
Stardust                                           4
Vanished                                           4
The Face                                           3
High Fidelity                                      3
The Golden Compass (His Dark Materials, Book 1)    3
Name: Book.Title, dtype: int64

In [246]:
# Titles (index labels) of the 10 most frequently rated books.
famous_books = data["Book.Title"].value_counts().iloc[:10].index

#### Here are the top 5 books similar to the top 10 famous books from this dataset:

In [263]:
# Print the most similar books for each of the most frequently rated titles.
for book in famous_books:
    book_similarity(data=df, book_title=book)

---Top 5 similar books for book "Fahrenheit 451" are:
                                                    Correlation
Book.Title                                                     
The Day I Swapped My Dad for 2 Goldfish                0.596326
Marriage By Contract  (36 Hours) (Harlequin 36 ...     0.596326
For The Love Of Beau (Delta Justice) (Delta Jus...     0.596326
Julia (Circle Of Friends) (Harlequin Superroman...     0.596326
Silent Witness                                         0.596326
____________________

---Top 5 similar books for book "Ender's Game (Ender Wiggins Saga (Paperback))" are:
                                                 Correlation
Book.Title                                                  
The Brothers Karamazov (Vintage Classics)           0.626405
Song of Solomon (Oprah's Book Club (Paperback))     0.626405
Come Before Winter and Share My Hope                0.547992
Non-manipulative selling                            0.547992
The Bear and the Dragon  

##### We can now recommend, to anyone who buys a book from this dataset, the books whose ratings are most highly correlated with it.