In [1]:
import pandas as pd
from pathlib import Path
import os
import numpy as np
from tqdm.notebook import tqdm
from heapq import nlargest
#from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [15]:
df_train= pd.read_csv('trainwithzerostopredict.csv')
df_topredict= pd.read_csv('topredict.csv')

In [87]:
df_train

Unnamed: 0,User-ID,ISBN,Book-Rating
0,8,0002005018,5
1,8,1881320189,7
2,8,1575663937,6
3,8,074322678X,5
4,8,1552041778,5
...,...,...,...
995,850,3426701448,8
996,850,3499237016,9
997,850,3746611229,3
998,850,3893850589,7


In [65]:
df_topredict

Unnamed: 0,User-ID,ISBN,Book-Rating
0,17,0891075275,6
1,56,0679865691,9
2,114,0312953453,7
3,160,9727110843,8
4,160,9728579225,8
...,...,...,...
76772,278851,0843106743,7
76773,278851,067161746X,7
76774,278851,0439050006,5
76775,278851,1558531025,8


In [67]:
df_usr_whoratedtest= df_train[df_train['ISBN'].isin(df_topredict['ISBN'])]
df_usr_whoratedtest

Unnamed: 0,User-ID,ISBN,Book-Rating
0,8,0002005018,5
7,9,0452264464,6
13,16,0345402871,9
14,17,0891076182,3
15,17,0891075275,0
...,...,...,...
985,850,3499425394,0
987,850,3934254454,0
988,850,3453171977,0
989,850,3518371002,0


In [88]:
df_usr_whoratedtest_count= df_usr_whoratedtest.groupby('ISBN').agg(Num_ratings=('Book-Rating','count')).reset_index()
df_usr_whoratedtest_count.sort_values(by='Num_ratings', ascending=False).head(30)

Unnamed: 0,ISBN,Num_ratings
88,0316769487,3
23,0060976845,3
342,0679781587,2
60,0142001740,2
174,039914739X,2
26,0060977493,2
206,044023722X,2
310,0671021001,2
82,0316666343,2
89,0316776963,2


## Unfortunately there are only 15 books in the test dataset which have been rated more than once in the whole dataset. </br>
## As a result we cannot compare the result of cosine similarity with matrix factorization in the test dataset.
## For future work we can split the train and test in such a way that we can compare the mean squared errors from matrix factorization and user-user collaborative filtering

In [69]:
df_sample= df_train[df_train['User-ID'].isin(df_usr_whoratedtest['User-ID'])]
df_sample

Unnamed: 0,User-ID,ISBN,Book-Rating
0,8,0002005018,5
1,8,1881320189,7
2,8,1575663937,6
3,8,074322678X,5
4,8,1552041778,5
...,...,...,...
995,850,3426701448,8
996,850,3499237016,9
997,850,3746611229,3
998,850,3893850589,7


In [70]:
user_book_mat= df_sample.pivot(index='User-ID',columns='ISBN',values='Book-Rating' ).fillna(0)

In [71]:
user_book_mat

ISBN,0002005018,0006510345,0007100221,0007123817,0020427859,0026329859,0030096189,0060006048,0060083298,0060164662,...,9728515227,9728579225,9729720495,972975831X,9729817715,9729852405,9748440537,9770390107900,9780590962735,9781562477547
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
828,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
def rating(user_id, isbn, k): #k i number of neighbors in KNN
    model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=k, n_jobs=-1) # fit the dataset
    
    def make_recommendation(model_knn, userId, user_book):
        user_idx= user_book.loc[userId]
        model_knn.fit(user_book)
        distances, indices = model_knn.kneighbors([user_idx])
        for i in indices:
            user_list=user_book.iloc[i].index
        raw_recommends =sorted(list(zip(user_list.tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
        return raw_recommends
    rating=0
    sum_similarities=0
    neigbors= make_recommendation(model_knn, user_id, user_book_mat[(user_book_mat[isbn]!=0) | (user_book_mat.index==user_id)])
    for j in range(len(neighbors):
        sum_similarities+= neigbors[j][1]
    for i in range(len(neigbors)):
        rating+= neigbors[i][1]* user_book_mat.loc[neigbors[i][0]][isbn]
    return (rating/sum_similarities)

# Example

In [94]:
print("The user-user based collaborative filtering rating for User 17 on the book 0891075275 using 2 neighbors is", rating(19, '0316769487', 2))

The user-user based collaborative filtering rating for User 17 on the book 0891075275 using 2 neighbors is 9.0


In [95]:
print("The acutal rating by this user on the given book as can be seen in the topredict dataframe is 6.0")

The acutal rating by this user on the given book as can be seen in the topredict dataframe is 6.0
