In [None]:
# Importing relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from google.colab import drive
!pip install scikit-surprise
from surprise.model_selection import GridSearchCV
from surprise import SVD
from collections import defaultdict
from surprise import Dataset
from surprise import Reader
drive.mount('/content/drive')

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 4.8 MB/s 
Building wheels for collected packages: scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
%cd drive/MyDrive/Colab Notebooks/Masters Python Workbooks

# Reading necessary files
books = pd.read_csv("Books.csv")
ratings = pd.read_csv("Ratings.csv")
users = pd.read_csv("Users.csv")

# Every repair below assigns its result back to the column. A column-level
# `inplace=True` call (books["col"].fillna(..., inplace=True)) mutates a
# temporary Series once pandas Copy-on-Write is active and leaves the frame
# untouched -- silently here, because warnings are suppressed in this notebook.

# NOTE(review): missing authors are filled with what reads like a publisher
# name; the value is kept from the original analysis -- confirm it is intended.
books["Book-Author"] = books["Book-Author"].fillna("Edinburgh Financial Publishing")

# We can replace the publisher name in these cases:
#   - 'Tyrant Moon': 'Mundania Press LLC' as per Amazon and FantasticFiction.com
#   - 'Finders Keepers': 'Bantam' as per GoodReads.com and Amazon
# Each fix is restricted to its own title. A blanket fillna would hand the
# first publisher to every missing row and turn the second fill into a no-op.
PUBLISHER_FIXES = {
    "Tyrant Moon": "Mundania Press LLC",
    "Finders Keepers": "Bantam",
}
for fix_title, fix_publisher in PUBLISHER_FIXES.items():
    missing_for_title = books["Publisher"].isna() & (books["Book-Title"] == fix_title)
    books.loc[missing_for_title, "Publisher"] = fix_publisher

# Replacing the 0 aged users with nans so that they can be replaced in the next step
users["Age"] = users["Age"].replace(0, np.nan)

# Linear interpolation of age in the dataset (forward direction only)
users["Age"] = users["Age"].interpolate(method="linear", limit_direction="forward")

# Dropping the rows with age > 100 (rows whose age is still NaN fail the
# comparison and are dropped as well)
users = users[users["Age"] <= 100]

# Outer joins keep books without ratings and ratings without a known book/user;
# those incomplete rows are removed by the dropna further down.
book_rating = pd.merge(books, ratings, on="ISBN", how="outer")
book_rating = pd.merge(book_rating, users, on="User-ID", how="outer")
book_rating = book_rating.drop(columns=["Image-URL-S", "Image-URL-M", "Image-URL-L"])

# Number of (non-null) ratings per user and per book, attached to every row.
temp = book_rating[["ISBN", "User-ID", "Book-Rating"]]
counts_user = temp.groupby("User-ID")["Book-Rating"].count().to_frame("rating_users_ct")
counts_book = temp.groupby("ISBN")["Book-Rating"].count().to_frame("rating_books_ct")
book_rating = pd.merge(book_rating, counts_book, on="ISBN", how="left")
book_rating = pd.merge(book_rating, counts_user, on="User-ID", how="left")

# Split "city, state, country" once and reuse the pieces.
location_parts = book_rating["Location"].str.split(",", expand=True)
book_rating["City"] = location_parts[0]
book_rating["State"] = location_parts[1]
book_rating["Country"] = location_parts[2]
book_rating = book_rating.drop(columns="Location")
df = book_rating.copy()

# Rows whose year column holds a publisher name instead of a year are removed.
index_year = df[(df["Year-Of-Publication"] == "DK Publishing Inc") | (df["Year-Of-Publication"] == "Gallimard")].index
df = df.drop(index_year)

# Dropping every row with a missing value in any column. This covers rows
# without ISBN, Book-Title or User-ID (no result is possible without them) as
# well as the rows with missing location parts.
df = df.dropna().copy()
# df.info()
# pd.isnull(df).sum()
df["Year-Of-Publication"] = df["Year-Of-Publication"].astype(int)

# One of the city names is '&#304;stanbul' which will be replaced with 'Istanbul'
df["City"] = df["City"].replace("&#304;stanbul", "Istanbul")
df[df["Book-Title"] == "Wild Animus"]["ISBN"].value_counts()

# Books whose rating count falls in this inclusive band are treated as the
# "most popular" titles and excluded from the Part A dataset.
POPULAR_MIN_RATINGS = 568
POPULAR_MAX_RATINGS = 2502.0
most_popular_books = df[
    (df["rating_books_ct"] >= POPULAR_MIN_RATINGS)
    & (df["rating_books_ct"] <= POPULAR_MAX_RATINGS)
]
most_popular_books = most_popular_books[["ISBN", "Book-Title", "Book-Author", "Publisher", "rating_books_ct"]]
most_popular_books = most_popular_books.drop_duplicates()
most_popular_books.shape

# NOTE: from here on `books` is a list of popular titles, no longer the books table.
books = list(most_popular_books["Book-Title"].values)
temp = df.copy()
df = temp[~temp["Book-Title"].isin(books)]

# df is the dataset to use for Part A

/content/drive/MyDrive/Colab Notebooks/Masters Python Workbooks


0971880107    2498
Name: ISBN, dtype: int64

# Collaborative filtering
## PART A: User Similarity

In [None]:
# df['Book-Rating'] = df['Book-Rating'].apply(lambda x: 0 if x>=1 else 1)

def _prepare_chunk(chunk):
    """Return an independent, de-duplicated copy of `chunk` with binarised ratings.

    Ratings >= 1 become 0 and everything else becomes 1.
    NOTE(review): this encoding maps an explicit rating to 0 and a rating of 0
    to 1; it is kept exactly as in the original analysis -- confirm the
    direction is intended.
    """
    # drop_duplicates() returns a new frame, so its result has to be kept.
    # The copy makes the column assignment below act on the chunk itself
    # rather than on a slice of `df`.
    prepared = chunk.drop_duplicates().copy()
    prepared['Book-Rating'] = np.where(prepared['Book-Rating'] >= 1, 0, 1)
    return prepared


# Split `df` ONCE into 50 consecutive, near-equal row chunks.
# The helper keeps its working variables local, so no stray global ends up
# bound to a chunk (get_df_name looks chunks up by identity in globals()).
l = [_prepare_chunk(chunk) for chunk in np.array_split(df, 50)]

# The individual names are kept because later cells refer to them directly.
(
    df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10,
    df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20,
    df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30,
    df_31, df_32, df_33, df_34, df_35, df_36, df_37, df_38, df_39, df_40,
    df_41, df_42, df_43, df_44, df_45, df_46, df_47, df_48, df_49, df_50,
) = l

# One "(rows, columns)" line per chunk.
print(*(chunk.shape for chunk in l), sep="\n")

(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20564, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)
(20563, 13)


### Function to compute the user-similarity RMSE for a chunk

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from numpy import *

def get_df_name(df):
    """Return the name of the first module-level variable bound to `df`.

    The lookup is by identity (`is`) over this module's globals(), in the
    order the names were first created, so when several globals refer to the
    same object the earliest-created name wins.

    Parameters
    ----------
    df : object
        The object (in this notebook, a DataFrame chunk) to look up.

    Returns
    -------
    str
        The matching global name, or "<unnamed>" when no global refers to the
        object (instead of failing with an IndexError). The result is only
        used as a label in printed messages.
    """
    # Iterate over a snapshot so the lookup cannot be disturbed if globals()
    # changes while it is being scanned.
    for name, value in list(globals().items()):
        if value is df:
            return name
    return "<unnamed>"

def user_similarity(df_features, original_df):
    """Score user-based collaborative filtering on one chunk and report its RMSE.

    Parameters
    ----------
    df_features : pandas.DataFrame
        User x ISBN rating matrix (rows are users), as produced by the
        `pivot_table(..., fill_value=0)` calls in this notebook.
    original_df : pandas.DataFrame
        The long-format chunk with 'User-ID', 'ISBN' and 'Book-Rating'
        columns that `df_features` was built from.

    Returns
    -------
    float
        The RMSE, which is also printed with the chunk's variable name.
        The value is NaN when no prediction is positive.
    """
    # Centre every user's row on that user's mean rating.
    mean = np.nanmean(df_features, axis=1)
    df_subtracted = (df_features.T - mean).T

    # User Similarity Matrix: cosine similarity of the centred rows,
    # with undefined and negative similarities set to 0.
    user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
    user_correlation[np.isnan(user_correlation)] = 0
    user_correlation[user_correlation < 0] = 0

    # Similarity-weighted sum of ratings, then masked by the ratings matrix
    # itself so only cells with a non-zero rating keep a prediction.
    user_predicted_ratings = np.dot(user_correlation, df_features.fillna(0))
    user_final_rating = np.multiply(user_predicted_ratings, df_features)

    # Keep positive predictions only (others become NaN) and rescale them,
    # column by column, to the 1-5 range.
    X = user_final_rating.copy()
    X = X[X > 0]
    scaler = MinMaxScaler(feature_range=(1, 5))
    y = scaler.fit_transform(X)

    df_ = original_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
    # Finding total non-NaN value
    total_non_nan = np.count_nonzero(~np.isnan(y))

    # Explicit NaN-skipping NumPy reduction: a bare sum(sum(...)) only works
    # while `from numpy import *` shadows the built-in sum.
    squared_error = (df_ - y) ** 2
    rmse = (np.nansum(squared_error.to_numpy()) / total_non_nan) ** 0.5
    print("{} RMSE: {}".format(get_df_name(original_df), rmse))
    return rmse

In [None]:
# User x ISBN rating matrices for chunks 1-30: one row per user, one column
# per ISBN, unrated cells filled with 0.
(
    df_features_1, df_features_2, df_features_3, df_features_4, df_features_5,
    df_features_6, df_features_7, df_features_8, df_features_9, df_features_10,
    df_features_11, df_features_12, df_features_13, df_features_14, df_features_15,
    df_features_16, df_features_17, df_features_18, df_features_19, df_features_20,
    df_features_21, df_features_22, df_features_23, df_features_24, df_features_25,
    df_features_26, df_features_27, df_features_28, df_features_29, df_features_30,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
    for chunk in (
        df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10,
        df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20,
        df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30,
    )
)

In [None]:
# User x ISBN rating matrices for chunks 31-50: one row per user, one column
# per ISBN, unrated cells filled with 0.
(
    df_features_31, df_features_32, df_features_33, df_features_34, df_features_35,
    df_features_36, df_features_37, df_features_38, df_features_39, df_features_40,
    df_features_41, df_features_42, df_features_43, df_features_44, df_features_45,
    df_features_46, df_features_47, df_features_48, df_features_49, df_features_50,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
    for chunk in (
        df_31, df_32, df_33, df_34, df_35, df_36, df_37, df_38, df_39, df_40,
        df_41, df_42, df_43, df_44, df_45, df_46, df_47, df_48, df_49, df_50,
    )
)

In [None]:
# User-similarity RMSE for chunks 1-10, in order; each call prints one line.
for _user_matrix, _rated_chunk in zip(
    (df_features_1, df_features_2, df_features_3, df_features_4, df_features_5,
     df_features_6, df_features_7, df_features_8, df_features_9, df_features_10),
    (df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10),
):
    user_similarity(_user_matrix, _rated_chunk)

df_1 RMSE: 1.0867928844751722
df_2 RMSE: 1.338711061261666
df_3 RMSE: 1.3942479993082806
df_4 RMSE: 1.443681043816436
df_5 RMSE: 1.4535007814712408
df_6 RMSE: 1.2498242468163274
df_7 RMSE: 1.348009403045302
df_8 RMSE: 1.2175337071870456
df_9 RMSE: 1.219016808231058
df_10 RMSE: 1.2353362645954076


In [None]:
# User-similarity RMSE for chunks 11-20, in order; each call prints one line.
for _user_matrix, _rated_chunk in zip(
    (df_features_11, df_features_12, df_features_13, df_features_14, df_features_15,
     df_features_16, df_features_17, df_features_18, df_features_19, df_features_20),
    (df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20),
):
    user_similarity(_user_matrix, _rated_chunk)

df_11 RMSE: 1.3038826065187072
df_12 RMSE: 1.3823270237475473
df_13 RMSE: 1.2408821701789545
df_14 RMSE: 1.320816675706597
df_15 RMSE: 1.148376748686313
df_16 RMSE: 1.2963739024108818
df_17 RMSE: 1.3151657404072974
df_18 RMSE: 1.273618273960394
df_19 RMSE: 1.3686657251652372
df_20 RMSE: 1.2423288927668952


In [None]:
# User-similarity RMSE for chunks 21-30, in order; each call prints one line.
for _user_matrix, _rated_chunk in zip(
    (df_features_21, df_features_22, df_features_23, df_features_24, df_features_25,
     df_features_26, df_features_27, df_features_28, df_features_29, df_features_30),
    (df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30),
):
    user_similarity(_user_matrix, _rated_chunk)

df_21 RMSE: 1.3273711924783087
df_22 RMSE: 1.2650185064439712
df_23 RMSE: 1.4234479137177127
df_24 RMSE: 1.471626683805672
df_25 RMSE: 1.3768921139334296
df_26 RMSE: 1.3904673047192524
df_27 RMSE: 1.2080065611133137
df_28 RMSE: 1.2650205287671519
df_29 RMSE: 1.2689852437264435
df_30 RMSE: 1.288442635628795


In [None]:
# User-similarity RMSE for chunks 31-40, in order; each call prints one line.
for _user_matrix, _rated_chunk in zip(
    (df_features_31, df_features_32, df_features_33, df_features_34, df_features_35,
     df_features_36, df_features_37, df_features_38, df_features_39, df_features_40),
    (df_31, df_32, df_33, df_34, df_35, df_36, df_37, df_38, df_39, df_40),
):
    user_similarity(_user_matrix, _rated_chunk)

df_31 RMSE: 1.4592453514745114
df_32 RMSE: 1.3738045749371355
df_33 RMSE: 1.3955847472745437
df_34 RMSE: 1.3718707397868652
df_35 RMSE: 1.2426070413509545
df_36 RMSE: 1.3251009421414397
df_37 RMSE: 1.2947687585094085
df_38 RMSE: 1.3875369331766174
df_39 RMSE: 1.2743943393667985
df_40 RMSE: 1.2998935079776543


In [None]:
# User-similarity RMSE for chunks 41-50, in order; each call prints one line.
for _user_matrix, _rated_chunk in zip(
    (df_features_41, df_features_42, df_features_43, df_features_44, df_features_45,
     df_features_46, df_features_47, df_features_48, df_features_49, df_features_50),
    (df_41, df_42, df_43, df_44, df_45, df_46, df_47, df_48, df_49, df_50),
):
    user_similarity(_user_matrix, _rated_chunk)

df_41 RMSE: 1.3166593768303179
df_42 RMSE: 1.4758284189209125
df_43 RMSE: 1.4459288148287504
df_44 RMSE: 1.4548764035686688
df_45 RMSE: 1.5295322193283072
df_46 RMSE: 1.581298451478824
df_47 RMSE: 1.510497565097399
df_48 RMSE: 1.329265471733706
df_49 RMSE: 0.9472196500730717
df_50 RMSE: 0.4853498825833991


### Item Similarity

In [None]:
def item_similarity(df_features, original_df):
    """Score item-based collaborative filtering on one chunk and report its RMSE.

    Parameters
    ----------
    df_features : pandas.DataFrame
        ISBN x user rating matrix (rows are books), i.e. the transposed
        `pivot_table(..., fill_value=0)` built in this notebook.
    original_df : pandas.DataFrame
        The long-format chunk with 'User-ID', 'ISBN' and 'Book-Rating'
        columns that `df_features` was built from.

    Returns
    -------
    float
        The RMSE, which is also printed with the chunk's variable name.
        The value is NaN when no prediction is positive.
    """
    # Centre every item's row on that item's mean rating.
    mean = np.nanmean(df_features, axis=1)
    df_subtracted = (df_features.T - mean).T

    # Item Similarity Matrix: cosine similarity of the centred rows,
    # with undefined and negative similarities set to 0.
    item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
    item_correlation[np.isnan(item_correlation)] = 0
    item_correlation[item_correlation < 0] = 0

    # (user x item) ratings times (item x item) similarities -> user x item scores.
    item_predicted_ratings = np.dot(df_features.fillna(0).T, item_correlation)

    # The user x ISBN pivot is built once: with NaN for unrated cells it is the
    # ground truth for the error, and zero-filled it masks the predictions.
    df_ = original_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
    dummy_df = df_.fillna(0)
    item_final_rating = np.multiply(item_predicted_ratings, dummy_df)

    # Keep positive predictions only (others become NaN) and rescale them,
    # column by column, to the 1-5 range.
    X = item_final_rating.copy()
    X = X[X > 0]
    scaler = MinMaxScaler(feature_range=(1, 5))
    y = scaler.fit_transform(X)

    # Finding total non-NaN value
    total_non_nan = np.count_nonzero(~np.isnan(y))

    # Explicit NaN-skipping NumPy reduction: a bare sum(sum(...)) only works
    # while `from numpy import *` shadows the built-in sum.
    squared_error = (df_ - y) ** 2
    rmse = (np.nansum(squared_error.to_numpy()) / total_non_nan) ** 0.5
    print("{} (Item) RMSE: {}".format(get_df_name(original_df), rmse))
    return rmse

In [None]:
# ISBN x user rating matrices for chunks 1-30 (the user x ISBN pivot,
# transposed), unrated cells filled with 0.
# NOTE: this rebinds the df_features_<k> names, which held the untransposed
# user x ISBN matrices in the user-similarity section.
(
    df_features_1, df_features_2, df_features_3, df_features_4, df_features_5,
    df_features_6, df_features_7, df_features_8, df_features_9, df_features_10,
    df_features_11, df_features_12, df_features_13, df_features_14, df_features_15,
    df_features_16, df_features_17, df_features_18, df_features_19, df_features_20,
    df_features_21, df_features_22, df_features_23, df_features_24, df_features_25,
    df_features_26, df_features_27, df_features_28, df_features_29, df_features_30,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10,
        df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20,
        df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30,
    )
)

In [None]:
# ISBN x user rating matrices for chunks 31-49 (the user x ISBN pivot,
# transposed), unrated cells filled with 0. Chunk 50 is handled in its own
# cell further down.
(
    df_features_31, df_features_32, df_features_33, df_features_34, df_features_35,
    df_features_36, df_features_37, df_features_38, df_features_39, df_features_40,
    df_features_41, df_features_42, df_features_43, df_features_44, df_features_45,
    df_features_46, df_features_47, df_features_48, df_features_49,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_31, df_32, df_33, df_34, df_35, df_36, df_37, df_38, df_39, df_40,
        df_41, df_42, df_43, df_44, df_45, df_46, df_47, df_48, df_49,
    )
)

In [None]:
# Item-similarity RMSE for chunks 1-20, in order; each call prints one line.
for _item_matrix, _rated_chunk in zip(
    (df_features_1, df_features_2, df_features_3, df_features_4, df_features_5,
     df_features_6, df_features_7, df_features_8, df_features_9, df_features_10,
     df_features_11, df_features_12, df_features_13, df_features_14, df_features_15,
     df_features_16, df_features_17, df_features_18, df_features_19, df_features_20),
    (df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10,
     df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19, df_20),
):
    item_similarity(_item_matrix, _rated_chunk)

df_1 (Item) RMSE: 1.0957111269079218
df_2 (Item) RMSE: 1.3969816882508723
df_3 (Item) RMSE: 1.4050075203222614
df_4 (Item) RMSE: 1.450065233793157
df_5 (Item) RMSE: 1.4790430160726842
df_6 (Item) RMSE: 1.268902460299449
df_7 (Item) RMSE: 1.3997121624089883
df_8 (Item) RMSE: 1.252398402628738
df_9 (Item) RMSE: 1.3263773746017584
df_10 (Item) RMSE: 1.2660459479459383
df_11 (Item) RMSE: 1.3120752919185945
df_12 (Item) RMSE: 1.3960103966766004
df_13 (Item) RMSE: 1.2259329777397203
df_14 (Item) RMSE: 1.2580007288097088
df_15 (Item) RMSE: 1.2697872967594381
df_16 (Item) RMSE: 1.3429617966903038
df_17 (Item) RMSE: 1.2884650593750877
df_18 (Item) RMSE: 1.258389737281222
df_19 (Item) RMSE: 1.3567201039637073
df_20 (Item) RMSE: 1.2882771150594816


In [None]:
# Item-similarity RMSE for chunks 21-49, in order; each call prints one line.
for _item_matrix, _rated_chunk in zip(
    (df_features_21, df_features_22, df_features_23, df_features_24, df_features_25,
     df_features_26, df_features_27, df_features_28, df_features_29, df_features_30,
     df_features_31, df_features_32, df_features_33, df_features_34, df_features_35,
     df_features_36, df_features_37, df_features_38, df_features_39, df_features_40,
     df_features_41, df_features_42, df_features_43, df_features_44, df_features_45,
     df_features_46, df_features_47, df_features_48, df_features_49),
    (df_21, df_22, df_23, df_24, df_25, df_26, df_27, df_28, df_29, df_30,
     df_31, df_32, df_33, df_34, df_35, df_36, df_37, df_38, df_39, df_40,
     df_41, df_42, df_43, df_44, df_45, df_46, df_47, df_48, df_49),
):
    item_similarity(_item_matrix, _rated_chunk)

df_21 (Item) RMSE: 1.3311369703571163
df_22 (Item) RMSE: 1.2811383200066955
df_23 (Item) RMSE: 1.3880864317876296
df_24 (Item) RMSE: 1.4827947617474742
df_25 (Item) RMSE: 1.414297009662524
df_26 (Item) RMSE: 1.4659104921952175
df_27 (Item) RMSE: 1.2715714016091464
df_28 (Item) RMSE: 1.337563961479184
df_29 (Item) RMSE: 1.3353788616817333
df_30 (Item) RMSE: 1.309767147500664
df_31 (Item) RMSE: 1.4621208363830476
df_32 (Item) RMSE: 1.3349630057691162
df_33 (Item) RMSE: 1.4169521360166266
df_34 (Item) RMSE: 1.3876019206663504
df_35 (Item) RMSE: 1.2150234531142343
df_36 (Item) RMSE: 1.3157477763377636
df_37 (Item) RMSE: 1.2041822709718948
df_38 (Item) RMSE: 1.2993691630562207
df_39 (Item) RMSE: 1.1655296716738912
df_40 (Item) RMSE: 1.2234475465298071
df_41 (Item) RMSE: 1.1726948066457357
df_42 (Item) RMSE: 1.215098138572984
df_43 (Item) RMSE: 1.2008117172861918
df_44 (Item) RMSE: 1.1851294430386135
df_45 (Item) RMSE: 1.1885210567514046
df_46 (Item) RMSE: 1.2284189739105402
df_47 (Item) RMS

In [None]:
# Chunk 50: build its ISBN x user matrix (zero-filled pivot, transposed)
# and score it with the item-based model.
df_features_50 = df_50.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
    fill_value=0,
).transpose()
item_similarity(df_features_50, df_50)

df_50 (Item) RMSE: 0.4961384314247441


### SVD with cross-validated grid search

In [None]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import Dataset
from surprise import Reader
reader = Reader(rating_scale=(1, 10))
def SVD(df):
    """Grid-search Surprise's SVD on one ratings chunk and return the best RMSE.

    NOTE(review): defining this function rebinds the module-level name `SVD`,
    hiding the `surprise.SVD` class until it is imported again. The name is
    kept for compatibility; consider renaming the function.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with 'User-ID', 'ISBN' and 'Book-Rating' columns. It is also
        looked up by identity to label the printed result.

    Returns
    -------
    float
        The best cross-validated RMSE found over the parameter grid.
    """
    # Inside this function the bare name `SVD` refers to the function itself,
    # so the algorithm class is imported under a private alias; otherwise the
    # grid search would try to instantiate this function.
    from surprise import SVD as _SVDAlgorithm

    data = Dataset.load_from_df(df[["User-ID", "ISBN", "Book-Rating"]], reader)
    param_grid = {
        "n_epochs": [1, 5, 10, 15, 20],
        "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
        "reg_all": [0.2, 0.4, 0.6, 1]
    }
    gs = GridSearchCV(_SVDAlgorithm, param_grid, measures=["rmse"], cv=3)
    gs.fit(data)

    best_rmse = gs.best_score["rmse"]
    # Two placeholders: one for the chunk label, one for the score.
    print("RMSE best score of {}: {}".format(get_df_name(df), best_rmse))
    return best_rmse

In [None]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV

# NOTE(review): the reader declares a 1-10 rating scale, while the chunking
# cell recodes 'Book-Rating' to 0/1 -- confirm the scale is intended.
reader = Reader(rating_scale=(1, 10))

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 41 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_41[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7100313526834312


In [None]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 42 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_42[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.712084399657669


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 43 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_43[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7259853706157188


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 44 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_44[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7359315754796772


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 45 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_45[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7352736907017245


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 46 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_46[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7303131064444165


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 47 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_47[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7526943133967046


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 48 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_48[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7596688158180389


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 49 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_49[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7584856980143192


In [None]:
from surprise.model_selection import GridSearchCV

# SVD search grid: epoch counts, learning rates and regularisation strengths.
param_grid = dict(
    n_epochs=[1, 5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.009, 0.01, 0.05],
    reg_all=[0.2, 0.4, 0.6, 1],
)

# Chunk 50 as a Surprise dataset (user, item, rating columns).
data = Dataset.load_from_df(df_50[["User-ID", "ISBN", "Book-Rating"]], reader)

# 3-fold cross-validated grid search scored by RMSE; report the best score.
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)
print(gs.best_score["rmse"])

0.7774521578012727
