In [None]:
# Colab setup: install the `surprise` recommender package into the runtime.
# NOTE(review): the install is unpinned, so results may differ between runs.
!pip install scikit-surprise
# NOTE(review): GridSearchCV, SVD, Dataset and Reader are not referenced in the
# part of the notebook visible here — confirm they are used further down.
from surprise.model_selection import GridSearchCV
from surprise import SVD

from surprise import Dataset
from surprise import Reader

from collections import defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silences every warning category for the whole session, including pandas /
# NumPy deprecation and runtime (e.g. divide-by-zero) warnings.
warnings.filterwarnings("ignore")
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
# Colab-only: mount Google Drive so the CSV files can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/Colab Notebooks/Masters Python Workbooks/Redone Python Workbooks
books = pd.read_csv("Books.csv")
ratings = pd.read_csv("Ratings.csv")
users = pd.read_csv("Users.csv")
books["Book-Author"].fillna("Edinburgh Financial Publishing",inplace=True)
# We can replace the publisher name in these cases as well

# The publisher of 'Tyrant Moon' as per Amazon and FantasticFiction.com is 'Mundania Press LLC'
books['Publisher'].fillna("Mundania Press LLC", inplace=True)

# The publisher of 'Finders Keepers' as per GoodReads.com and Amazon is 'Bantam'
books['Publisher'].fillna("Bantam", inplace=True)

# Replacing the 0 aged users with nans so that they can be replaced in the next step
users.Age.replace(0,np.nan,inplace=True)

# Linear Interpolation of age in the dataset
users['Age'].interpolate(method = 'linear', limit_direction = 'forward',inplace=True)

# Dropping the rows with age > 100
users = users[users.Age <= 100]
# Outer joins keep books without ratings and ratings whose book or user is missing.
book_rating = books.merge(ratings, on='ISBN', how='outer')
book_rating = book_rating.merge(users, on='User-ID', how='outer')
book_rating = book_rating.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

# Number of non-null ratings per user and per book, attached to every row.
temp = book_rating[['ISBN', 'User-ID', 'Book-Rating']]
counts_user = temp.groupby('User-ID')['Book-Rating'].count().to_frame('rating_users_ct')
counts_book = temp.groupby('ISBN')['Book-Rating'].count().to_frame('rating_books_ct')
book_rating = book_rating.merge(counts_book, on='ISBN', how='left')
book_rating = book_rating.merge(counts_user, on='User-ID', how='left')

# The first three comma-separated fields of Location become City, State and
# Country; the text is kept exactly as split (no whitespace stripping).
location_parts = book_rating['Location'].str.split(",", expand=True)
book_rating['City'] = location_parts[0]
book_rating['State'] = location_parts[1]
book_rating['Country'] = location_parts[2]
book_rating = book_rating.drop(columns='Location')
df = book_rating.copy()

# Two rows groups carry a publisher name in Year-Of-Publication; drop them so the
# column can be cast to int below.
index_year = df[df['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])].index
df = df.drop(index_year)
df = df.dropna(subset=['ISBN'])
df = df.dropna(subset=['Book-Title'])

# Dropping null values in User-ID as there could be no results if the user ID is not present
df = df.dropna(subset=['User-ID'])

# Drop every row that still has a missing value in ANY column (this also covers
# the location-derived columns).
df = df.dropna()
# df.info()
# pd.isnull(df).sum()
df['Year-Of-Publication'] = df['Year-Of-Publication'].astype(int)

# One of the city names is '&#304;stanbul' which will be replaced with 'Istanbul'.
# Assign the result back: replace(inplace=True) on a column selection is chained
# in-place mutation and does not update `df` under pandas Copy-on-Write.
df['City'] = df['City'].replace('&#304;stanbul', 'Istanbul')

# NOTE: this expression is not the last statement of the cell, so its result is
# computed and discarded without being displayed.
df[df['Book-Title'] == 'Wild Animus']['ISBN'].value_counts()

# Books whose rating count lies in [568, 2502], one row per distinct
# (ISBN, title, author, publisher, count) combination.
most_popular_books = df[(df['rating_books_ct'] >= 568) & (df['rating_books_ct'] <= 2502.0)]
most_popular_books = most_popular_books[['ISBN','Book-Title','Book-Author','Publisher','rating_books_ct']]
most_popular_books = most_popular_books.drop_duplicates()
# books = list(most_popular_books['Book-Title'].values)
# temp = df.copy()
# df = temp[~temp['Book-Title'].isin(books)]
label_encoder = LabelEncoder()
# Encode on a copy so that `df` keeps the original categorical values.
df1 = df.copy()
# NOTE(review): `columns` is not read anywhere in the visible code — confirm before removing.
columns = ['Book-Author','Book-Title','Year-Of-Publication','Publisher','State','Country']

# The publication year is label-encoded as a string, not as a number.
df1['Year-Of-Publication'] = df1['Year-Of-Publication'].astype(str)

# The single encoder is re-fitted for each column in turn, so after the loop it
# only remembers the classes of the last column ('Country').
for encoded_col in ('City', 'State', 'Book-Author', 'Book-Title',
                    'Year-Of-Publication', 'Publisher', 'Country'):
    df1[encoded_col] = label_encoder.fit_transform(df1[encoded_col])

# Feature matrix for clustering: the label-encoded columns plus the numeric ones,
# all standardised.  The scaled frame gets a fresh 0..n-1 index.
feature_columns = ['Book-Title','Book-Author','Year-Of-Publication','Publisher','Book-Rating',
                   'Age','rating_books_ct','rating_users_ct','City','State','Country']
numerical_df = df1[feature_columns]
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(numerical_df), columns=feature_columns)


import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit K-Means with k = 6 and k = 8 on the scaled features and attach the labels
# to a copy of the unscaled frame.  `kmeans` ends up holding the k = 8 model.
cluster_frames = {}
for n_clusters in (6, 8):
    kmeans = KMeans(n_clusters=n_clusters, max_iter=50, random_state=0)
    kmeans.fit(df_scaled)
    labelled = df.copy()
    # Labels are assigned by position; df_scaled has one row per row of df.
    labelled['cluster_id'] = kmeans.labels_
    cluster_frames[n_clusters] = labelled

df_6 = cluster_frames[6]
df_8 = cluster_frames[8]
# One slice per raw K-Means label.
clusters_8 = [df_8[df_8['cluster_id'] == cluster_id] for cluster_id in range(8)]
clusters_6 = [df_6[df_6['cluster_id'] == cluster_id] for cluster_id in range(6)]

# Raw slices that are not re-bound below keep their original names.
df_8_5, df_8_6, df_8_7 = clusters_8[5], clusters_8[6], clusters_8[7]
df_6_4, df_6_5 = clusters_6[4], clusters_6[5]

# Context aware 6 clusters -> four groups:
#   df_6_0 = labels 0 + 5, df_6_1 = labels 1 + 3, df_6_2 = label 2, df_6_3 = label 4
df_6_0 = pd.concat([clusters_6[0], clusters_6[5]]).copy()
df_6_1 = pd.concat([clusters_6[1], clusters_6[3]]).copy()
df_6_2 = clusters_6[2].copy()
df_6_3 = clusters_6[4].copy()

# Context aware 8 clusters -> five groups:
#   df_8_0 = label 0, df_8_1 = label 1, df_8_2 = labels 2 + 4,
#   df_8_3 = labels 3 + 6 + 7, df_8_4 = label 5
# Original author's note: 29.36% of data from cluster ID = 6 is overlapping with
# cluster 7, hence merging this as well.
df_8_0 = clusters_8[0].copy()
df_8_1 = clusters_8[1].copy()
df_8_2 = pd.concat([clusters_8[2], clusters_8[4]]).copy()
df_8_3 = pd.concat([clusters_8[3], clusters_8[6], clusters_8[7]]).copy()
df_8_4 = clusters_8[5].copy()


def _flag_unrated(rating):
    """Return 0 for an explicit rating (>= 1) and 1 otherwise."""
    # NOTE(review): rated rows become 0 and unrated rows become 1 — confirm this
    # polarity is intended; it is kept exactly as in the original lambda.
    return 0 if rating >= 1 else 1


# 8-cluster groups: binarise the rating, then remove exact duplicate rows.
# drop_duplicates() returns a new frame, so the previous bare calls discarded
# their result and removed nothing; inplace=True applies it to each frame (all
# of them are independent copies).
for cluster_frame in (df_8_0, df_8_1, df_8_2, df_8_3, df_8_4):
    cluster_frame['Book-Rating'] = cluster_frame['Book-Rating'].apply(_flag_unrated)
    cluster_frame.drop_duplicates(inplace=True)

# 6-cluster groups: same treatment, then report the resulting shapes.
for cluster_frame in (df_6_0, df_6_1, df_6_2, df_6_3):
    cluster_frame['Book-Rating'] = cluster_frame['Book-Rating'].apply(_flag_unrated)
    cluster_frame.drop_duplicates(inplace=True)
    print(cluster_frame.shape)

/content/drive/MyDrive/Colab Notebooks/Masters Python Workbooks/Redone Python Workbooks
(460303, 14)
(529232, 14)
(3789, 14)
(34835, 14)


# Collaborative filtering
## PART C: User Similarity

## Clusters: 6 context aware

In [None]:
# Row/column counts of the four 6-cluster groups.
for shape_label, shape_frame in (("df_6_0", df_6_0), ("df_6_1", df_6_1),
                                 ("df_6_2", df_6_2), ("df_6_3", df_6_3)):
    print(shape_label + ": ", shape_frame.shape)

df_6_0:  (460303, 14)
df_6_1:  (529232, 14)
df_6_2:  (3789, 14)
df_6_3:  (34835, 14)


In [None]:
# Split the larger 6-cluster groups into consecutive row chunks.  Each frame is
# split ONCE and the pieces are unpacked into the names used by the cells below;
# previously np.array_split was re-run in full for every single chunk.
# NOTE(review): chunks are cut by row position, so one user's ratings can end up
# in different chunks — confirm that is acceptable for the per-chunk similarity.

# df_6_1
df_6_1_chunks = np.array_split(df_6_1, 25)
(df_6_1_1, df_6_1_2, df_6_1_3, df_6_1_4, df_6_1_5,
 df_6_1_6, df_6_1_7, df_6_1_8, df_6_1_9, df_6_1_10,
 df_6_1_11, df_6_1_12, df_6_1_13, df_6_1_14, df_6_1_15,
 df_6_1_16, df_6_1_17, df_6_1_18, df_6_1_19, df_6_1_20,
 df_6_1_21, df_6_1_22, df_6_1_23, df_6_1_24, df_6_1_25) = df_6_1_chunks

# df_6_0
df_6_0_chunks = np.array_split(df_6_0, 25)
(df_6_0_1, df_6_0_2, df_6_0_3, df_6_0_4, df_6_0_5,
 df_6_0_6, df_6_0_7, df_6_0_8, df_6_0_9, df_6_0_10,
 df_6_0_11, df_6_0_12, df_6_0_13, df_6_0_14, df_6_0_15,
 df_6_0_16, df_6_0_17, df_6_0_18, df_6_0_19, df_6_0_20,
 df_6_0_21, df_6_0_22, df_6_0_23, df_6_0_24, df_6_0_25) = df_6_0_chunks

# df_6_3
df_6_3_chunks = np.array_split(df_6_3, 4)
df_6_3_1, df_6_3_2, df_6_3_3, df_6_3_4 = df_6_3_chunks

### Helper functions that compute the user-similarity predictions and report their RMSE

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
from numpy import *

def get_df_name(df):
    """Return the name of a module-level variable that is bound to ``df``.

    The lookup compares by identity (``is``) against this module's globals and
    returns the first match in namespace insertion order.  Names that start
    with an underscore (for example IPython's ``_`` output cache) are only
    used when no other name refers to the object.

    Returns the placeholder ``"<unknown>"`` when no global refers to ``df``
    instead of raising ``IndexError``.
    """
    # Snapshot the items so the namespace cannot change size while we scan it.
    matches = [name for name, value in list(globals().items()) if value is df]
    if not matches:
        return "<unknown>"
    public_matches = [name for name in matches if not name.startswith("_")]
    return (public_matches or matches)[0]

def user_similarity(df_features, original_df):
  """Score a user-based collaborative-filtering pass and print its RMSE.

  Parameters
  ----------
  df_features : pandas.DataFrame
      User x ISBN matrix of ratings (the callers build it with
      ``pivot_table(..., fill_value=0)``).
  original_df : pandas.DataFrame
      Long-format ratings with 'User-ID', 'ISBN' and 'Book-Rating' columns.
      It is pivoted here without a fill value and used as the reference.

  Returns
  -------
  float
      The RMSE that is also printed; ``nan`` when there is no positive
      predicted score to evaluate.

  Notes
  -----
  The printed label comes from ``get_df_name(original_df)``, so
  ``original_df`` should be bound to a module-level name.
  """
  # Centre each user's row on that user's mean rating.
  user_means = np.nanmean(df_features, axis=1)
  df_subtracted = (df_features.T - user_means).T

  # User similarity matrix: cosine similarity, with NaN and negative values set to 0.
  user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
  user_correlation[np.isnan(user_correlation)] = 0
  user_correlation[user_correlation < 0] = 0

  # Similarity-weighted scores, multiplied element-wise by the input matrix.
  user_predicted_ratings = np.dot(user_correlation, df_features.fillna(0))
  user_final_rating = np.multiply(user_predicted_ratings, df_features)

  # Keep strictly positive scores (everything else becomes NaN) and rescale
  # each column to the 1..5 range; NaN entries stay NaN.
  positive_scores = user_final_rating[user_final_rating > 0]
  scaler = MinMaxScaler(feature_range=(1, 5))
  y = scaler.fit_transform(positive_scores)

  df_ = original_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')
  # Finding total non-NaN value
  total_non_nan = np.count_nonzero(~np.isnan(y))
  if total_non_nan == 0:
    # Nothing to evaluate: report nan explicitly instead of dividing 0 by 0.
    rmse = float("nan")
  else:
    # Explicit pandas reductions (column sums, then their total, NaN skipped).
    # The previous sum(sum(...)) only worked because `from numpy import *`
    # replaced the builtin `sum`.
    squared_error = (df_ - y) ** 2
    rmse = (squared_error.sum(axis=0).sum() / total_non_nan) ** 0.5
  print("The RMSE of {}: {}".format(get_df_name(original_df), rmse))
  return rmse

In [None]:
# User x ISBN rating matrix for every df_6_0 chunk; missing pairs are filled with 0.
(df_6_features_0_1, df_6_features_0_2, df_6_features_0_3, df_6_features_0_4, df_6_features_0_5,
 df_6_features_0_6, df_6_features_0_7, df_6_features_0_8, df_6_features_0_9, df_6_features_0_10,
 df_6_features_0_11, df_6_features_0_12, df_6_features_0_13, df_6_features_0_14, df_6_features_0_15,
 df_6_features_0_16, df_6_features_0_17, df_6_features_0_18, df_6_features_0_19, df_6_features_0_20,
 df_6_features_0_21, df_6_features_0_22, df_6_features_0_23, df_6_features_0_24, df_6_features_0_25) = [
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
    for chunk in (
        df_6_0_1, df_6_0_2, df_6_0_3, df_6_0_4, df_6_0_5,
        df_6_0_6, df_6_0_7, df_6_0_8, df_6_0_9, df_6_0_10,
        df_6_0_11, df_6_0_12, df_6_0_13, df_6_0_14, df_6_0_15,
        df_6_0_16, df_6_0_17, df_6_0_18, df_6_0_19, df_6_0_20,
        df_6_0_21, df_6_0_22, df_6_0_23, df_6_0_24, df_6_0_25,
    )
]

In [None]:
# Score every df_6_0 chunk in order; user_similarity prints one RMSE line per chunk.
# The loop variables are deliberately new names: the printed label is the first
# global bound to the chunk, which must remain the original df_6_0_<n> name.
for feature_matrix, rating_chunk_df in zip(
    (df_6_features_0_1, df_6_features_0_2, df_6_features_0_3, df_6_features_0_4, df_6_features_0_5,
     df_6_features_0_6, df_6_features_0_7, df_6_features_0_8, df_6_features_0_9, df_6_features_0_10,
     df_6_features_0_11, df_6_features_0_12, df_6_features_0_13, df_6_features_0_14, df_6_features_0_15,
     df_6_features_0_16, df_6_features_0_17, df_6_features_0_18, df_6_features_0_19, df_6_features_0_20,
     df_6_features_0_21, df_6_features_0_22, df_6_features_0_23, df_6_features_0_24, df_6_features_0_25),
    (df_6_0_1, df_6_0_2, df_6_0_3, df_6_0_4, df_6_0_5,
     df_6_0_6, df_6_0_7, df_6_0_8, df_6_0_9, df_6_0_10,
     df_6_0_11, df_6_0_12, df_6_0_13, df_6_0_14, df_6_0_15,
     df_6_0_16, df_6_0_17, df_6_0_18, df_6_0_19, df_6_0_20,
     df_6_0_21, df_6_0_22, df_6_0_23, df_6_0_24, df_6_0_25),
):
    user_similarity(feature_matrix, rating_chunk_df)

The RMSE of df_6_0_1: 1.5600426824844165
The RMSE of df_6_0_2: 1.6437456165569313
The RMSE of df_6_0_3: 1.4204183179257062
The RMSE of df_6_0_4: 1.5342355013486995
The RMSE of df_6_0_5: 1.524971371255978
The RMSE of df_6_0_6: 1.527420605890305
The RMSE of df_6_0_7: 1.5728275886969532
The RMSE of df_6_0_8: 1.6080350211568504
The RMSE of df_6_0_9: 1.6009820012699536
The RMSE of df_6_0_10: 1.5236631368451201
The RMSE of df_6_0_11: 1.62591713854495
The RMSE of df_6_0_12: 1.6116535267802259
The RMSE of df_6_0_13: 1.7093311350481808
The RMSE of df_6_0_14: 1.8453832946228028
The RMSE of df_6_0_15: 1.0360183369523701
The RMSE of df_6_0_16: 1.2405879777689293
The RMSE of df_6_0_17: 1.122138566009512
The RMSE of df_6_0_18: 1.1656173658749371
The RMSE of df_6_0_19: 1.2910190521295848
The RMSE of df_6_0_20: 1.2703091384640566
The RMSE of df_6_0_21: 1.356313710965314
The RMSE of df_6_0_22: 1.2854901999725696
The RMSE of df_6_0_23: 1.403965417632597
The RMSE of df_6_0_24: 1.4201799857187005
The RMSE

In [None]:
# The 25 row chunks of df_6_1, in order.
df_6_1_parts = (
    df_6_1_1, df_6_1_2, df_6_1_3, df_6_1_4, df_6_1_5,
    df_6_1_6, df_6_1_7, df_6_1_8, df_6_1_9, df_6_1_10,
    df_6_1_11, df_6_1_12, df_6_1_13, df_6_1_14, df_6_1_15,
    df_6_1_16, df_6_1_17, df_6_1_18, df_6_1_19, df_6_1_20,
    df_6_1_21, df_6_1_22, df_6_1_23, df_6_1_24, df_6_1_25,
)

# User x ISBN rating matrix for every chunk; missing pairs are filled with 0.
df_6_1_feature_list = [
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
    for chunk in df_6_1_parts
]
(df_6_features_1_1, df_6_features_1_2, df_6_features_1_3, df_6_features_1_4, df_6_features_1_5,
 df_6_features_1_6, df_6_features_1_7, df_6_features_1_8, df_6_features_1_9, df_6_features_1_10,
 df_6_features_1_11, df_6_features_1_12, df_6_features_1_13, df_6_features_1_14, df_6_features_1_15,
 df_6_features_1_16, df_6_features_1_17, df_6_features_1_18, df_6_features_1_19, df_6_features_1_20,
 df_6_features_1_21, df_6_features_1_22, df_6_features_1_23, df_6_features_1_24, df_6_features_1_25) = df_6_1_feature_list

# Score the chunks in order; one RMSE line is printed per chunk.  The loop
# variables are new names so the printed label stays the df_6_1_<n> name.
for feature_matrix, rating_chunk_df in zip(df_6_1_feature_list, df_6_1_parts):
    user_similarity(feature_matrix, rating_chunk_df)

The RMSE of df_6_1_1: 1.7229642662294757
The RMSE of df_6_1_2: 1.631427755414752
The RMSE of df_6_1_3: 1.697294318776797
The RMSE of df_6_1_4: 1.6425645579996275
The RMSE of df_6_1_5: 1.6007249027855048
The RMSE of df_6_1_6: 1.7597492771062753
The RMSE of df_6_1_7: 1.676435027332528
The RMSE of df_6_1_8: 1.7644765527751631
The RMSE of df_6_1_9: 1.6658035177934685
The RMSE of df_6_1_10: 1.6750952881577772
The RMSE of df_6_1_11: 1.7664668978457374
The RMSE of df_6_1_12: 1.9155121998282942
The RMSE of df_6_1_13: 1.3284822661856106
The RMSE of df_6_1_14: nan
The RMSE of df_6_1_15: nan
The RMSE of df_6_1_16: nan
The RMSE of df_6_1_17: nan
The RMSE of df_6_1_18: nan
The RMSE of df_6_1_19: nan
The RMSE of df_6_1_20: nan
The RMSE of df_6_1_21: nan
The RMSE of df_6_1_22: nan
The RMSE of df_6_1_23: nan
The RMSE of df_6_1_24: nan


In [None]:
# The four row chunks of df_6_3 and their user x ISBN matrices (missing pairs -> 0).
df_6_3_parts = (df_6_3_1, df_6_3_2, df_6_3_3, df_6_3_4)
df_6_3_feature_list = [
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0)
    for chunk in df_6_3_parts
]
df_6_features_3_1, df_6_features_3_2, df_6_features_3_3, df_6_features_3_4 = df_6_3_feature_list

# One RMSE line is printed per chunk, in chunk order.
for feature_matrix, rating_chunk_df in zip(df_6_3_feature_list, df_6_3_parts):
    user_similarity(feature_matrix, rating_chunk_df)

The RMSE of df_6_3_1: 0.0
The RMSE of df_6_3_2: 0.6448840875547504
The RMSE of df_6_3_3: 0.49885130671485345
The RMSE of df_6_3_4: 0.08902086949444664


In [None]:
# df_6_2 is scored whole (no chunking): build its user x ISBN matrix, then score it.
df_6_features_2 = pd.pivot_table(
    df_6_2,
    index='User-ID',
    columns='ISBN',
    values='Book-Rating',
    fill_value=0,
)
user_similarity(df_6_features_2, df_6_2)

The RMSE of df_6_2: 0.0


## 8 clusters context aware

In [None]:
# Row/column counts of the five 8-cluster groups.
for shape_label, shape_frame in (("df_8_0", df_8_0), ("df_8_1", df_8_1), ("df_8_2", df_8_2),
                                 ("df_8_3", df_8_3), ("df_8_4", df_8_4)):
    print(shape_label + ": ", shape_frame.shape)

df_8_0:  (184337, 14)
df_8_1:  (197017, 14)
df_8_2:  (187319, 14)
df_8_3:  (455697, 14)
df_8_4:  (3789, 14)


In [None]:
# df_8_2


# Breaking df_8_2
# Split df_8_2 into 5 consecutive row chunks.  The split is computed once and
# unpacked, instead of re-running np.array_split for every chunk.
df_8_2_1, df_8_2_2, df_8_2_3, df_8_2_4, df_8_2_5 = np.array_split(df_8_2, 5)
# df_8_features_2_1 = df_8_2_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_2 = df_8_2_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_3 = df_8_2_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_4 = df_8_2_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_5 = df_8_2_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# user_similarity(df_8_features_2_1, df_8_2_1)
# user_similarity(df_8_features_2_2, df_8_2_2)
# user_similarity(df_8_features_2_3, df_8_2_3)
# user_similarity(df_8_features_2_4, df_8_2_4)
# user_similarity(df_8_features_2_5, df_8_2_5)

The RMSE of df_8_2_1: 1.894288897941656
The RMSE of df_8_2_2: 1.799691192757625
The RMSE of df_8_2_3: 1.8250221615860163
The RMSE of df_8_2_4: 1.8796322771145704
The RMSE of df_8_2_5: 1.96437211679043


In [None]:
# df_8_4
# Split df_8_4 into 5 consecutive row chunks.  The split is computed once and
# unpacked, instead of re-running np.array_split for every chunk.
df_8_4_1, df_8_4_2, df_8_4_3, df_8_4_4, df_8_4_5 = np.array_split(df_8_4, 5)
# df_8_features_4_1 = df_8_4_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_4_2 = df_8_4_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_4_3 = df_8_4_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_4_4 = df_8_4_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_4_5 = df_8_4_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# user_similarity(df_8_features_4_1, df_8_4_1)
# user_similarity(df_8_features_4_2, df_8_4_2)
# user_similarity(df_8_features_4_3, df_8_4_3)
# user_similarity(df_8_features_4_4, df_8_4_4)
# user_similarity(df_8_features_4_5, df_8_4_5)

The RMSE of df_8_4_1: 0.0
The RMSE of df_8_4_2: 0.0
The RMSE of df_8_4_3: 0.0
The RMSE of df_8_4_4: 0.0
The RMSE of df_8_4_5: nan


In [None]:
# Breaking df_8_1
# Split df_8_1 into 10 consecutive row chunks.  The split is computed once and
# unpacked, instead of re-running np.array_split for every chunk.
(df_8_1_1, df_8_1_2, df_8_1_3, df_8_1_4, df_8_1_5,
 df_8_1_6, df_8_1_7, df_8_1_8, df_8_1_9, df_8_1_10) = np.array_split(df_8_1, 10)

# df_8_features_1_1 = df_8_1_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_2 = df_8_1_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_3 = df_8_1_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_4 = df_8_1_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_5 = df_8_1_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_6 = df_8_1_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_7 = df_8_1_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_8 = df_8_1_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_9 = df_8_1_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_1_10 = df_8_1_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)

# user_similarity(df_8_features_1_1, df_8_1_1)
# user_similarity(df_8_features_1_2, df_8_1_2)
# user_similarity(df_8_features_1_3, df_8_1_3)
# user_similarity(df_8_features_1_4, df_8_1_4)
# user_similarity(df_8_features_1_5, df_8_1_5)
# user_similarity(df_8_features_1_6, df_8_1_6)
# user_similarity(df_8_features_1_7, df_8_1_7)
# user_similarity(df_8_features_1_8, df_8_1_8)
# user_similarity(df_8_features_1_9, df_8_1_9)
# user_similarity(df_8_features_1_10, df_8_1_10)

The RMSE of df_8_1_1: 1.7432274861307073
The RMSE of df_8_1_2: 1.7459095424428517
The RMSE of df_8_1_3: 1.6372614247545607
The RMSE of df_8_1_4: 1.6475761648231582
The RMSE of df_8_1_5: 1.6818720216679395
The RMSE of df_8_1_6: 1.713218790534816
The RMSE of df_8_1_7: 1.6776713338601745
The RMSE of df_8_1_8: 1.700343940523557
The RMSE of df_8_1_9: 1.7371260654427079
The RMSE of df_8_1_10: 1.7206288207067915


In [None]:
# Breaking df_8_0
# Split df_8_0 into 10 consecutive row chunks.  The split is computed once and
# unpacked, instead of re-running np.array_split for every chunk.
(df_8_0_1, df_8_0_2, df_8_0_3, df_8_0_4, df_8_0_5,
 df_8_0_6, df_8_0_7, df_8_0_8, df_8_0_9, df_8_0_10) = np.array_split(df_8_0, 10)

# df_8_features_0_1 = df_8_0_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_2 = df_8_0_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_3 = df_8_0_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_4 = df_8_0_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_5 = df_8_0_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_6 = df_8_0_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_7 = df_8_0_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_8 = df_8_0_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_9 = df_8_0_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_0_10 = df_8_0_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)

# user_similarity(df_8_features_0_1, df_8_0_1)
# user_similarity(df_8_features_0_2, df_8_0_2)
# user_similarity(df_8_features_0_3, df_8_0_3)
# user_similarity(df_8_features_0_4, df_8_0_4)
# user_similarity(df_8_features_0_5, df_8_0_5)
# user_similarity(df_8_features_0_6, df_8_0_6)
# user_similarity(df_8_features_0_7, df_8_0_7)
# user_similarity(df_8_features_0_8, df_8_0_8)
# user_similarity(df_8_features_0_9, df_8_0_9)
# user_similarity(df_8_features_0_10, df_8_0_10)

In [None]:
# Breaking df_8_2
# Re-split df_8_2 into 20 consecutive row chunks.  This rebinds df_8_2_1 ..
# df_8_2_5, which an earlier cell created from a 5-way split.  The split is
# computed once and unpacked, instead of re-running np.array_split per chunk.
(df_8_2_1, df_8_2_2, df_8_2_3, df_8_2_4, df_8_2_5,
 df_8_2_6, df_8_2_7, df_8_2_8, df_8_2_9, df_8_2_10,
 df_8_2_11, df_8_2_12, df_8_2_13, df_8_2_14, df_8_2_15,
 df_8_2_16, df_8_2_17, df_8_2_18, df_8_2_19, df_8_2_20) = np.array_split(df_8_2, 20)

# df_8_features_2_1 = df_8_2_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_2 = df_8_2_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_3 = df_8_2_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_4 = df_8_2_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_5 = df_8_2_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_6 = df_8_2_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_7 = df_8_2_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_8 = df_8_2_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_9 = df_8_2_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_10 = df_8_2_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_11 = df_8_2_11.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_12 = df_8_2_12.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_13 = df_8_2_13.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_14 = df_8_2_14.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_15 = df_8_2_15.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_16 = df_8_2_16.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_17 = df_8_2_17.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_18 = df_8_2_18.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_19 = df_8_2_19.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_2_20 = df_8_2_20.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)

# user_similarity(df_8_features_2_1, df_8_2_1)
# user_similarity(df_8_features_2_2, df_8_2_2)
# user_similarity(df_8_features_2_3, df_8_2_3)
# user_similarity(df_8_features_2_4, df_8_2_4)
# user_similarity(df_8_features_2_5, df_8_2_5)
# user_similarity(df_8_features_2_6, df_8_2_6)
# user_similarity(df_8_features_2_7, df_8_2_7)
# user_similarity(df_8_features_2_8, df_8_2_8)
# user_similarity(df_8_features_2_9, df_8_2_9)
# user_similarity(df_8_features_2_10, df_8_2_10)
# user_similarity(df_8_features_2_11, df_8_2_11)
# user_similarity(df_8_features_2_12, df_8_2_12)
# user_similarity(df_8_features_2_13, df_8_2_13)
# user_similarity(df_8_features_2_14, df_8_2_14)
# user_similarity(df_8_features_2_15, df_8_2_15)
# user_similarity(df_8_features_2_16, df_8_2_16)
# user_similarity(df_8_features_2_17, df_8_2_17)
# user_similarity(df_8_features_2_18, df_8_2_18)
# user_similarity(df_8_features_2_19, df_8_2_19)
# user_similarity(df_8_features_2_20, df_8_2_20)

In [None]:
# Breaking df_8_3 into 25 row-wise chunks.
# The frame is split once and the pieces are unpacked, instead of re-splitting
# the whole frame for every single chunk.
(
    df_8_3_1, df_8_3_2, df_8_3_3, df_8_3_4, df_8_3_5,
    df_8_3_6, df_8_3_7, df_8_3_8, df_8_3_9, df_8_3_10,
    df_8_3_11, df_8_3_12, df_8_3_13, df_8_3_14, df_8_3_15,
    df_8_3_16, df_8_3_17, df_8_3_18, df_8_3_19, df_8_3_20,
    df_8_3_21, df_8_3_22, df_8_3_23, df_8_3_24, df_8_3_25,
) = np.array_split(df_8_3, 25)

# df_8_features_3_1 = df_8_3_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_2 = df_8_3_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_3 = df_8_3_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_4 = df_8_3_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_5 = df_8_3_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_6 = df_8_3_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_7 = df_8_3_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_8 = df_8_3_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_9 = df_8_3_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_10 = df_8_3_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_11 = df_8_3_11.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_12 = df_8_3_12.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_13 = df_8_3_13.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_14 = df_8_3_14.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_15 = df_8_3_15.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_16 = df_8_3_16.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_17 = df_8_3_17.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_18 = df_8_3_18.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_19 = df_8_3_19.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_20 = df_8_3_20.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_21 = df_8_3_21.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_22 = df_8_3_22.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_23 = df_8_3_23.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_24 = df_8_3_24.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# df_8_features_3_25 = df_8_3_25.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)

# user_similarity(df_8_features_3_1, df_8_3_1)
# user_similarity(df_8_features_3_2, df_8_3_2)
# user_similarity(df_8_features_3_3, df_8_3_3)
# user_similarity(df_8_features_3_4, df_8_3_4)
# user_similarity(df_8_features_3_5, df_8_3_5)
# user_similarity(df_8_features_3_6, df_8_3_6)
# user_similarity(df_8_features_3_7, df_8_3_7)
# user_similarity(df_8_features_3_8, df_8_3_8)
# user_similarity(df_8_features_3_9, df_8_3_9)
# user_similarity(df_8_features_3_10, df_8_3_10)
# user_similarity(df_8_features_3_11, df_8_3_11)
# user_similarity(df_8_features_3_12, df_8_3_12)
# user_similarity(df_8_features_3_13, df_8_3_13)
# user_similarity(df_8_features_3_14, df_8_3_14)
# user_similarity(df_8_features_3_15, df_8_3_15)
# user_similarity(df_8_features_3_16, df_8_3_16)
# user_similarity(df_8_features_3_17, df_8_3_17)
# user_similarity(df_8_features_3_18, df_8_3_18)
# user_similarity(df_8_features_3_19, df_8_3_19)
# user_similarity(df_8_features_3_20, df_8_3_20)
# user_similarity(df_8_features_3_21, df_8_3_21)
# user_similarity(df_8_features_3_22, df_8_3_22)
# user_similarity(df_8_features_3_23, df_8_3_23)
# user_similarity(df_8_features_3_24, df_8_3_24)
# user_similarity(df_8_features_3_25, df_8_3_25)

In [None]:
# df_8_features_4 = df_8_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0)
# user_similarity(df_8_features_4, df_8_4)

The RMSE of df_8_4: 0.0


In [None]:
# Report the shape of the cluster frames df_8_0 .. df_8_4, one per line.
print(
    "\n".join(
        "{} {}".format(label, frame_shape)
        for label, frame_shape in (
            ("df_8_0: ", df_8_0.shape),
            ("df_8_1: ", df_8_1.shape),
            ("df_8_2: ", df_8_2.shape),
            ("df_8_3: ", df_8_3.shape),
            ("df_8_4: ", df_8_4.shape),
        )
    )
)

df_8_0:  (184337, 14)
df_8_1:  (197017, 14)
df_8_2:  (187319, 14)
df_8_3:  (455697, 14)
df_8_4:  (3789, 14)


### Item Similarity (6 clusters)

In [None]:
def item_similarity(df_features, original_df):
  """Print an RMSE for similarity-based rating predictions on one chunk of ratings.

  Relies on `pairwise_distances`, `MinMaxScaler` and `get_df_name` being
  available in the notebook namespace; none of them is defined in this cell.

  Args:
    df_features: rating matrix. The callers in this notebook pass the
      transposed User-ID x ISBN pivot of `original_df` (so rows are ISBNs)
      with missing ratings filled with 0.
    original_df: frame with 'User-ID', 'ISBN' and 'Book-Rating' columns.

  Returns:
    None. The result is printed as "<name> (Item) RMSE: <value>"; the value is
    NaN when no prediction is strictly positive.
  """
  # Centre every row of the feature matrix on its own mean.
  row_means = np.nanmean(df_features, axis=1)
  df_subtracted = (df_features.T - row_means).T

  # Item Similarity Matrix: cosine similarity, with NaN and negative entries set to 0.
  item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
  item_correlation[np.isnan(item_correlation)] = 0
  item_correlation[item_correlation < 0] = 0

  item_predicted_ratings = np.dot(df_features.fillna(0).T, item_correlation)

  # Pivot the observed ratings once; NaN marks user/book pairs without a rating.
  observed = original_df.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating')

  # Multiplying by the zero-filled observed ratings zeroes out the predictions
  # for unrated pairs and scales the remaining ones by the observed rating.
  item_final_rating = np.multiply(item_predicted_ratings, observed.fillna(0))

  # Keep strictly positive scores (everything else becomes NaN), then min-max
  # rescale them with feature_range=(1, 5).
  # NOTE(review): the SVD cells in this notebook use rating_scale=(1, 10);
  # confirm that comparing ratings against scores rescaled to 1-5 is intended.
  positive_scores = item_final_rating[item_final_rating > 0]
  scaler = MinMaxScaler(feature_range=(1, 5))
  scaler.fit(positive_scores)
  scaled = scaler.transform(positive_scores)

  # Finding total non-NaN value
  total_non_nan = np.count_nonzero(~np.isnan(scaled))

  # Sum the squared errors with an explicit NaN-skipping NumPy reduction.
  # The built-in sum() would iterate over the frame's column labels, so the
  # previous sum(sum(...)) only worked while `sum` was shadowed by NumPy's.
  squared_error = (observed - scaled) ** 2
  if total_non_nan == 0:
    rmse = float('nan')
  else:
    rmse = (np.nansum(squared_error.to_numpy(dtype=float)) / total_non_nan) ** 0.5
  print("{} (Item) RMSE: {}".format(get_df_name(original_df), rmse))

In [None]:
# df_6_0
# df_6_features_0_1 = df_6_0_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_2 = df_6_0_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_3 = df_6_0_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_4 = df_6_0_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_5 = df_6_0_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_6 = df_6_0_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_7 = df_6_0_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_8 = df_6_0_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_9 = df_6_0_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_10 = df_6_0_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_11 = df_6_0_11.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_12 = df_6_0_12.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_13 = df_6_0_13.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_14 = df_6_0_14.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_15 = df_6_0_15.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_16 = df_6_0_16.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_17 = df_6_0_17.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_18 = df_6_0_18.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_19 = df_6_0_19.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_20 = df_6_0_20.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_21 = df_6_0_21.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_22 = df_6_0_22.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_23 = df_6_0_23.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_0_24 = df_6_0_24.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# Transposed (ISBN x User-ID) rating pivot for chunk 25 of df_6_0; unrated pairs are filled with 0.
df_6_features_0_25 = df_6_0_25.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
    fill_value=0,
).transpose()
# item_similarity(df_6_features_0_1, df_6_0_1)
# item_similarity(df_6_features_0_2, df_6_0_2)
# item_similarity(df_6_features_0_3, df_6_0_3)
# item_similarity(df_6_features_0_4, df_6_0_4)
# item_similarity(df_6_features_0_5, df_6_0_5)
# item_similarity(df_6_features_0_6, df_6_0_6)
# item_similarity(df_6_features_0_7, df_6_0_7)
# item_similarity(df_6_features_0_8, df_6_0_8)
# item_similarity(df_6_features_0_9, df_6_0_9)
# item_similarity(df_6_features_0_10, df_6_0_10)
# item_similarity(df_6_features_0_11, df_6_0_11)
# item_similarity(df_6_features_0_12, df_6_0_12)
# item_similarity(df_6_features_0_13, df_6_0_13)
# item_similarity(df_6_features_0_14, df_6_0_14)
# item_similarity(df_6_features_0_15, df_6_0_15)
# item_similarity(df_6_features_0_16, df_6_0_16)
# item_similarity(df_6_features_0_17, df_6_0_17)
# item_similarity(df_6_features_0_18, df_6_0_18)
# item_similarity(df_6_features_0_19, df_6_0_19)
# item_similarity(df_6_features_0_20, df_6_0_20)
# item_similarity(df_6_features_0_21, df_6_0_21)
# item_similarity(df_6_features_0_22, df_6_0_22)
# item_similarity(df_6_features_0_23, df_6_0_23)
# item_similarity(df_6_features_0_24, df_6_0_24)
item_similarity(df_features=df_6_features_0_25, original_df=df_6_0_25)

df_6_0_25 (Item) RMSE: 0.9286912623112119


In [None]:
# df_6_features_1_1 = df_6_1_1.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_2 = df_6_1_2.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_3 = df_6_1_3.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_4 = df_6_1_4.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_5 = df_6_1_5.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_6 = df_6_1_6.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_7 = df_6_1_7.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_8 = df_6_1_8.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_9 = df_6_1_9.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_10 = df_6_1_10.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_11 = df_6_1_11.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_12 = df_6_1_12.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# df_6_features_1_13 = df_6_1_13.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# Transposed (ISBN x User-ID) rating pivots for chunks 14-24 of df_6_1;
# unrated pairs are filled with 0.
(
    df_6_features_1_14, df_6_features_1_15, df_6_features_1_16, df_6_features_1_17,
    df_6_features_1_18, df_6_features_1_19, df_6_features_1_20, df_6_features_1_21,
    df_6_features_1_22, df_6_features_1_23, df_6_features_1_24,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_6_1_14, df_6_1_15, df_6_1_16, df_6_1_17,
        df_6_1_18, df_6_1_19, df_6_1_20, df_6_1_21,
        df_6_1_22, df_6_1_23, df_6_1_24,
    )
)
# df_6_features_1_25 = df_6_1_25.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating',fill_value=0).T
# 

# item_similarity(df_6_features_1_1, df_6_1_1)
# item_similarity(df_6_features_1_2, df_6_1_2)
# item_similarity(df_6_features_1_3, df_6_1_3)
# item_similarity(df_6_features_1_4, df_6_1_4)
# item_similarity(df_6_features_1_5, df_6_1_5)
# item_similarity(df_6_features_1_6, df_6_1_6)
# item_similarity(df_6_features_1_7, df_6_1_7)
# item_similarity(df_6_features_1_8, df_6_1_8)
# item_similarity(df_6_features_1_9, df_6_1_9)
# item_similarity(df_6_features_1_10, df_6_1_10)
# item_similarity(df_6_features_1_11, df_6_1_11)
# item_similarity(df_6_features_1_12, df_6_1_12)
# item_similarity(df_6_features_1_13, df_6_1_13)
# Score chunks 14-24 in order. map() hands each (features, ratings) pair
# straight to item_similarity, so no extra global name is bound to a chunk
# while it runs (get_df_name presumably resolves the chunk's variable name
# - confirm).
for _scored in map(
    item_similarity,
    (
        df_6_features_1_14, df_6_features_1_15, df_6_features_1_16, df_6_features_1_17,
        df_6_features_1_18, df_6_features_1_19, df_6_features_1_20, df_6_features_1_21,
        df_6_features_1_22, df_6_features_1_23, df_6_features_1_24,
    ),
    (
        df_6_1_14, df_6_1_15, df_6_1_16, df_6_1_17,
        df_6_1_18, df_6_1_19, df_6_1_20, df_6_1_21,
        df_6_1_22, df_6_1_23, df_6_1_24,
    ),
):
    pass
# item_similarity(df_6_features_1_25, df_6_1_25)

df_6_1_25 (Item) RMSE: nan


In [None]:
# Transposed (ISBN x User-ID) rating pivot for the whole df_6_2 cluster; unrated pairs are filled with 0.
df_6_features_2 = df_6_2.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
    fill_value=0,
).transpose()

item_similarity(df_features=df_6_features_2, original_df=df_6_2)


df_6_2 (Item) RMSE: 0.0


In [None]:
# Transposed (ISBN x User-ID) rating pivots for the four df_6_3 chunks;
# unrated pairs are filled with 0.
(
    df_6_features_3_1, df_6_features_3_2, df_6_features_3_3, df_6_features_3_4,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (df_6_3_1, df_6_3_2, df_6_3_3, df_6_3_4)
)

# Score the chunks in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (df_6_features_3_1, df_6_features_3_2, df_6_features_3_3, df_6_features_3_4),
    (df_6_3_1, df_6_3_2, df_6_3_3, df_6_3_4),
):
    pass

df_6_3_1 (Item) RMSE: nan
df_6_3_2 (Item) RMSE: 0.6448840875547504
df_6_3_3 (Item) RMSE: 0.49885130671485356
df_6_3_4 (Item) RMSE: 0.0


### Item Similarity (8 clusters)

In [None]:
# df_8_0
# Transposed (ISBN x User-ID) rating pivots for the ten df_8_0 chunks;
# unrated pairs are filled with 0.
(
    df_8_features_0_1, df_8_features_0_2, df_8_features_0_3, df_8_features_0_4,
    df_8_features_0_5, df_8_features_0_6, df_8_features_0_7, df_8_features_0_8,
    df_8_features_0_9, df_8_features_0_10,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_8_0_1, df_8_0_2, df_8_0_3, df_8_0_4, df_8_0_5,
        df_8_0_6, df_8_0_7, df_8_0_8, df_8_0_9, df_8_0_10,
    )
)

# Score the chunks in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (
        df_8_features_0_1, df_8_features_0_2, df_8_features_0_3, df_8_features_0_4,
        df_8_features_0_5, df_8_features_0_6, df_8_features_0_7, df_8_features_0_8,
        df_8_features_0_9, df_8_features_0_10,
    ),
    (
        df_8_0_1, df_8_0_2, df_8_0_3, df_8_0_4, df_8_0_5,
        df_8_0_6, df_8_0_7, df_8_0_8, df_8_0_9, df_8_0_10,
    ),
):
    pass

df_8_0_1 (Item) RMSE: 1.396832013215738
df_8_0_2 (Item) RMSE: 1.2201242305741582
df_8_0_3 (Item) RMSE: 1.2797528144307604
df_8_0_4 (Item) RMSE: 1.3108715023489406
df_8_0_5 (Item) RMSE: 1.3581641199324894
df_8_0_6 (Item) RMSE: 1.3757759450719207
df_8_0_7 (Item) RMSE: 1.2937762493048095
df_8_0_8 (Item) RMSE: 1.2781354747992244
df_8_0_9 (Item) RMSE: 1.2917720900329406
df_8_0_10 (Item) RMSE: 0.9324384251955986


In [None]:
# Breaking df_8_1 into 10 row-wise chunks.
# The frame is split once and the pieces are unpacked, instead of re-splitting
# the whole frame for every single chunk.
(
    df_8_1_1, df_8_1_2, df_8_1_3, df_8_1_4, df_8_1_5,
    df_8_1_6, df_8_1_7, df_8_1_8, df_8_1_9, df_8_1_10,
) = np.array_split(df_8_1, 10)

# Transposed (ISBN x User-ID) rating pivot per chunk; unrated pairs are filled with 0.
(
    df_8_features_1_1, df_8_features_1_2, df_8_features_1_3, df_8_features_1_4,
    df_8_features_1_5, df_8_features_1_6, df_8_features_1_7, df_8_features_1_8,
    df_8_features_1_9, df_8_features_1_10,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_8_1_1, df_8_1_2, df_8_1_3, df_8_1_4, df_8_1_5,
        df_8_1_6, df_8_1_7, df_8_1_8, df_8_1_9, df_8_1_10,
    )
)

# Score the chunks in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (
        df_8_features_1_1, df_8_features_1_2, df_8_features_1_3, df_8_features_1_4,
        df_8_features_1_5, df_8_features_1_6, df_8_features_1_7, df_8_features_1_8,
        df_8_features_1_9, df_8_features_1_10,
    ),
    (
        df_8_1_1, df_8_1_2, df_8_1_3, df_8_1_4, df_8_1_5,
        df_8_1_6, df_8_1_7, df_8_1_8, df_8_1_9, df_8_1_10,
    ),
):
    pass

df_8_1_1 (Item) RMSE: 1.714121913649072
df_8_1_2 (Item) RMSE: 1.6403862984098143
df_8_1_3 (Item) RMSE: 1.6698209902222876
df_8_1_4 (Item) RMSE: 1.6055024978391526
df_8_1_5 (Item) RMSE: 1.6367157397289418
df_8_1_6 (Item) RMSE: 1.6994820900622465
df_8_1_7 (Item) RMSE: 1.6240315624976098
df_8_1_8 (Item) RMSE: 1.5927404737590494
df_8_1_9 (Item) RMSE: 1.519703314764013
df_8_1_10 (Item) RMSE: 1.3350889690707821


In [None]:
# Breaking df_8_2 into 20 row-wise chunks.
# The frame is split once and the pieces are unpacked, instead of re-splitting
# the whole frame for every single chunk.
(
    df_8_2_1, df_8_2_2, df_8_2_3, df_8_2_4, df_8_2_5,
    df_8_2_6, df_8_2_7, df_8_2_8, df_8_2_9, df_8_2_10,
    df_8_2_11, df_8_2_12, df_8_2_13, df_8_2_14, df_8_2_15,
    df_8_2_16, df_8_2_17, df_8_2_18, df_8_2_19, df_8_2_20,
) = np.array_split(df_8_2, 20)

# Transposed (ISBN x User-ID) rating pivot per chunk; unrated pairs are filled with 0.
(
    df_8_features_2_1, df_8_features_2_2, df_8_features_2_3, df_8_features_2_4,
    df_8_features_2_5, df_8_features_2_6, df_8_features_2_7, df_8_features_2_8,
    df_8_features_2_9, df_8_features_2_10, df_8_features_2_11, df_8_features_2_12,
    df_8_features_2_13, df_8_features_2_14, df_8_features_2_15, df_8_features_2_16,
    df_8_features_2_17, df_8_features_2_18, df_8_features_2_19, df_8_features_2_20,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_8_2_1, df_8_2_2, df_8_2_3, df_8_2_4, df_8_2_5,
        df_8_2_6, df_8_2_7, df_8_2_8, df_8_2_9, df_8_2_10,
        df_8_2_11, df_8_2_12, df_8_2_13, df_8_2_14, df_8_2_15,
        df_8_2_16, df_8_2_17, df_8_2_18, df_8_2_19, df_8_2_20,
    )
)

# Score the chunks in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (
        df_8_features_2_1, df_8_features_2_2, df_8_features_2_3, df_8_features_2_4,
        df_8_features_2_5, df_8_features_2_6, df_8_features_2_7, df_8_features_2_8,
        df_8_features_2_9, df_8_features_2_10, df_8_features_2_11, df_8_features_2_12,
        df_8_features_2_13, df_8_features_2_14, df_8_features_2_15, df_8_features_2_16,
        df_8_features_2_17, df_8_features_2_18, df_8_features_2_19, df_8_features_2_20,
    ),
    (
        df_8_2_1, df_8_2_2, df_8_2_3, df_8_2_4, df_8_2_5,
        df_8_2_6, df_8_2_7, df_8_2_8, df_8_2_9, df_8_2_10,
        df_8_2_11, df_8_2_12, df_8_2_13, df_8_2_14, df_8_2_15,
        df_8_2_16, df_8_2_17, df_8_2_18, df_8_2_19, df_8_2_20,
    ),
):
    pass

df_8_2_1 (Item) RMSE: 1.5393401993084934
df_8_2_2 (Item) RMSE: 1.5745823607799638
df_8_2_3 (Item) RMSE: 1.5198902717650866
df_8_2_4 (Item) RMSE: 1.4748014829755036
df_8_2_5 (Item) RMSE: 1.4628253100496513
df_8_2_6 (Item) RMSE: 1.463270007353674
df_8_2_7 (Item) RMSE: 1.4623949862574572
df_8_2_8 (Item) RMSE: 1.5069045302296264
df_8_2_9 (Item) RMSE: 1.5394016692433692
df_8_2_10 (Item) RMSE: 1.4985971590215077
df_8_2_11 (Item) RMSE: 1.5676038639375323
df_8_2_12 (Item) RMSE: 1.4539400302944274
df_8_2_13 (Item) RMSE: 1.511145722201283
df_8_2_14 (Item) RMSE: 1.5115227100210338
df_8_2_15 (Item) RMSE: 1.4200236330688751
df_8_2_16 (Item) RMSE: 1.4365765852507595
df_8_2_17 (Item) RMSE: 1.396261196639787
df_8_2_18 (Item) RMSE: 1.221549351708401
df_8_2_19 (Item) RMSE: 1.3480951134185637
df_8_2_20 (Item) RMSE: 1.6857322122596279


In [None]:
# Breaking df_8_3 into 25 row-wise chunks.
# The frame is split once and the pieces are unpacked, instead of re-splitting
# the whole frame for every single chunk.
(
    df_8_3_1, df_8_3_2, df_8_3_3, df_8_3_4, df_8_3_5,
    df_8_3_6, df_8_3_7, df_8_3_8, df_8_3_9, df_8_3_10,
    df_8_3_11, df_8_3_12, df_8_3_13, df_8_3_14, df_8_3_15,
    df_8_3_16, df_8_3_17, df_8_3_18, df_8_3_19, df_8_3_20,
    df_8_3_21, df_8_3_22, df_8_3_23, df_8_3_24, df_8_3_25,
) = np.array_split(df_8_3, 25)

# Transposed (ISBN x User-ID) rating pivots for chunks 1-21 only; unrated pairs
# are filled with 0. Chunks 22-25 are split above but not pivoted or scored here.
(
    df_8_features_3_1, df_8_features_3_2, df_8_features_3_3, df_8_features_3_4,
    df_8_features_3_5, df_8_features_3_6, df_8_features_3_7, df_8_features_3_8,
    df_8_features_3_9, df_8_features_3_10, df_8_features_3_11, df_8_features_3_12,
    df_8_features_3_13, df_8_features_3_14, df_8_features_3_15, df_8_features_3_16,
    df_8_features_3_17, df_8_features_3_18, df_8_features_3_19, df_8_features_3_20,
    df_8_features_3_21,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_8_3_1, df_8_3_2, df_8_3_3, df_8_3_4, df_8_3_5,
        df_8_3_6, df_8_3_7, df_8_3_8, df_8_3_9, df_8_3_10,
        df_8_3_11, df_8_3_12, df_8_3_13, df_8_3_14, df_8_3_15,
        df_8_3_16, df_8_3_17, df_8_3_18, df_8_3_19, df_8_3_20,
        df_8_3_21,
    )
)

# Score chunks 1-21 in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (
        df_8_features_3_1, df_8_features_3_2, df_8_features_3_3, df_8_features_3_4,
        df_8_features_3_5, df_8_features_3_6, df_8_features_3_7, df_8_features_3_8,
        df_8_features_3_9, df_8_features_3_10, df_8_features_3_11, df_8_features_3_12,
        df_8_features_3_13, df_8_features_3_14, df_8_features_3_15, df_8_features_3_16,
        df_8_features_3_17, df_8_features_3_18, df_8_features_3_19, df_8_features_3_20,
        df_8_features_3_21,
    ),
    (
        df_8_3_1, df_8_3_2, df_8_3_3, df_8_3_4, df_8_3_5,
        df_8_3_6, df_8_3_7, df_8_3_8, df_8_3_9, df_8_3_10,
        df_8_3_11, df_8_3_12, df_8_3_13, df_8_3_14, df_8_3_15,
        df_8_3_16, df_8_3_17, df_8_3_18, df_8_3_19, df_8_3_20,
        df_8_3_21,
    ),
):
    pass



df_8_3_1 (Item) RMSE: nan
df_8_3_2 (Item) RMSE: nan
df_8_3_3 (Item) RMSE: nan
df_8_3_4 (Item) RMSE: nan
df_8_3_5 (Item) RMSE: nan
df_8_3_6 (Item) RMSE: nan
df_8_3_7 (Item) RMSE: nan
df_8_3_8 (Item) RMSE: nan
df_8_3_9 (Item) RMSE: nan
df_8_3_10 (Item) RMSE: nan
df_8_3_11 (Item) RMSE: nan
df_8_3_12 (Item) RMSE: nan
df_8_3_13 (Item) RMSE: nan
df_8_3_14 (Item) RMSE: 1.681999238249848
df_8_3_15 (Item) RMSE: 1.7386368895846918
df_8_3_16 (Item) RMSE: 1.7050597999286763
df_8_3_17 (Item) RMSE: 1.7644902711190174
df_8_3_18 (Item) RMSE: 1.7446856050305013
df_8_3_19 (Item) RMSE: 1.7260494675923128
df_8_3_20 (Item) RMSE: 1.679248468450672
df_8_3_21 (Item) RMSE: 1.671586460185557


In [None]:
# Chunks 20-25 of the 25-way row-wise split of df_8_3.
# The frame is split once and the last six pieces are unpacked, instead of
# re-splitting the whole frame for every single chunk.
(
    df_8_3_20, df_8_3_21, df_8_3_22, df_8_3_23, df_8_3_24, df_8_3_25,
) = np.array_split(df_8_3, 25)[19:]

# Transposed (ISBN x User-ID) rating pivot per chunk; unrated pairs are filled with 0.
(
    df_8_features_3_20, df_8_features_3_21, df_8_features_3_22,
    df_8_features_3_23, df_8_features_3_24, df_8_features_3_25,
) = (
    chunk.pivot_table(index='User-ID', columns='ISBN', values='Book-Rating', fill_value=0).T
    for chunk in (
        df_8_3_20, df_8_3_21, df_8_3_22, df_8_3_23, df_8_3_24, df_8_3_25,
    )
)

# Score the chunks in order. map() hands each (features, ratings) pair straight
# to item_similarity, so no extra global name is bound to a chunk while it runs
# (get_df_name presumably resolves the chunk's variable name - confirm).
for _scored in map(
    item_similarity,
    (
        df_8_features_3_20, df_8_features_3_21, df_8_features_3_22,
        df_8_features_3_23, df_8_features_3_24, df_8_features_3_25,
    ),
    (
        df_8_3_20, df_8_3_21, df_8_3_22, df_8_3_23, df_8_3_24, df_8_3_25,
    ),
):
    pass

df_8_3_20 (Item) RMSE: 1.679248468450672
df_8_3_21 (Item) RMSE: 1.671586460185557
df_8_3_22 (Item) RMSE: 1.5901056492442764
df_8_3_23 (Item) RMSE: 1.4777018157881139
df_8_3_24 (Item) RMSE: 0.8085112611653922
df_8_3_25 (Item) RMSE: 0.6456732836943493


In [None]:
# Transposed (ISBN x User-ID) rating pivot for the whole df_8_4 cluster; unrated pairs are filled with 0.
df_8_features_4 = df_8_4.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
    fill_value=0,
).transpose()
item_similarity(df_features=df_8_features_4, original_df=df_8_4)

df_8_4 (Item) RMSE: 0.0


### SVD + Cross validation: 8 clusters

In [None]:
# Report the shape of the cluster frames df_8_0 .. df_8_5, one per line.
print(
    "\n".join(
        "{} {}".format(label, frame_shape)
        for label, frame_shape in (
            ("df_8_0: ", df_8_0.shape),
            ("df_8_1: ", df_8_1.shape),
            ("df_8_2: ", df_8_2.shape),
            ("df_8_3: ", df_8_3.shape),
            ("df_8_4: ", df_8_4.shape),
            ("df_8_5: ", df_8_5.shape),
        )
    )
)

df_8_0:  (184337, 14)
df_8_1:  (197017, 14)
df_8_2:  (187319, 14)
df_8_3:  (455697, 14)
df_8_4:  (3789, 14)
df_8_5:  (3789, 14)


In [None]:
# Tune SVD on the df_8_0 cluster with a 3-fold grid search and print the best RMSE.
from surprise.model_selection import GridSearchCV, KFold

# Fixed seed so the fold shuffling and the SVD initialisation, and therefore the
# printed score, are repeatable across runs.
SVD_SEED = 42

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_8_0[["User-ID", "ISBN", "Book-Rating"]], reader)

param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1],
    # Single-value entry: forwarded to SVD(random_state=...) for every combination.
    "random_state": [SVD_SEED],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=KFold(n_splits=3, random_state=SVD_SEED, shuffle=True),
)

gs.fit(data)

print(gs.best_score["rmse"])

0.6361677109753942


In [None]:
# Tune SVD on cluster df_8_1 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_8_1 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_8_1[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.2672233457958437


In [None]:
# Tune SVD on cluster df_8_2 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_8_2 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_8_2[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.33557677582669493


In [None]:
# Tune SVD on cluster df_8_3 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_8_3 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_8_3[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.7726247514060948


In [None]:
# Tune SVD on cluster df_8_4 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_8_4 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_8_4[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.5818637531890326


### SVD + Cross validation: 6 clusters

In [None]:
# Size check of the cluster frames: one "<label>  <shape>" line per frame.
# A generator is used so no loop variables are left behind in the namespace.
print(
    *(
        f"{label} {frame.shape}"
        for label, frame in (
            ("df_6_0: ", df_6_0),
            ("df_6_1: ", df_6_1),
            ("df_6_2: ", df_6_2),
            ("df_6_3: ", df_6_3),
        )
    ),
    sep="\n",
)

df_6_0:  (460303, 14)
df_6_1:  (529232, 14)
df_6_2:  (3789, 14)
df_6_3:  (34835, 14)


In [None]:
# Tune SVD on cluster df_6_1 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_6_1 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_6_1[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.7216181800282581


In [None]:
# Tune SVD on cluster df_6_2 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_6_2 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_6_2[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.582122868080993


In [None]:
# Tune SVD on cluster df_6_3 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_6_3 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_6_3[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.6621464692180193


In [None]:
# Tune SVD on cluster df_6_0 with a 3-fold cross-validated grid search and
# report the best mean RMSE together with the hyperparameters that produced it.
# Reader, Dataset, SVD and GridSearchCV are imported in the notebook's first
# cell, so they are not re-imported here.

# With no explicit random_state, Surprise draws fold shuffling and SVD factor
# initialisation from NumPy's global RNG; seed it so the score is reproducible.
np.random.seed(42)

# NOTE(review): rating_scale=(1, 10) assumes ratings of 0 were filtered out
# upstream — confirm against how df_6_0 was built.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df_6_0[["User-ID", "ISBN", "Book-Rating"]], reader)

# 5 x 5 x 4 = 100 combinations, each fitted on 3 folds.
param_grid = {
    "n_epochs": [1, 5, 10, 15, 20],
    "lr_all": [0.002, 0.005, 0.009, 0.01, 0.05],
    "reg_all": [0.2, 0.4, 0.6, 1]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)

gs.fit(data)

print(gs.best_score["rmse"])
# The winning combination is needed to refit or compare clusters later.
print(gs.best_params["rmse"])

0.44444756892748405
