# Introduction

In this notebook, I developed a book recommendation system using several recommendation techniques. I implemented popularity-based, item-based collaborative filtering, and SVD-based models to suggest books to users. The performance of these models was evaluated using metrics such as RMSE to assess the accuracy and effectiveness of the recommendations.


# Libraries

In [None]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.extmath import randomized_svd

# Read Data

In [2]:
# Load the raw user-book ratings.
# parent_asin: Parent ID of the product
books_data = pd.read_csv(filepath_or_buffer='Books.csv')
books_data

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1446304000,5,1.441260e+12
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1564770672,5,1.441260e+12
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5,1.523090e+12
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1,1.611620e+12
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1645671127,3,1.612040e+12
...,...,...,...,...
1048570,AH3EZV6Y6KIH5DYPZMCPGDVNXOGA,307986934,5,1.515860e+12
1048571,AH3EZV6Y6KIH5DYPZMCPGDVNXOGA,1400033411,5,1.515860e+12
1048572,AGYKAPDJ2TWJQUCTYDV5POTZCRWA,178221206X,5,1.473130e+12
1048573,AGYKAPDJ2TWJQUCTYDV5POTZCRWA,316217182,5,1.477420e+12


In [3]:
# Print the column dtypes, the non-null count of every column and the memory usage.
books_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1048575 non-null  object 
 1   parent_asin  1048575 non-null  object 
 2   rating       1048575 non-null  int64  
 3   timestamp    1048575 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 32.0+ MB


# Ask the User to Input an ID

In [None]:
# Unique user IDs as a set: the ratings table holds one row per rating, so a list would
# repeat every user and make each `in` check a linear scan over all rows.
valid_user_ids = set(books_data['user_id'])

def get_valid_user_id():
    '''
    This function prompts the user to input a user ID and checks if it is valid.

    It keeps asking for input until a valid user ID is entered.
    A valid user ID is one that exists in the module-level 'valid_user_ids' collection.
    Leading and trailing whitespace (e.g. from a copy-pasted ID) is removed before the check.

    Returns:
    str: The validated user ID.
    '''
    while True:
        uid = input("Enter user ID: ").strip()
        if uid in valid_user_ids:
            return uid
        print("Invalid ID. Please enter a valid ID.")
        
# Ask for an ID interactively; `uid` is the user every recommender below is run for.
uid = get_valid_user_id()
print("Valid user ID entered:", uid)

Valid user ID entered: AE22M65RFUBDK73HHPM73G3IVPFA


# Popularity-Based Model

In [None]:
# Group the books_data by 'parent_asin' and aggregate the 'rating' column.
# For each book we compute the number of ratings, the sum of ratings and the mean rating.
# All three reductions are requested by name so pandas uses its built-in group-by
# implementations; the resulting columns are ('rating', 'size'), ('rating', 'sum')
# and ('rating', 'mean').
ratings_grp = books_data.groupby('parent_asin').agg({'rating': ['size', 'sum', 'mean']})

In [None]:
# Popularity thresholds: strictly more than 300 ratings and a mean rating of at least 4.
size_filter = ratings_grp[('rating', 'size')].gt(300)
mean_filter = ratings_grp[('rating', 'mean')].ge(4)

# Keep only the books that pass both thresholds.
books_list = ratings_grp.loc[size_filter & mean_filter]

In [None]:
# Rank the shortlisted books from the highest to the lowest mean rating.
popular_books = books_list.sort_values(by=('rating', 'mean'), ascending=False)
popular_books

In [None]:
def recommend_popular(df, pop_df, uid, n):
    '''
    This function recommends popular books that a user has not read yet.

    Parameters:
    df (DataFrame): The dataframe containing user-book interactions
        (needs the 'user_id' and 'parent_asin' columns).
    pop_df (DataFrame): The dataframe containing popular books, indexed by
        'parent_asin' and already ordered from most to least popular.
    uid (str): The user ID for whom recommendations are to be made.
    n (int): The number of recommendations to return.

    Returns:
    list: A list of the top 'n' popular books that the user has not read yet,
        in the order of 'pop_df'. Shorter than 'n' if too few unread books remain.
    '''
    # Books the user has already read, as a set so that each membership test
    # below is a hash lookup instead of a scan over an array
    read_books = set(df.loc[df['user_id'] == uid, 'parent_asin'])

    # Popular books the user has not read yet, keeping the popularity order
    to_read = [asin for asin in pop_df.index if asin not in read_books]

    # Return the top 'n' books from the list of books to read
    return to_read[:n]

In [10]:
# Top 3 popular books the selected user has not read yet.
recommend_popular(df=books_data, pop_df=popular_books, uid=uid, n=3)

['679805273', '399226907', '486789640']

# Item-Based Collaborative Filtering

In [None]:
# Restrict the interactions to ratings of books that made it into popular_books.
is_popular = books_data['parent_asin'].isin(popular_books.index)
pop_books = books_data[is_popular]
pop_books

Unnamed: 0,user_id,parent_asin,rating,timestamp
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5,1.523090e+12
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1,1.611620e+12
15,AGKASBHYZPGTEPO6LWZPVJWB2BVA,803736800,4,1.454680e+12
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,920668372,5,1.430060e+12
134,AGKFRCDY4WBW4RH6GFYFJ3T3XBSA,B016ZNRC0Q,5,1.533430e+12
...,...,...,...,...
1048391,AHOTRV7O3LMRFLM3D3AFTMUZQZUA,312510780,5,1.542830e+12
1048451,AG3YEFX4MSN2JFQSQ5IVOZJEB2ZA,545392551,4,1.522870e+12
1048467,AG3YEFX4MSN2JFQSQ5IVOZJEB2ZA,545261244,4,1.538850e+12
1048549,AHV6YWP7LUK54DOTM56PUK76FNUA,312510780,5,1.455150e+12


In [12]:
# Persist the popular-book interactions to 'popular_books.csv' (row index not written).
pop_books.to_csv('popular_books.csv', index=False)

In [None]:
# Utility matrix: one row per user, one column per book (parent_asin), cell = rating.
# Cells with no rating stay NaN; pivot_table's default aggregation (mean) combines
# multiple ratings of the same book by the same user.
um = pd.pivot_table(pop_books, values='rating', index='user_id', columns='parent_asin')
um

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,5.0,,,,,,5.0,,,,...,,,,,,,,,,
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,,,,,,,,,,,...,,4.0,,,,,,,,
AE22M65RFUBDK73HHPM73G3IVPFA,,,,,,,,,,,...,,,,5.0,,,,,,
AE22PJ54OVIRX3I6KSLMPRHPHA4A,,,,,,,,,,,...,,,5.0,,,,,,,
AE2354O5OHFEFYH6IL7KWZOBG3EA,,,,,,,,,,,...,,,4.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,,,,,,,,,,,...,,,,,,,,,,
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,,,,,,,5.0,,,,...,,,,,,,,,,
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,,,,,,,,,,,...,,,,,,,,,4.0,
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,,,,,,5.0,,,,,...,,,,,,,,,,


In [14]:
# Persist the utility matrix to 'um.csv'.
# NOTE(review): index=False drops the row index, which holds the user_id labels, so the
# file contains only the book columns. The reconstructed matrix is saved with index=True;
# confirm that losing the user IDs here is intended.
um.to_csv('um.csv', index=False)

### Build KNN Model using Utility Matrix

In [None]:
# Unrated cells become 0 so the matrix is fully numeric.
um_imputed = um.fillna(value=0)

# Swap rows and columns: every row is now one book described by its ratings from all users.
um_trans_imputed = um_imputed.transpose()

In [None]:
# Nearest-neighbour index over the books (rows of the transposed, zero-filled matrix).
# 4 neighbours are requested because each book's own row is among its neighbours.
# NOTE(review): no metric is passed, so scikit-learn's default (Minkowski, p=2, i.e.
# Euclidean) is used. On zero-filled sparse ratings this tends to pull every book towards
# the least-rated ones; consider metric='cosine' and confirm against the neighbour table.
nn = NearestNeighbors(n_neighbors=4).fit(um_trans_imputed)
nn

In [None]:
# Row i lists the positions (in um_trans_imputed) of the books closest to book i.
# Only indices are requested, not the distances.
neighbors = nn.kneighbors(X=um_trans_imputed, return_distance=False)
neighbors

array([[ 0,  6, 15, 10],
       [ 1, 24, 26, 13],
       [ 2, 24, 13, 26],
       [ 3,  4,  8, 13],
       [ 4,  8, 26, 24],
       [ 5, 13, 26, 24],
       [ 6, 15, 26, 24],
       [ 7, 13, 24, 26],
       [ 8, 24, 26, 13],
       [ 9, 24, 26, 13],
       [10, 26, 24, 13],
       [11, 13, 15, 24],
       [12,  1, 24, 26],
       [13, 24, 26, 23],
       [14,  7, 26, 24],
       [15, 13, 26, 24],
       [16, 24, 26, 23],
       [17, 24, 26, 23],
       [18, 28, 26, 25],
       [19, 28, 24, 26],
       [20, 24, 17, 28],
       [21, 24, 23, 26],
       [22, 24, 26, 23],
       [23, 24, 26, 21],
       [24, 26, 23, 25],
       [25, 24, 26, 28],
       [26, 24, 23, 28],
       [27, 26, 28, 24],
       [28, 26, 24, 25]], dtype=int64)

In [None]:
def recommender_system(user, df, um_mat, neighbors, n):
    '''
    This function recommends books to a user based on item-based collaborative filtering.

    For every book the user rated 5, the book's nearest neighbours are collected;
    books the user has already read are discarded and the remaining candidates are
    ranked by how often they were collected.

    Parameters:
    user (str): The user ID for whom recommendations are to be made.
    df (DataFrame): The dataframe containing user-book interactions
        (needs the 'user_id', 'parent_asin' and 'rating' columns).
    um_mat (DataFrame): A matrix whose index holds the books, in the same order
        as the rows of 'neighbors'.
    neighbors (ndarray): The array containing the indices of the nearest neighbors for each book.
    n (int): The number of recommendations to return.

    Returns:
    Series: A series indexed by recommended book whose values are the number of times
        the book was collected, limited to the top 'n' entries (empty if the user has
        no 5-star book or every neighbour was already read).
    '''
    user_rows = df.loc[df['user_id'] == user]

    # Books the user has already read. This must be a set of *values*: `x in Series`
    # tests the Series' index labels, so a Series would never filter anything out.
    consumed = set(user_rows['parent_asin'])

    # Books that the user has rated 5
    best_items = user_rows.loc[user_rows['rating'] == 5, 'parent_asin']

    items = um_mat.index
    best_list = []

    # For each top-rated book, collect the nearest neighbors that the user has not read yet
    for item in best_items:
        if item not in items:
            # No neighbour row exists for this book, so it cannot contribute candidates
            continue
        idx = items.get_loc(item)
        # The book itself is one of its own neighbours; it is removed by the
        # `consumed` check because the user has necessarily read it.
        best_list.extend(
            items[i] for i in neighbors[idx] if items[i] not in consumed
        )

    # Rank candidates by how many of the user's favourite books point to them
    return pd.Series(best_list).value_counts().head(n)

In [20]:
# Item-based recommendations from the rating-vector neighbours.
recommender_system(user=uid, df=pop_books, um_mat=um_trans_imputed, neighbors=neighbors, n=3)

B01KXQ8SS6    1
B01M7XPGYE    1
B01B1OGQH4    1
Name: count, dtype: int64

### Build KNN Model Using the Correlation Matrix of the Utility Matrix

In [None]:
# Book-by-book Pearson correlation of the (un-imputed) utility matrix columns.
um_corr = um.corr(method='pearson')

# Pairs for which no correlation could be computed are treated as uncorrelated (0).
um_corr_imp = um_corr.fillna(value=0)

In [None]:
# Nearest-neighbour index over the books, each book now being described by its row of
# correlations with every other book. 4 neighbours are requested, as for the first model.
nn_corr = NearestNeighbors(n_neighbors=4).fit(um_corr_imp)
nn_corr

In [None]:
# Row i lists the positions (in um_corr_imp) of the books whose correlation profile is
# closest to that of book i. Only indices are requested, not the distances.
neighbors1 = nn_corr.kneighbors(X=um_corr_imp, return_distance=False)
neighbors1

array([[ 0, 15,  1,  8],
       [ 1,  0,  8,  4],
       [ 2, 25,  8, 20],
       [ 3, 21, 20, 27],
       [ 4,  1,  8, 21],
       [ 5,  1,  4,  0],
       [ 6,  9, 14,  0],
       [ 7, 19, 20, 21],
       [ 8,  0,  1, 10],
       [ 9,  6,  0, 16],
       [10,  8, 11,  9],
       [11, 10,  6,  7],
       [12, 28, 24, 26],
       [13,  9, 10, 21],
       [14, 16,  6,  9],
       [15,  0,  1, 21],
       [16, 26, 14,  9],
       [17, 26, 27, 24],
       [18, 24, 20, 28],
       [19, 24, 20, 21],
       [20, 27, 21, 26],
       [21, 27, 24, 20],
       [22, 21, 18, 24],
       [23, 21, 20, 25],
       [24, 19, 28, 25],
       [25, 24, 20, 19],
       [26, 24, 20, 16],
       [27, 20, 21, 24],
       [28, 24, 27, 26]], dtype=int64)

In [25]:
# Item-based recommendations from the correlation-profile neighbours.
recommender_system(user=uid, df=pop_books, um_mat=um_corr_imp, neighbors=neighbors1, n=3)

B00YTXTIDO    1
B00DPM7TIG    1
B01KXQ8SS6    1
Name: count, dtype: int64

# SVD Model

In [None]:
# Row-wise (per-user) mean of the zero-filled utility matrix.
# Because unrated cells were filled with 0, this averages over every book column,
# not only over the books the user actually rated.
um_means = um_imputed.mean(axis=1)
um_means

user_id
AE22GTKUFOI2DJ62HUAKOQJACGRA    0.517241
AE22HGEZAMTLMOIYGFGMSTWZCBTQ    0.137931
AE22M65RFUBDK73HHPM73G3IVPFA    0.172414
AE22PJ54OVIRX3I6KSLMPRHPHA4A    0.517241
AE2354O5OHFEFYH6IL7KWZOBG3EA    0.137931
                                  ...   
AHZZNY4I7DJBEGVSPF4Z6L55G7LA    0.137931
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ    0.172414
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ    0.137931
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ    0.172414
AHZZSUQJOYF7TNCKR4V3KFZJORZQ    0.137931
Length: 12748, dtype: float64

In [None]:
# Centre every user's row by subtracting that user's mean from each of their cells.
um_demeaned = um_imputed.sub(um_means, axis=0)
um_demeaned

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,4.482759,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,4.482759,-0.517241,-0.517241,-0.517241,...,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,3.862069,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
AE22M65RFUBDK73HHPM73G3IVPFA,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414
AE22PJ54OVIRX3I6KSLMPRHPHA4A,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,...,-0.517241,-0.517241,4.482759,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241
AE2354O5OHFEFYH6IL7KWZOBG3EA,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,3.862069,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,3.862069,-0.137931
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414


In [None]:
# Rank of the demeaned utility matrix, i.e. the number of non-zero singular values.
r = np.linalg.matrix_rank(um_demeaned.to_numpy())
r

28

In [None]:
# Fit a TruncatedSVD model on the demeaned utility matrix.
# The number of components follows the rank `r` computed for that matrix instead of a
# hard-coded 28, so this cell stays correct if the set of popular books changes.
# random_state=42 fixes the randomized solver so the fit is reproducible.
svd = TruncatedSVD(n_components=int(r), random_state=42)
svd.fit(um_demeaned)

In [None]:
# Save the trained SVD model to a file named 'svd.pickle'.
# `pickle` is imported together with the other libraries at the top of the notebook.
with open('svd.pickle', 'wb') as f:
    pickle.dump(svd, f)

In [None]:
# Perform randomized SVD on the demeaned utility matrix to obtain U, sigma and Vt.
# The number of components equals the rank `r` of the matrix, and random_state is fixed
# so that repeated runs produce the same factors.
# NOTE(review): with as many components as the rank, U · diag(sigma) · Vt reproduces
# um_demeaned up to floating-point error, so unrated cells are reconstructed as their
# original (zero-filled) value. Consider fewer components if the goal is to predict
# unrated books; confirm the intent.
U, sigma, Vt = randomized_svd(um_demeaned.to_numpy(), n_components=int(r), random_state=42)

In [None]:
# Save the U, sigma, and Vt matrices, as one tuple, to a file named 'U_sigma_Vt.pickle'.
# At this point sigma is still the 1-D vector of singular values returned by randomized_svd.
with open('U_sigma_Vt.pickle', 'wb') as f:
    pickle.dump((U, sigma, Vt), f)

In [35]:
# Shapes of the three factors, in the order U, sigma, Vt.
tuple(factor.shape for factor in (U, sigma, Vt))

((12748, 28), (28,), (28, 29))

In [None]:
# Convert the singular values (sigma) into a diagonal matrix.
# `sigma` keeps its name because later cells pickle it in this 2-D form. The ndim guard
# makes the cell safe to re-run: np.diag applied to an already 2-D matrix would extract
# the diagonal back into a vector and break the matrix product below.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

# Reconstruct the utility matrix by multiplying the U, sigma, and Vt matrices
um_repro = U @ sigma @ Vt

# Add the mean ratings back (one value per user row, broadcast across the book columns)
# to get the final predicted ratings
um_repro += um_means.values.reshape(-1, 1)

In [None]:
# Wrap the reconstructed array in a DataFrame that reuses the user index and the book
# columns of um_imputed, so predictions can be looked up by user ID and parent_asin.
um_repro = pd.DataFrame(
    data=um_repro,
    index=um_imputed.index,
    columns=um_imputed.columns,
)

In [None]:
# Save the reconstructed utility matrix to 'um_repro.csv'.
# index=True writes the user_id index as the first column.
um_repro.to_csv('um_repro.csv', index=True)

In [None]:
def recommend_books_svd(user, df, um, n):
    '''
    This function recommends books to a user based on the SVD model.

    Parameters:
    user (str): The user ID for whom recommendations are to be made.
        Must be a row label of 'um' (a KeyError is raised otherwise).
    df (DataFrame): The dataframe containing user-book interactions
        (needs the 'user_id' and 'parent_asin' columns).
    um (DataFrame): The reconstructed utility matrix with predicted ratings
        (rows are users, columns are books).
    n (int): The number of recommendations to return.

    Returns:
    Index: An index containing the top 'n' recommended books for the user,
        ordered from the highest to the lowest predicted rating.
    '''
    # Books that the user has already read
    consumed = df.loc[df['user_id'] == user, 'parent_asin']

    # Predicted ratings for the user, best first
    ranked = um.loc[user, :].sort_values(ascending=False)

    # Keep only unread books. A boolean mask is used instead of drop() so that read
    # books which are not columns of 'um' (e.g. when 'df' is the full ratings table)
    # are simply ignored rather than raising a KeyError.
    unread = ranked[~ranked.index.isin(consumed)]

    # Return the top 'n' recommended books
    return unread.index[:n]

In [None]:
# Save the 'recommend_books_svd' function to a file named 'recommender_books_svd.pickle'.
# The function object itself is dumped (previously only the string 'recommender_books_svd'
# was written). pickle stores functions by reference (module and name), not their code,
# so whoever loads this file must define or import `recommend_books_svd` first.
with open('recommender_books_svd.pickle', 'wb') as f:
    pickle.dump(recommend_books_svd, f)

In [42]:
# Top 3 SVD-based recommendations for the selected user.
recommend_books_svd(user=uid, df=pop_books, um=um_repro, n=3)

Index(['679805273', 'B00L9B7IKE', 'B00JO8PEN2'], dtype='object', name='parent_asin')

# RMSE Between the Utility Matrix and the SVD-Reconstructed Matrix


In [None]:
# RMSE between the zero-filled utility matrix and its SVD reconstruction.
# The reconstruction uses as many components as the rank of the demeaned matrix, so this
# is a reconstruction error on the very matrix that was factorised (expected to be ~0),
# not an estimate of accuracy on held-out ratings.
rmse = root_mean_squared_error(y_true=um_imputed.to_numpy(), y_pred=um_repro)
print(f"RMSE ({rmse})")

RMSE (8.510422854198623e-15)


In [44]:
# Saving Models
# Both files are written again here; by this point `sigma` has been rebound to its
# diagonal-matrix form, so that is the form stored in 'U_sigma_Vt.pickle'.
for path, payload in (('svd.pickle', svd), ('U_sigma_Vt.pickle', (U, sigma, Vt))):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)