# Libraries

In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors
from sklearn.utils.extmath import randomized_svd


# Read Data

In [2]:
# parent_asin: Parent ID of the product.
# Both ID columns are pinned to str so that numeric-looking ASINs (e.g.
# '1446304000') can never be inferred as integers and later fail to match
# string IDs such as 'B00JO8PEN2'.
# NOTE(review): the frame has 1,048,575 rows, which is exactly Excel's row
# limit minus the header -- confirm that Books.csv was not truncated (and that
# leading zeros of ISBN-style IDs were not stripped) by a spreadsheet round-trip.
books_data = pd.read_csv(
    'Books.csv',
    dtype={'user_id': str, 'parent_asin': str},
)
books_data

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1446304000,5,1.441260e+12
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1564770672,5,1.441260e+12
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5,1.523090e+12
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1,1.611620e+12
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1645671127,3,1.612040e+12
...,...,...,...,...
1048570,AH3EZV6Y6KIH5DYPZMCPGDVNXOGA,307986934,5,1.515860e+12
1048571,AH3EZV6Y6KIH5DYPZMCPGDVNXOGA,1400033411,5,1.515860e+12
1048572,AGYKAPDJ2TWJQUCTYDV5POTZCRWA,178221206X,5,1.473130e+12
1048573,AGYKAPDJ2TWJQUCTYDV5POTZCRWA,316217182,5,1.477420e+12


In [3]:
# Column dtypes and non-null counts; show_counts=True forces the counts to be
# printed even though the frame is large.
books_data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1048575 non-null  object 
 1   parent_asin  1048575 non-null  object 
 2   rating       1048575 non-null  int64  
 3   timestamp    1048575 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 32.0+ MB


# Ask User to input ID

In [4]:
# A set gives O(1) membership tests. The original list held one entry per
# rating row (duplicates included), so every lookup was a linear scan.
valid_user_ids = set(books_data['user_id'])


def get_valid_user_id():
    """Prompt on stdin until the entered ID is a known user, then return it.

    An ID is accepted when it appears in ``valid_user_ids`` (every user that
    has at least one row in ``books_data``). Leading/trailing whitespace in
    the typed value is ignored.

    NOTE(review): this blocks "Restart & Run All"; the later KNN/SVD cells
    additionally require the user to have rated one of the popular books.
    """
    while True:
        uid = input("Enter user ID: ").strip()
        if uid in valid_user_ids:
            return uid
        print("Invalid ID. Please enter a valid ID.")


uid = get_valid_user_id()
print("Valid user ID entered:", uid)

Valid user ID entered: AE22M65RFUBDK73HHPM73G3IVPFA


# Popularity-Based Model

In [5]:
# Per-book rating count, rating sum and mean rating.
# The string 'size' replaces np.size: passing NumPy callables to agg() is
# deprecated in recent pandas, and the string keeps the resulting column
# label ('rating', 'size') that the filter cell relies on.
ratings_grp = books_data.groupby('parent_asin').agg({'rating': ['size', 'sum', 'mean']})

In [6]:
# Inspect the per-book aggregates (columns are a MultiIndex under 'rating').
ratings_grp

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,size,sum,mean
parent_asin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
000171211X,2,10,5.000000
000171287X,1,4,4.000000
000215854X,2,10,5.000000
000215949X,4,17,4.250000
000223078X,1,5,5.000000
...,...,...,...
B0C8GFSLG9,4,15,3.750000
B0C8GGPD1H,12,54,4.500000
B0C8GHMWG7,69,325,4.710145
B0C8GJYMNH,124,589,4.750000


In [7]:
# Popularity thresholds, named so they can be tuned in one place.
MIN_RATING_COUNT = 300  # a book needs strictly more ratings than this
MIN_MEAN_RATING = 4     # ...and a mean rating of at least this

size_filter = ratings_grp[('rating', 'size')] > MIN_RATING_COUNT
mean_filter = ratings_grp[('rating', 'mean')] >= MIN_MEAN_RATING

# Books that satisfy both conditions.
books_list = ratings_grp[size_filter & mean_filter]

In [8]:
# Rank the filtered books from highest to lowest mean rating.
popular_books = books_list.sort_values(by=('rating', 'mean'), ascending=False)
popular_books

Unnamed: 0_level_0,rating,rating,rating
Unnamed: 0_level_1,size,sum,mean
parent_asin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
679805273,1175,5769,4.909787
399226907,456,2216,4.859649
486789640,310,1493,4.816129
1780674880,600,2879,4.798333
545392551,470,2252,4.791489
1442450703,940,4496,4.782979
1524763136,302,1442,4.774834
312510780,482,2291,4.753112
B00JO8PEN2,649,3084,4.751926
1780671067,1585,7476,4.716719


In [9]:
def recommend_popular(df, pop_df, uid, n):
    """Return up to ``n`` popular books that ``uid`` has not rated yet.

    Parameters
    ----------
    df : DataFrame with 'user_id' and 'parent_asin' columns (one row per rating).
    pop_df : DataFrame indexed by book ID, already ordered best-first.
    uid : ID of the user to recommend for.
    n : maximum number of book IDs to return.

    Returns
    -------
    list of book IDs in ``pop_df`` order, excluding books the user already rated.
    """
    # A set makes each membership test O(1); the original tested against a
    # NumPy array, i.e. a full scan for every popular book.
    read_books = set(df.loc[df['user_id'] == uid, 'parent_asin'])
    to_read = [asin for asin in pop_df.index if asin not in read_books]

    return to_read[:n]

In [10]:
# Top-3 popular books the selected user has not rated yet.
recommend_popular(df=books_data, pop_df=popular_books, uid=uid, n=3)

['679805273', '399226907', '486789640']

# Item-Based Collaborative Filtering

In [11]:
# Keep only the rating rows whose book is in the popular-books list.
is_popular = books_data['parent_asin'].isin(popular_books.index)
pop_books = books_data.loc[is_popular]
pop_books

Unnamed: 0,user_id,parent_asin,rating,timestamp
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5,1.523090e+12
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1,1.611620e+12
15,AGKASBHYZPGTEPO6LWZPVJWB2BVA,803736800,4,1.454680e+12
27,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,920668372,5,1.430060e+12
134,AGKFRCDY4WBW4RH6GFYFJ3T3XBSA,B016ZNRC0Q,5,1.533430e+12
...,...,...,...,...
1048391,AHOTRV7O3LMRFLM3D3AFTMUZQZUA,312510780,5,1.542830e+12
1048451,AG3YEFX4MSN2JFQSQ5IVOZJEB2ZA,545392551,4,1.522870e+12
1048467,AG3YEFX4MSN2JFQSQ5IVOZJEB2ZA,545261244,4,1.538850e+12
1048549,AHV6YWP7LUK54DOTM56PUK76FNUA,312510780,5,1.455150e+12


In [12]:
# Persist the popular-book ratings; the row index is not written.
pop_books.to_csv('popular_books.csv', index=False)

In [13]:
# Utility matrix: one row per user, one column per popular book.
# Multiple ratings of the same book by the same user are averaged
# (pivot_table's default aggregation, spelled out here).
um = pop_books.pivot_table(
    values='rating',
    index='user_id',
    columns='parent_asin',
    aggfunc='mean',
)
um

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,5.0,,,,,,5.0,,,,...,,,,,,,,,,
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,,,,,,,,,,,...,,4.0,,,,,,,,
AE22M65RFUBDK73HHPM73G3IVPFA,,,,,,,,,,,...,,,,5.0,,,,,,
AE22PJ54OVIRX3I6KSLMPRHPHA4A,,,,,,,,,,,...,,,5.0,,,,,,,
AE2354O5OHFEFYH6IL7KWZOBG3EA,,,,,,,,,,,...,,,4.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,,,,,,,,,,,...,,,,,,,,,,
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,,,,,,,5.0,,,,...,,,,,,,,,,
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,,,,,,,,,,,...,,,,,,,,,4.0,
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,,,,,,5.0,,,,,...,,,,,,,,,,


In [14]:
# Keep the index: it holds the user_id of every row. Writing with index=False
# produced a matrix whose rows could no longer be mapped back to users
# (um_repro.csv further down is already written with its index).
um.to_csv('um.csv', index=True)

### Build KNN Model using Utility Matrix

In [15]:
# Replace missing (unrated) cells with 0, then flip to items x users so that
# each row is one book's rating vector.
um_imputed = um.fillna(value=0)
um_trans_imputed = um_imputed.transpose()

In [16]:
# Inspect the transposed, zero-filled matrix (rows: books, columns: users).
um_trans_imputed

user_id,AE22GTKUFOI2DJ62HUAKOQJACGRA,AE22HGEZAMTLMOIYGFGMSTWZCBTQ,AE22M65RFUBDK73HHPM73G3IVPFA,AE22PJ54OVIRX3I6KSLMPRHPHA4A,AE2354O5OHFEFYH6IL7KWZOBG3EA,AE237V5JNIBUKR3V3UFWRDMLHUCQ,AE23CIZ4OTQEFKSROZ3RHWCKBUIQ,AE23T6OM7RCPTDZIOGN34NNOBQQQ,AE23X3A5G7DT7BVOUSP4VIY5KG4Q,AE242RFKVOAWUWYFD2JQVUSZY7TA,...,AHZXVZECCDWVAHMZO5ORUMYQ4Y6Q,AHZYY6UQMPNWWNOLXTPUYHPY3ONA,AHZZ3WF5DQQTPZV2R3EPXIPHCIDA,AHZZ6Q6Y2NBBQDDCQHEEQMN4YHFA,AHZZDXW53XBSOZL4VDMLSCX6QMRQ,AHZZNY4I7DJBEGVSPF4Z6L55G7LA,AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,AHZZSUQJOYF7TNCKR4V3KFZJORZQ
parent_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1442450703,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1524763136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1607747308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1780671067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1780674880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
312510780,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
399226907,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
399255370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486789640,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
545261244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# KNN over books: each sample is one row of um_trans_imputed.
# No metric is passed, so scikit-learn's default distance is used.
# NOTE(review): 4 neighbours are requested and the recommender later skips
# column 0 of the neighbour array -- presumably to drop the book itself,
# leaving 3 real neighbours per book; confirm.
nn = NearestNeighbors(n_neighbors=4)
nn.fit(um_trans_imputed)

In [18]:
# Positional indices (into um_trans_imputed's rows) of each book's neighbours;
# distances are not requested.
neighbors = nn.kneighbors(X=um_trans_imputed, return_distance=False)
neighbors

array([[ 0,  6, 15, 10],
       [ 1, 24, 26, 13],
       [ 2, 24, 13, 26],
       [ 3,  4,  8, 13],
       [ 4,  8, 26, 24],
       [ 5, 13, 26, 24],
       [ 6, 15, 26, 24],
       [ 7, 13, 24, 26],
       [ 8, 24, 26, 13],
       [ 9, 24, 26, 13],
       [10, 26, 24, 13],
       [11, 13, 15, 24],
       [12,  1, 24, 26],
       [13, 24, 26, 23],
       [14,  7, 26, 24],
       [15, 13, 26, 24],
       [16, 24, 26, 23],
       [17, 24, 26, 23],
       [18, 28, 26, 25],
       [19, 28, 24, 26],
       [20, 24, 17, 28],
       [21, 24, 23, 26],
       [22, 24, 26, 23],
       [23, 24, 26, 21],
       [24, 26, 23, 25],
       [25, 24, 26, 28],
       [26, 24, 23, 28],
       [27, 26, 28, 24],
       [28, 26, 24, 25]], dtype=int64)

In [19]:
def recommender_system(user, df, um_mat, neighbors, n):
    """Recommend books that are nearest neighbours of the user's 5-star books.

    Parameters
    ----------
    user : ID of the user to recommend for.
    df : DataFrame with 'user_id', 'parent_asin' and 'rating' columns.
    um_mat : DataFrame whose index holds the book IDs, in the same row order
        that was used to build ``neighbors``.
    neighbors : 2-D integer array; row ``i`` holds positions in
        ``um_mat.index`` of the neighbours of book ``i``. Column 0 is skipped
        (it is treated as the book itself).
    n : maximum number of recommendations to return.

    Returns
    -------
    pandas.Series mapping candidate book ID -> number of the user's 5-star
    books it neighbours, most frequent first, at most ``n`` entries. Empty if
    the user has no usable 5-star book.
    """
    user_rows = df.loc[df['user_id'] == user]
    # Must be a set of *values*: `x in some_series` tests the Series' index
    # labels, so the original filter never removed already-read books.
    consumed = set(user_rows['parent_asin'])
    best_items = user_rows.loc[user_rows['rating'] == 5, 'parent_asin']  # top rated items

    best_list = []
    for item in best_items:
        # A 5-star book that is not part of the matrix has no neighbours to offer.
        if item not in um_mat.index:
            continue
        idx = um_mat.index.get_loc(item)
        for i in neighbors[idx, 1:]:
            candidate = um_mat.index[i]
            if candidate not in consumed:
                best_list.append(candidate)

    return pd.Series(best_list).value_counts().head(n)

In [20]:
# Top-3 KNN recommendations based on the zero-filled utility matrix.
recommender_system(user=uid, df=pop_books, um_mat=um_trans_imputed, neighbors=neighbors, n=3)

B01KXQ8SS6    1
B01M7XPGYE    1
B01B1OGQH4    1
Name: count, dtype: int64

### Build KNN model Using Correlation of um

In [21]:
# Book-to-book Pearson correlation computed from the raw (NaN-containing)
# utility matrix; undefined correlations are then set to 0.
um_corr = um.corr(method='pearson')
um_corr_imp = um_corr.fillna(value=0)

In [22]:
# Inspect the book x book correlation matrix after filling NaN with 0.
um_corr_imp

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
parent_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1442450703,1.0,0.0,0.0,-0.311805,-0.333333,0.164141,0.285549,-0.146647,0.0,0.0,...,-0.25,-0.132453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1524763136,0.0,1.0,-0.57735,0.0,0.0,0.522233,0.0,0.0,0.0,-0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1607747308,0.0,-0.57735,1.0,-0.284277,0.0,-0.395285,0.0,-0.25,0.0,0.0,...,0.0,0.218218,0.0,0.0,0.0,0.0,0.866025,0.0,0.0,0.0
1780671067,-0.311805,0.0,-0.284277,1.0,0.66279,0.759961,0.0,0.534522,0.078586,0.0,...,-0.26968,0.44019,0.759257,0.165395,1.0,0.0,0.0,-0.408248,1.0,0.0
1780674880,-0.333333,0.0,0.0,0.66279,1.0,1.0,0.0,0.0,0.30099,0.0,...,0.0,-0.103695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
312510780,0.164141,0.522233,-0.395285,0.759961,1.0,1.0,-0.148168,-0.188982,0.0,-0.2,...,0.0,0.0,0.0,-0.333333,0.0,0.0,0.0,0.0,-0.5,0.0
399226907,0.285549,0.0,0.0,0.0,0.0,-0.148168,1.0,0.0,0.0,0.329956,...,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
399255370,-0.146647,0.0,-0.25,0.534522,0.0,-0.188982,0.0,1.0,0.0,0.0,...,0.870388,0.544949,0.0,0.0,0.0,0.0,0.0,0.0,0.4082483,0.0
486789640,0.0,0.0,0.0,0.078586,0.30099,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
545261244,0.0,-0.333333,0.0,0.0,0.0,-0.2,0.329956,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Second KNN model: here each sample is one row of the book x book
# correlation matrix. No metric is passed, so scikit-learn's default
# distance is used.
nn_corr = NearestNeighbors(n_neighbors=4)
nn_corr.fit(um_corr_imp)

In [24]:
# Positional neighbour indices for every book, from the correlation-based model.
neighbors1 = nn_corr.kneighbors(X=um_corr_imp, return_distance=False)
neighbors1

array([[ 0, 15,  1,  8],
       [ 1,  0,  8,  4],
       [ 2, 25,  8, 20],
       [ 3, 21, 20, 27],
       [ 4,  1,  8, 21],
       [ 5,  1,  4,  0],
       [ 6,  9, 14,  0],
       [ 7, 19, 20, 21],
       [ 8,  0,  1, 10],
       [ 9,  6,  0, 16],
       [10,  8, 11,  9],
       [11, 10,  6,  7],
       [12, 28, 24, 26],
       [13,  9, 10, 21],
       [14, 16,  6,  9],
       [15,  0,  1, 21],
       [16, 26, 14,  9],
       [17, 26, 27, 24],
       [18, 24, 20, 28],
       [19, 24, 20, 21],
       [20, 27, 21, 26],
       [21, 27, 24, 20],
       [22, 21, 18, 24],
       [23, 21, 20, 25],
       [24, 19, 28, 25],
       [25, 24, 20, 19],
       [26, 24, 20, 16],
       [27, 20, 21, 24],
       [28, 24, 27, 26]], dtype=int64)

In [25]:
# Top-3 KNN recommendations based on the correlation matrix.
recommender_system(user=uid, df=pop_books, um_mat=um_corr_imp, neighbors=neighbors1, n=3)

B00YTXTIDO    1
B00DPM7TIG    1
B01KXQ8SS6    1
Name: count, dtype: int64

# SVD Model

In [26]:
# Raw utility matrix again (NaN = user has not rated the book).
um

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,5.0,,,,,,5.0,,,,...,,,,,,,,,,
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,,,,,,,,,,,...,,4.0,,,,,,,,
AE22M65RFUBDK73HHPM73G3IVPFA,,,,,,,,,,,...,,,,5.0,,,,,,
AE22PJ54OVIRX3I6KSLMPRHPHA4A,,,,,,,,,,,...,,,5.0,,,,,,,
AE2354O5OHFEFYH6IL7KWZOBG3EA,,,,,,,,,,,...,,,4.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,,,,,,,,,,,...,,,,,,,,,,
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,,,,,,,5.0,,,,...,,,,,,,,,,
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,,,,,,,,,,,...,,,,,,,,,4.0,
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,,,,,,5.0,,,,,...,,,,,,,,,,


In [27]:
# Zero-filled utility matrix that the SVD section starts from.
um_imputed

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AE22M65RFUBDK73HHPM73G3IVPFA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
AE22PJ54OVIRX3I6KSLMPRHPHA4A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AE2354O5OHFEFYH6IL7KWZOBG3EA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Row-wise mean of the zero-filled matrix.
# NOTE(review): because unrated cells were filled with 0, this is
# (sum of a user's ratings) / (number of books), not the user's average
# rating over the books they actually rated -- confirm this is intended.
um_means = um_imputed.mean(axis=1)
um_means

user_id
AE22GTKUFOI2DJ62HUAKOQJACGRA    0.517241
AE22HGEZAMTLMOIYGFGMSTWZCBTQ    0.137931
AE22M65RFUBDK73HHPM73G3IVPFA    0.172414
AE22PJ54OVIRX3I6KSLMPRHPHA4A    0.517241
AE2354O5OHFEFYH6IL7KWZOBG3EA    0.137931
                                  ...   
AHZZNY4I7DJBEGVSPF4Z6L55G7LA    0.137931
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ    0.172414
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ    0.137931
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ    0.172414
AHZZSUQJOYF7TNCKR4V3KFZJORZQ    0.137931
Length: 12748, dtype: float64

In [29]:
# Centre every user's row by subtracting that row's mean.
um_demeaned = um_imputed.sub(um_means, axis=0)
um_demeaned

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,4.482759,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,4.482759,-0.517241,-0.517241,-0.517241,...,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,3.862069,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
AE22M65RFUBDK73HHPM73G3IVPFA,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414
AE22PJ54OVIRX3I6KSLMPRHPHA4A,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,...,-0.517241,-0.517241,4.482759,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241,-0.517241
AE2354O5OHFEFYH6IL7KWZOBG3EA,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,3.862069,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,...,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,-0.137931,3.862069,-0.137931
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,4.827586,-0.172414,-0.172414,-0.172414,-0.172414,...,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414,-0.172414


In [30]:
# Numerical rank of the centred matrix. Each row had its own mean removed,
# so every row sums to zero and the rank cannot reach the number of columns.
r = np.linalg.matrix_rank(um_demeaned.to_numpy())
r

28

In [31]:
# Fit a TruncatedSVD on the centred matrix (seeded for reproducibility).
# NOTE(review): n_components=28 equals the rank reported by the previous cell,
# so no singular direction is discarded -- the factorisation can reproduce the
# input exactly instead of generalising (the RMSE of ~7e-15 at the end of the
# notebook is consistent with that). Consider a smaller number of components.
svd = TruncatedSVD(n_components=28, random_state=42)
svd.fit(um_demeaned)

In [32]:
# Persist the fitted TruncatedSVD estimator.
# (`pickle` is imported with the other libraries at the top of the notebook
# rather than in the middle of the analysis.)
with open('svd.pickle', 'wb') as f:
    pickle.dump(svd, f)

In [33]:
# Explicit factors U, sigma (1-D singular values) and Vt of the centred matrix.
# random_state is fixed, like the TruncatedSVD above; without it the
# randomized solver can return different factors on every run.
# (`randomized_svd` is imported with the other libraries at the top.)
U, sigma, Vt = randomized_svd(um_demeaned.to_numpy(), n_components=28, random_state=42)

In [34]:
# Persist the three SVD factors as one tuple. At this point sigma is still the
# 1-D array returned by randomized_svd.
with open('U_sigma_Vt.pickle', 'wb') as f:
    pickle.dump((U,sigma,Vt),f)

In [35]:
# Sanity-check the factor shapes: (users, k), (k,), (k, books).
U.shape, sigma.shape, Vt.shape

((12748, 28), (28,), (28, 29))

In [36]:
# Expand the 1-D singular values into a diagonal matrix exactly once.
# np.diag() applied to a 2-D array extracts its diagonal again, so without
# this guard re-running the cell flipped sigma back to 1-D and broke the
# matrix product below.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

# Rebuild the centred matrix from its factors, then add the row means back.
um_repro = U @ sigma @ Vt
um_repro += um_means.values.reshape(-1, 1)

In [37]:
# Re-attach the user (row) and book (column) labels to the reconstructed matrix.
um_repro = pd.DataFrame(
    data=um_repro,
    index=um_imputed.index,
    columns=um_imputed.columns,
)

In [38]:
# Inspect the reconstructed utility matrix (users x books).
um_repro

parent_asin,1442450703,1524763136,1607747308,1780671067,1780674880,312510780,399226907,399255370,486789640,545261244,...,B00JO8PEN2,B00L9B7IKE,B00YTXTIDO,B016ZNRC0Q,B01B1OGQH4,B01KXQ8SS6,B01L1CEZ6K,B01M7XPGYE,B06Y1264PX,B07415PPP1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AE22GTKUFOI2DJ62HUAKOQJACGRA,5.000000e+00,-7.438494e-15,-8.548717e-15,-2.642331e-14,1.346701e-13,6.505907e-14,5.000000e+00,-1.298961e-14,1.465494e-14,3.064216e-14,...,-4.252154e-14,-1.787459e-14,4.329870e-15,3.885781e-15,-2.109424e-15,-1.620926e-14,-5.662137e-15,-4.440892e-16,-3.896883e-14,-2.831069e-14
AE22HGEZAMTLMOIYGFGMSTWZCBTQ,-3.897438e-13,-8.493206e-15,-9.076073e-15,-2.361999e-14,-5.898060e-14,-1.310341e-13,-1.013634e-13,-8.423817e-14,-1.162959e-14,-1.064704e-13,...,1.780798e-13,4.000000e+00,4.216072e-14,9.994783e-14,5.068168e-14,4.732326e-14,5.948020e-14,5.548340e-14,3.969047e-15,6.350476e-14
AE22M65RFUBDK73HHPM73G3IVPFA,-1.387779e-16,3.302913e-15,-4.302114e-15,-1.379452e-14,2.023381e-14,6.905587e-14,9.992007e-16,-1.568190e-14,7.605028e-15,5.548340e-14,...,-2.770006e-14,2.908784e-14,-9.159340e-16,5.000000e+00,-4.662937e-15,-3.386180e-15,-7.271961e-15,-6.716849e-15,-8.554268e-14,-4.440892e-15
AE22PJ54OVIRX3I6KSLMPRHPHA4A,-1.716405e-13,-7.771561e-16,3.885781e-15,-1.809664e-14,2.298162e-14,3.996803e-14,4.107825e-15,1.387779e-14,7.216450e-15,5.528911e-14,...,-5.617729e-14,-1.050271e-13,5.000000e+00,1.165734e-14,1.343370e-14,1.176836e-14,7.771561e-16,1.587619e-14,-2.331468e-14,6.994405e-15
AE2354O5OHFEFYH6IL7KWZOBG3EA,3.885781e-16,-4.365952e-14,-2.275957e-14,-4.574119e-14,6.239453e-14,1.594003e-13,1.023348e-13,-1.615375e-14,-4.019007e-14,7.630008e-14,...,9.461876e-14,-1.975642e-13,4.000000e+00,-9.214851e-15,-4.493628e-14,-2.939315e-14,-2.300937e-14,-3.363976e-14,-4.607426e-14,-1.107447e-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AHZZNY4I7DJBEGVSPF4Z6L55G7LA,-3.330669e-16,8.881784e-16,-6.383782e-16,2.220446e-15,-6.439294e-15,-8.770762e-15,7.410739e-15,-1.026956e-15,-9.992007e-16,-1.026956e-14,...,1.471046e-15,-6.522560e-15,3.386180e-15,-1.582068e-15,9.714451e-16,1.304512e-15,-3.608225e-16,-2.137179e-15,-3.774758e-15,1.831868e-15
AHZZO6AJJ2YNY75G2FGYHFY3A4HQ,-8.132384e-15,2.775558e-17,-6.161738e-15,2.498002e-15,-6.272760e-15,1.582068e-15,5.000000e+00,1.193490e-15,-2.498002e-16,-4.440892e-15,...,-3.191891e-15,-3.302913e-15,-1.582068e-15,8.326673e-15,1.443290e-15,6.383782e-16,6.661338e-16,-1.276756e-15,-2.775558e-15,1.665335e-15
AHZZQNSG7UUC6YE5SKKA4HMCOQUQ,-5.717649e-15,1.221245e-15,-8.326673e-16,2.775558e-15,-1.776357e-15,-2.331468e-15,-1.609823e-15,1.387779e-16,2.498002e-16,3.330669e-16,...,-3.691492e-15,9.159340e-15,8.326673e-16,4.551914e-15,8.326673e-16,1.110223e-15,0.000000e+00,-9.714451e-16,4.000000e+00,0.000000e+00
AHZZSIYMO7GGYHGEJA7D6K5MTSGQ,7.882583e-15,-2.914335e-15,3.247402e-15,-4.274359e-15,6.272760e-15,5.000000e+00,1.720846e-15,-4.163336e-16,3.552714e-15,-8.076873e-15,...,-4.218847e-15,-3.164136e-15,-2.775558e-17,1.942890e-16,-7.216450e-16,-7.216450e-16,1.942890e-15,2.081668e-15,-6.855627e-15,3.080869e-15


In [39]:
# Persist the reconstructed matrix together with its user_id index.
um_repro.to_csv('um_repro.csv', index=True)

In [40]:
# Predict books 
def recommend_books_svd(user, df, um, n):
    """Return the ``n`` highest-scored books for ``user`` that they have not rated.

    Parameters
    ----------
    user : ID of the user; must be a row label of ``um``.
    df : DataFrame with 'user_id' and 'parent_asin' columns (one row per rating).
    um : DataFrame of predicted scores, users as rows and books as columns.
    n : maximum number of book IDs to return.

    Returns
    -------
    pandas.Index of book IDs, best score first.

    Raises
    ------
    KeyError
        If ``user`` is not a row of ``um``.
    """
    consumed = df.loc[df['user_id'] == user, 'parent_asin']
    user_books = um.loc[user, :].sort_values(ascending=False)
    # errors='ignore': a book the user rated may not be a column of `um`
    # (e.g. when `df` is the full ratings table); the original raised
    # KeyError in that case instead of simply skipping it.
    user_books = user_books.drop(index=consumed.unique(), errors='ignore')

    return user_books.index[:n]

In [41]:
# Pickle the function object. The original dumped the *string*
# 'recommender_books_svd' (which is not even the function's name), so the
# file contained only that text.
# NOTE: pickle stores functions by reference (module + name); whoever loads
# this file must have `recommend_books_svd` defined/importable under the same
# module name.
with open('recommender_books_svd.pickle', 'wb') as f:
    pickle.dump(recommend_books_svd, f)

In [42]:
# Top-3 SVD-based recommendations for the selected user.
recommend_books_svd(user=uid, df=pop_books, um=um_repro, n=3)

Index(['312510780', '545261244', 'B00DPM7TIG'], dtype='object', name='parent_asin')

# RMSE between the utility matrix (um) and its SVD reconstruction


In [43]:
# Reconstruction error of the SVD against the zero-filled utility matrix.
# This is measured on the same data the factors were computed from, so it
# says how well the matrix is reproduced, not how well unseen ratings are predicted.
rmse = root_mean_squared_error(y_true=um_imputed.to_numpy(), y_pred=um_repro)
print(f"RMSE ({rmse})")

RMSE (6.992281410137445e-15)


In [44]:
# Saving Models
with open('svd.pickle', 'wb') as f:
    pickle.dump(svd, f)

with open('U_sigma_Vt.pickle', 'wb') as f:
    pickle.dump((U, sigma, Vt), f)