In [1]:
import pandas as pd
from random import randint

In [2]:
from generate_dataset import generate_data
# Build the synthetic ratings dataset used throughout the notebook (100k rows requested).
df = generate_data(dataset_size=100_000)


In [3]:
# Peek at the last five rows to sanity-check the generated columns.
df.tail()

Unnamed: 0,book_id,author_id,book_genre,reader_id,num_pages,book_rating,publisher_id,publisher_year,book_price,text_lang
99995,947,346,6,20703,100,5,24,2020,71,3
99996,2251,159,4,1609,84,5,21,2001,53,7
99997,1804,73,5,13889,83,6,26,2001,118,2
99998,1405,166,7,10304,93,2,40,2017,110,5
99999,517,361,3,7865,84,4,34,2016,49,5


## Recommendation system

### Collaborative filtering system

In [4]:
import pandas as pd
import numpy as np

In [5]:
from scipy.sparse import csr_matrix

In [6]:
from scipy.sparse.linalg import svds

In [7]:
def normalize(pred_ratings):
    """Min-max scale ``pred_ratings`` so its values span [0, 1].

    Parameters
    ----------
    pred_ratings : object exposing ``.min()`` / ``.max()`` and arithmetic
        Values to rescale (e.g. a NumPy array).

    Returns
    -------
    Object of the same kind as the input holding
    ``(x - min) / (max - min)``. When every value is identical (the range
    is a scalar zero) an all-zero float result is returned instead of the
    NaNs a 0/0 division would produce.
    """
    lowest = pred_ratings.min()
    span = pred_ratings.max() - lowest
    shifted = pred_ratings - lowest

    # A constant input has no spread: dividing would yield NaN everywhere and
    # emit a RuntimeWarning, so map every entry to 0.0 instead. The ndim guard
    # keeps per-column ranges (e.g. from a DataFrame) on the original path.
    if np.ndim(span) == 0 and span == 0:
        return shifted.astype(float)

    return shifted / span

In [8]:
def generate_pred_df(mat,pt_df,n_factors):
    """Rebuild a low-rank approximation of ``mat`` and return it as a DataFrame.

    Parameters
    ----------
    mat : matrix accepted by ``scipy.sparse.linalg.svds``
        The matrix to factorise.
    pt_df : pandas.DataFrame
        Supplies the labels: its index and columns label the reconstructed
        matrix before it is transposed.
    n_factors : int
        Number of singular values/vectors to keep (``k`` for ``svds``).

    Returns
    -------
    pandas.DataFrame
        Normalised reconstruction, transposed so that ``pt_df.columns``
        become the rows and ``pt_df.index`` become the columns.

    Raises
    ------
    ValueError
        If ``n_factors`` is not in ``[1, min(mat.shape))``.
    """
    max_rank = min(mat.shape)
    if n_factors < 1 or n_factors >= max_rank:
        raise ValueError("Must be 1 <= n_factors < min(mat.shape)")

    left_vecs, sing_vals, right_vecs_t = svds(mat, k = n_factors)
    sigma = np.diag(sing_vals)

    # U @ diag(s) @ Vt gives the rank-k reconstruction of the input matrix.
    approx = (left_vecs @ sigma) @ right_vecs_t
    approx = normalize(approx)

    labelled = pd.DataFrame(
        approx,
        index = list(pt_df.index),
        columns = pt_df.columns,
    )
    return labelled.T



In [9]:
def recommend_items(pred_df, usr_id, n_recs):
    """Return the ``n_recs`` highest-scored rows of one user's column.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Score table with one column per user; the row index identifies items.
    usr_id : hashable
        Column label of the user to recommend for.
    n_recs : int
        Number of recommendations to return; must be non-negative.

    Returns
    -------
    pandas.DataFrame
        Two columns: the former row index (named after ``pred_df.index.name``,
        or ``index`` when it has no name) and ``sim`` with the score, ordered
        from highest to lowest score and re-indexed from 0.

    Raises
    ------
    KeyError
        If ``usr_id`` is not a column of ``pred_df``.
    ValueError
        If ``n_recs`` is negative.
    """
    # head() with a negative count means "all but the last |n|", which is
    # never what a caller asking for n recommendations wants.
    if n_recs < 0:
        raise ValueError("n_recs must be non-negative")

    # One stable sort is enough; stability keeps tied scores in their
    # original row order so results are reproducible between runs.
    ranked = pred_df[usr_id].sort_values(ascending = False, kind = "stable")
    return ranked.head(n_recs).reset_index(name = 'sim')

In [10]:
# Reader x book rating matrix: one row per reader, one column per book.
# Pairs with no rating are filled with 0.
pt_df = df.pivot_table(
    values = 'book_rating',
    index = 'reader_id',
    columns = 'book_id',
).fillna(0)

In [11]:
# Display the pivoted reader x book rating table.
pt_df

book_id,1,2,3,4,5,6,7,8,9,10,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,3000
reader_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Pull the raw ratings out of the pivot table as a dense NumPy array.
mat = pt_df.to_numpy()

In [13]:
# Inspect the dense ratings array.
mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# Re-bind `mat` to a Compressed Sparse Row version of the ratings matrix.
mat = csr_matrix(mat)

In [15]:
# Show the sparse matrix summary (shape and number of stored elements).
mat

<28997x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 99944 stored elements in Compressed Sparse Row format>

In [16]:
# Predicted scores from a truncated SVD that keeps 10 latent factors.
pred_df = generate_pred_df(mat, pt_df, n_factors=10)

In [17]:
# Display the predicted scores: rows are books, columns are readers.
pred_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,29989,29991,29992,29993,29994,29995,29996,29997,29999,30000
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.160834,0.160596,0.160523,0.160551,0.160363,0.161696,0.160649,0.160726,0.160626,0.160959,...,0.160921,0.160754,0.161261,0.160607,0.161514,0.160861,0.160597,0.160501,0.160662,0.160557
2,0.160824,0.161204,0.160517,0.154216,0.161206,0.160314,0.160795,0.161405,0.160828,0.161781,...,0.160825,0.161170,0.161118,0.160631,0.160840,0.161193,0.160589,0.160842,0.160872,0.160618
3,0.160143,0.161531,0.160619,0.166376,0.162342,0.160303,0.160879,0.161421,0.160709,0.160720,...,0.161578,0.160878,0.160128,0.161063,0.159820,0.160724,0.160551,0.161319,0.161224,0.160804
4,0.161163,0.160536,0.160609,0.184119,0.160603,0.161236,0.160932,0.160563,0.160924,0.161255,...,0.160983,0.160893,0.160892,0.160899,0.160994,0.160775,0.160560,0.160564,0.160773,0.160540
5,0.161252,0.161074,0.160551,0.167456,0.160558,0.160766,0.160756,0.160740,0.161216,0.161278,...,0.160399,0.161140,0.160887,0.160677,0.161031,0.161101,0.160588,0.161005,0.160482,0.160539
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2996,0.161018,0.160444,0.160469,0.158204,0.160664,0.161306,0.160246,0.160647,0.160767,0.160292,...,0.160531,0.160840,0.160634,0.160526,0.161522,0.160791,0.160609,0.160604,0.160520,0.160619
2997,0.160728,0.160897,0.160498,0.156887,0.160971,0.162124,0.161031,0.161153,0.160709,0.160621,...,0.160479,0.160979,0.161232,0.161539,0.160686,0.161071,0.160538,0.161016,0.160811,0.160613
2998,0.160932,0.161558,0.160556,0.155302,0.161318,0.162009,0.160960,0.161226,0.161430,0.161229,...,0.161178,0.161188,0.160643,0.160699,0.161463,0.161061,0.160650,0.161922,0.160564,0.160796
2999,0.160454,0.161455,0.160605,0.158053,0.160815,0.161356,0.161777,0.161580,0.160624,0.163011,...,0.162052,0.160873,0.162291,0.160521,0.161126,0.161096,0.160576,0.161255,0.161035,0.160557


In [18]:
# Top 10 recommendations for reader 10.
print(recommend_items(pred_df, usr_id=10, n_recs=10))

   book_id       sim
0      784  0.187667
1      327  0.183066
2     2732  0.178683
3       33  0.172035
4      788  0.171140
5     2431  0.170790
6      971  0.170193
7      657  0.169436
8      695  0.168669
9     1956  0.168445
