# Collaborative filtering

In [1]:
import pandas as pd
import numpy as np

import json
import gzip

import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp

import math

sns.set_theme(style="darkgrid")

In [2]:
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (head is not None) and (count > head):
                break
    return pd.DataFrame(data)

In [3]:
reviews = load_data('data/raw/goodreads_reviews_dedup.json.gz', head=50000)

In [4]:
reviews.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,24375664,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,Fri Aug 25 13:55:02 -0700 2017,Mon Oct 09 08:55:59 -0700 2017,Sat Oct 07 00:00:00 -0700 2017,Sat Aug 26 00:00:00 -0700 2017,16,0
1,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
2,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,0
3,8842281e1d1347389f2ab93d60773d4d,22078596,fdd13cad0695656be99828cd75d6eb73,4,"Fun, fast paced, and disturbing tale of murder...",Mon Jul 24 02:33:09 -0700 2017,Sun Jul 30 10:23:54 -0700 2017,Sun Jul 30 15:42:05 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,22,4
4,8842281e1d1347389f2ab93d60773d4d,6644782,bd0df91c9d918c0e433b9ab3a9a5c451,4,A fun book that gives you a sense of living in...,Mon Jul 24 02:28:14 -0700 2017,Thu Aug 24 00:07:20 -0700 2017,Sat Aug 05 00:00:00 -0700 2017,Sun Jul 30 00:00:00 -0700 2017,8,0


In [5]:
reviews = reviews[['user_id','book_id','rating']]

In [6]:
reviews.head()

Unnamed: 0,user_id,book_id,rating
0,8842281e1d1347389f2ab93d60773d4d,24375664,5
1,8842281e1d1347389f2ab93d60773d4d,18245960,5
2,8842281e1d1347389f2ab93d60773d4d,6392944,3
3,8842281e1d1347389f2ab93d60773d4d,22078596,4
4,8842281e1d1347389f2ab93d60773d4d,6644782,4


In [7]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50001 entries, 0 to 50000
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  50001 non-null  object
 1   book_id  50001 non-null  object
 2   rating   50001 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [8]:
means = reviews.groupby(['user_id'], as_index=False, sort=False).mean().rename(columns={'rating': 'mean_rating'})
print(len(means))
means.head()

1014


Unnamed: 0,user_id,mean_rating
0,8842281e1d1347389f2ab93d60773d4d,3.461806
1,72fb0d0087d28c832f15776b0d936598,3.4
2,ab2923b738ea3082f5f3efcbbfacb218,4.666667
3,d986f354a045ffb91234e4af4d1b12fd,3.0
4,7504b2aee1ecb5b2872d3da381c6c91e,3.0


In [9]:
reviews = reviews.merge(means, on='user_id', how='left', sort=False)
reviews['adjusted_rating'] = reviews['rating']-reviews['mean_rating']
reviews.head()

Unnamed: 0,user_id,book_id,rating,mean_rating,adjusted_rating
0,8842281e1d1347389f2ab93d60773d4d,24375664,5,3.461806,1.538194
1,8842281e1d1347389f2ab93d60773d4d,18245960,5,3.461806,1.538194
2,8842281e1d1347389f2ab93d60773d4d,6392944,3,3.461806,-0.461806
3,8842281e1d1347389f2ab93d60773d4d,22078596,4,3.461806,0.538194
4,8842281e1d1347389f2ab93d60773d4d,6644782,4,3.461806,0.538194


In [10]:
userbook = reviews.pivot_table(index='user_id', columns='book_id', values='adjusted_rating').fillna(0)

In [11]:
means = reviews.groupby(['user_id'], as_index=False, sort=False).mean().rename(columns={'rating': 'mean_rating'})
print(len(means))
means.head()

1014


Unnamed: 0,user_id,mean_rating,mean_rating.1,adjusted_rating
0,8842281e1d1347389f2ab93d60773d4d,3.461806,3.461806,-1.29526e-16
1,72fb0d0087d28c832f15776b0d936598,3.4,3.4,8.881784000000001e-17
2,ab2923b738ea3082f5f3efcbbfacb218,4.666667,4.666667,-2.960595e-16
3,d986f354a045ffb91234e4af4d1b12fd,3.0,3.0,0.0
4,7504b2aee1ecb5b2872d3da381c6c91e,3.0,3.0,0.0


In [12]:
userbook = reviews.pivot_table(index='user_id',columns='book_id', values='adjusted_rating').fillna(0)
userbook

book_id,1,10000,10000063,10000270,10000761,10002063,10004056,10005168,10005527,1000596,...,99955,9996331,9996853,9998680,9998920,9999,9999107,9999576,9999795,9999887
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
012515e5802b2e0f42915118c90fa04b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
012aa353140af13109d00ca36cdc0637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0147285ddc6b8a9ae27f1829a0bac33a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
015cb45454dc70dfc692eb17745fc6c0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01608badf9aa27fa949176f75a10cdfc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fe0df2eff573e75c036eb8287c6b012a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fe2746ac48c0ff728bf9d8475f3f42e8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fe98878f73553cc022af2a3ef0bb56b4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fed1d27323bb1994b8a28d82b1a29fce,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
def getSim(array1, array2, method):
    if method == 'cos':
        return sp.spatial.distance.cosine(array1, array2)
    if method == 'euc':
        return sp.spatial.distance.euclidean(array1, array2)
    if method == 'pea':
        num = sum([x1*x2 for x1, x2 in zip(array1, array2)])
        denom1 = 0
        denom2 = 0
        for x1, x2 in zip(array1, array2):
            if x1 != 0 and x2 != 0:
                denom1 += x1*x1
                denom2 += x2*x2 
        denom = np.sqrt(denom1)*np.sqrt(denom2)
        if denom == 0:
            return 0
        else:
            return num/denom

In [14]:
all_users = [(index, values) for index, values in zip(userbook.index, userbook.values)]
first_user = all_users[0]
first_user

('012515e5802b2e0f42915118c90fa04b', array([0., 0., 0., ..., 0., 0., 0.]))

In [20]:
def getSimilarUsers(initial_user, users, top=10, method='cos'):
    sim = []
    for user in users[1:]:
        score = getSim(initial_user[1], user[1], method)
        sim.append((user[0], score))
    sim.sort(key= lambda x: x[1], reverse=True)
    return sim[:top]

In [46]:
simusers = getSimilarUsers(first_user, all_users[1:], method='pea')

In [48]:
simusers

[('01608badf9aa27fa949176f75a10cdfc', 1.0),
 ('08920c9c72f03d03df72908ce393f7e7', 1.0),
 ('0dfb88ec00822bd46728cfbb46eed175', 1.0),
 ('1afe8b35c5e568e95bc17e5b5cdbfd1b', 1.0),
 ('3e6c3b92cc90df443925c58f8899be6e', 1.0),
 ('3ec80f997561136d3e330d548d62e1ed', 1.0),
 ('4e9865acc1001cecf64e642fb0be4d0e', 1.0),
 ('592c88e7f27f736c7f2ec9d9d85a5461', 1.0),
 ('5cca1dd30cd5a98c1c8e731839265ccf', 1.0),
 ('631853097d378547c63cc2c72be75cd3', 1.0)]

In [50]:
reduced_userbook = userbook.loc[[user for user,_ in simusers],:]

In [53]:
sumsim = sum([sim for _,sim in simusers])
print(sumsim)
reduced_userbook['sim'] = [sim for _,sim in simusers]
reco = []
for book_id in reduced_userbook.columns[:-1]:
    reduced_userbook[book_id] = reduced_userbook[book_id]*reduced_userbook['sim']
    reco.append((book_id,sum(reduced_userbook[book_id].tolist())/sumsim))
reco.sort(key= lambda x: x[1], reverse=True)

reco[:10]

10.0


[('8306857', 0.3617323117891429),
 ('10425811', 0.19973045822102425),
 ('10429082', 0.19973045822102425),
 ('10507293', 0.19973045822102425),
 ('10552965', 0.19973045822102425),
 ('10594356', 0.19973045822102425),
 ('10668038', 0.19973045822102425),
 ('11044367', 0.19973045822102425),
 ('11071466', 0.19973045822102425),
 ('11096647', 0.19973045822102425)]

In [56]:
reduced_userbook[['8306857','10425811','10429082']]

book_id,8306857,10425811,10429082
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01608badf9aa27fa949176f75a10cdfc,0.0,0.0,0.0
08920c9c72f03d03df72908ce393f7e7,0.692308,0.0,0.0
0dfb88ec00822bd46728cfbb46eed175,0.0,0.0,0.0
1afe8b35c5e568e95bc17e5b5cdbfd1b,0.927711,0.0,0.0
3e6c3b92cc90df443925c58f8899be6e,0.0,0.0,0.0
3ec80f997561136d3e330d548d62e1ed,0.0,0.0,0.0
4e9865acc1001cecf64e642fb0be4d0e,0.0,0.0,0.0
592c88e7f27f736c7f2ec9d9d85a5461,0.0,0.0,0.0
5cca1dd30cd5a98c1c8e731839265ccf,1.997305,1.997305,1.997305
631853097d378547c63cc2c72be75cd3,0.0,0.0,0.0
