In [12]:
import os
import json
import pandas as pd
import numpy as np
import tqdm

from collections import defaultdict

from surprise import SVD
from surprise import Dataset
from surprise import Reader

DATA_PATH = './data/'

%matplotlib inline

In [2]:
from scipy.sparse import coo_matrix, csr_matrix

In [46]:
with open(os.path.join(DATA_PATH, 'test_users.json'), 'r') as f:
    test_users = set(str(x) for x in json.load(f)['users'])

In [6]:
%%time
transactions = pd.read_csv(
    os.path.join(DATA_PATH, 'transactions.csv'),
    dtype={
        'element_uid': np.uint16,
        'user_uid': np.uint32,
        'consumption_mode': 'category',
        'ts': np.float64,
        'watched_time': np.uint64,
        'device_type': np.uint8,
        'device_manufacturer': np.uint8
    }
)

CPU times: user 7.06 s, sys: 412 ms, total: 7.47 s
Wall time: 7.88 s


In [25]:
%%time
ratings = pd.read_csv(
    os.path.join(DATA_PATH, 'ratings.csv'),
    dtype={
        'element_uid': np.object,
        'user_uid': np.object,
        'ts': np.float64,
        'rating': np.float64
    }
)

CPU times: user 322 ms, sys: 8.47 ms, total: 331 ms
Wall time: 338 ms


In [26]:
ratings.head()

Unnamed: 0,user_uid,element_uid,rating,ts
0,571252,1364,10.0,44305170.0
1,63140,3037,10.0,44305140.0
2,443817,4363,8.0,44305140.0
3,359870,1364,10.0,44305060.0
4,359870,3578,9.0,44305060.0


In [27]:
ratings.shape

(438790, 4)

In [28]:
ratings.dtypes

user_uid        object
element_uid     object
rating         float64
ts             float64
dtype: object

In [129]:
%%time

# A reader is still needed but only the rating_scale param is requiered.
#reader = Reader(rating_scale=(0, 10))
reader = Reader(line_format='user item rating timestamp', sep='\t')

# The columns must correspond to user id, item id and ratings (in that order).
#data = Dataset.load_from_df(ratings[['user_uid', 'element_uid', 'rating']], reader)
data = Dataset.load_from_file('ratings_for_surprise.csv', reader=reader)

trainset = data.build_full_trainset()
algo = SVD(random_state = 42)
algo.fit(trainset)

CPU times: user 43.4 s, sys: 69.6 ms, total: 43.5 s
Wall time: 43.6 s


In [31]:
from surprise.model_selection.validation import cross_validate
from surprise.model_selection.search import GridSearchCV

param_grid = {'n_factors':[3, 10, 20], 
              'n_epochs': [10, 30, 50],
              'lr_all': [0.001, 0.01], 
              'reg_all': [0.01, 0.3]
             }
gs = GridSearchCV(SVD, param_grid, cv=3, n_jobs=3, joblib_verbose=1)
gs.fit(data)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  9.9min
[Parallel(n_jobs=3)]: Done 108 out of 108 | elapsed: 27.1min finished


In [32]:
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

print(gs.best_params['rmse'])
print(gs.best_params['mae'])

1.8121488224837197
1.3492190145100131
{'reg_all': 0.3, 'lr_all': 0.01, 'n_epochs': 30, 'n_factors': 3}
{'reg_all': 0.3, 'lr_all': 0.01, 'n_epochs': 30, 'n_factors': 3}


In [33]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df

Unnamed: 0,mean_fit_time,mean_test_mae,mean_test_rmse,mean_test_time,param_lr_all,param_n_epochs,param_n_factors,param_reg_all,params,rank_test_mae,...,split0_test_mae,split0_test_rmse,split1_test_mae,split1_test_rmse,split2_test_mae,split2_test_rmse,std_fit_time,std_test_mae,std_test_rmse,std_test_time
0,2.939929,1.442987,1.893394,2.091502,0.001,10,3,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",27,...,1.445809,1.898325,1.441832,1.889576,1.44132,1.892282,0.119275,0.002006,0.003657,0.037592
1,3.484743,1.443101,1.893552,2.048743,0.001,10,10,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",28,...,1.44583,1.898418,1.441905,1.88972,1.441567,1.892517,0.160395,0.001935,0.003625,0.035298
2,4.258252,1.443462,1.893943,2.027107,0.001,10,20,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",29,...,1.446228,1.898862,1.44236,1.890189,1.441798,1.892778,0.126091,0.001969,0.003635,0.027138
3,12.723315,1.393608,1.8477,3.873046,0.001,30,3,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",16,...,1.396592,1.852856,1.392762,1.844165,1.39147,1.846081,0.65171,0.002175,0.003728,0.116138
4,13.935749,1.394309,1.848475,3.475496,0.001,30,10,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",19,...,1.397356,1.853801,1.39338,1.84484,1.39219,1.846783,3.832739,0.002209,0.003849,0.867926
5,21.598279,1.395377,1.849779,3.072328,0.001,30,20,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",21,...,1.398548,1.855182,1.394306,1.846035,1.393276,1.84812,4.823715,0.002282,0.003914,0.856226
6,21.807308,1.37247,1.829914,3.10365,0.001,50,3,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",11,...,1.374688,1.834163,1.371895,1.826524,1.370826,1.829054,5.528891,0.001628,0.003177,0.854887
7,39.551223,1.375008,1.833723,4.137668,0.001,50,10,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",12,...,1.377836,1.839173,1.374014,1.829371,1.373174,1.832626,6.801442,0.002029,0.004076,0.278305
8,50.591306,1.379225,1.838906,4.015861,0.001,50,20,0.01,"{'reg_all': 0.01, 'lr_all': 0.001, 'n_epochs':...",13,...,1.382005,1.843999,1.379442,1.835932,1.376228,1.836787,2.172353,0.002364,0.003618,0.968572
9,6.073208,1.369325,1.84622,3.408165,0.01,10,3,0.01,"{'reg_all': 0.01, 'lr_all': 0.01, 'n_epochs': ...",7,...,1.374443,1.853921,1.367348,1.841186,1.366184,1.843553,2.073764,0.00365,0.00553,1.333342


In [43]:
#default 
'test_mae': array([1.38723197, 1.39241735, 1.38574688]),
'test_rmse': array([1.85676068, 1.86323958, 1.85731211]),

20

In [49]:
ratings_users = set(ratings.user_uid.unique())
test_users_with_ratings = list(test_users & ratings_users)

In [54]:
movie_list = list(ratings.element_uid.unique())

In [106]:
top_n = defaultdict(list)
n = 500

for uid in ['571252', '63140', '443817', '359870']:

    #tqdm.tqdm(test_users_with_ratings):

    # First map the predictions to each user.
    user_ratings = [(iid, algo.predict(uid, iid, verbose=False)[3]) for iid in movie_list]
    
    # Then sort the predictions for each user and retrieve the k highest ones.
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:n]

100%|██████████| 13251/13251 [32:37<00:00,  5.93it/s]


In [133]:
#['571252', '63140', '443817', '359870']:
    
    
algo.predict('571252', '1364', verbose=False)[3]
#9.442963984462768
#algo.predict('63140', '1364', verbose=False)[3]
#8.878428447496999
#algo.predict('443817', '1364', verbose=False)[3]
#8.704440941004862
#algo.predict('359870', '1364', verbose=False)[3]
#9.895491469075125


5

In [103]:
from collections import defaultdict

filtered_elements = defaultdict(set)

for user_uid, element_uid in tqdm.tqdm(transactions.loc[:, ['user_uid', 'element_uid']].values):
    if user_uid not in test_users:
        continue
    filtered_elements[user_uid].add(element_uid)

100%|██████████| 9643012/9643012 [00:18<00:00, 531231.45it/s]


In [107]:
result = {}
for user_uid in tqdm.tqdm(test_users_with_ratings):
    result[user_uid] = [int(x) for x, _ in top_n[user_uid] if x not in filtered_elements[user_uid]][:20]

100%|██████████| 13251/13251 [00:03<00:00, 4362.38it/s]


In [108]:
with open('./predictions/svd_surprise_strings.json', 'w') as f:
    json.dump(result, f)