In [1]:
import warnings
warnings.filterwarnings('ignore')

%load_ext autotime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

import random

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import recall_score, f1_score
from scipy.sparse import csr_matrix

from implicit.als import AlternatingLeastSquares

from surprise import Dataset, Reader
import surprise.model_selection
#from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, SVDpp, KNNBasic, KNNBaseline

from catboost import CatBoostClassifier, CatBoostRanker, Pool

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

from tqdm import tqdm

import itertools

import optuna

time: 2.18 s (started: 2024-10-26 01:29:21 +07:00)


In [None]:
events = pd.read_csv('events.csv')
user_ids = np.arange(6040)
item_ids = np.arange(3706)
combinations = list(itertools.product(user_ids, item_ids))
data_for_pred = pd.DataFrame(combinations, columns=['user_id', 'item_id'])

In [92]:
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(events[['user_id', 'item_id', 'rating']], reader)

trainset = surprise_data.build_full_trainset()

time: 565 ms (started: 2024-10-25 06:46:52 +07:00)


In [93]:
svd_model = SVDpp(n_factors = 50, reg_all = 0.05, n_epochs=10)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x48f5b9a10>

time: 2min 13s (started: 2024-10-25 06:46:54 +07:00)


In [104]:
tqdm.pandas()
data_for_pred['svd_score'] = data_for_pred.progress_apply(lambda row: svd_model.predict(row['user_id'], row['item_id']).est, axis=1)

100%|██████████| 22384240/22384240 [35:57<00:00, 10373.42it/s] 

time: 35min 57s (started: 2024-10-25 06:52:32 +07:00)





In [107]:
cv_svd_basic = surprise.model_selection.cross_validate(svd_model, surprise_data, n_jobs=-1)
print(f"RMSE SVDpp-модели: {np.round(np.mean(cv_svd_basic['test_rmse']), 4)}")
print(f"MAE SVDpp-модели: {np.round(np.mean(cv_svd_basic['test_mae']), 4)}")

RMSE SVDpp-модели: 0.8823
MAE SVDpp-модели: 0.6971
time: 6min 28s (started: 2024-10-25 07:29:09 +07:00)


In [108]:
sim_options_user = {'name': 'cosine', 'user_based': True}

model_knn_user = KNNBasic(sim_options=sim_options_user, verbose=0)
model_knn_user.fit(trainset)

cv_knn_basic = surprise.model_selection.cross_validate(model_knn_user, surprise_data, n_jobs=-1)
print(f"RMSE user-based kNN-модели: {np.round(np.mean(cv_knn_basic['test_rmse']), 4)}")
print(f"MAE user-based kNN-модели: {np.round(np.mean(cv_knn_basic['test_mae']), 4)}")

RMSE user-based kNN-модели: 0.9779
MAE user-based kNN-модели: 0.7724
time: 3min 45s (started: 2024-10-25 07:35:38 +07:00)


In [109]:
sim_options_item = {'name': 'cosine', 'user_based': False}

model_knn_item = KNNBasic(sim_options=sim_options_item, verbose=0)
model_knn_item.fit(trainset)

cv_knn_basic = surprise.model_selection.cross_validate(model_knn_item, surprise_data, n_jobs=-1)
print(f"RMSE item-based kNN-модели: {np.round(np.mean(cv_knn_basic['test_rmse']), 4)}")
print(f"MAE item-based kNN-модели: {np.round(np.mean(cv_knn_basic['test_mae']), 4)}")

RMSE item-based kNN-модели: 1.001
MAE item-based kNN-модели: 0.7833
time: 1min 23s (started: 2024-10-25 07:39:23 +07:00)


In [110]:
user_neighbors_dict = {}
for user_id in np.arange(6040):
    try:
        inner_user_id = trainset.to_inner_uid(user_id)  
        user_neighbors_dict[user_id] = model_knn_user.get_neighbors(inner_user_id, k=5)
    except ValueError:
        user_neighbors_dict[user_id] = [] 

def calculate_avg_neighbor_ratings():
    user_neighbor_ratings = {}

    for user_id in tqdm(np.arange(6040)):
        neighbors = [trainset.to_raw_uid(n) for n in user_neighbors_dict[user_id]]  
        neighbor_ratings = events[events['user_id'].isin(neighbors)]

        for item_id in np.arange(3706):
            ratings = neighbor_ratings[neighbor_ratings['item_id'] == item_id]['rating']
            user_neighbor_ratings[(user_id, item_id)] = ratings.mean() if len(ratings) > 0 else np.nan
    
    return user_neighbor_ratings

time: 10.7 s (started: 2024-10-25 07:40:46 +07:00)


In [111]:
item_neighbors_dict = {}
for item_id in np.arange(3706):
    try:
        inner_item_id = trainset.to_inner_iid(item_id)  
        item_neighbors_dict[item_id] = model_knn_item.get_neighbors(inner_item_id, k=5)
    except ValueError:
        item_neighbors_dict[item_id] = []  

def calculate_avg_similar_item_ratings():
    item_similar_ratings = {}

    for user_id in tqdm(np.arange(6040)):
        user_ratings = events[events['user_id'] == user_id]

        for item_id in np.arange(3706):
            neighbors = [trainset.to_raw_iid(n) for n in item_neighbors_dict[item_id]]  
            ratings = user_ratings[user_ratings['item_id'].isin(neighbors)]['rating']
            item_similar_ratings[(user_id, item_id)] = ratings.mean() if len(ratings) > 0 else np.nan
    
    return item_similar_ratings

time: 3.99 s (started: 2024-10-25 07:40:57 +07:00)


In [112]:
avg_neighbor_ratings = calculate_avg_neighbor_ratings()
avg_similar_ratings = calculate_avg_similar_item_ratings()

100%|██████████| 6040/6040 [39:51<00:00,  2.53it/s]
100%|██████████| 6040/6040 [45:35<00:00,  2.21it/s]

time: 1h 25min 27s (started: 2024-10-25 07:41:01 +07:00)





In [113]:
import pickle 

with open('avg_neighbor_ratings.pkl', 'wb') as f:
    pickle.dump(avg_neighbor_ratings, f)

with open('avg_similar_ratings.pkl', 'wb') as f:
    pickle.dump(avg_similar_ratings, f)

time: 3min 34s (started: 2024-10-25 09:06:28 +07:00)


In [114]:
tqdm.pandas()

data_for_pred['avg_neighbor_rating'] = data_for_pred.progress_apply(lambda row: avg_neighbor_ratings.get((row['user_id'], row['item_id'])), axis=1)
data_for_pred['avg_similar_item_rating'] = data_for_pred.progress_apply(lambda row: avg_similar_ratings.get((row['user_id'], row['item_id'])), axis=1)

100%|██████████| 22384240/22384240 [03:03<00:00, 121778.82it/s]
100%|██████████| 22384240/22384240 [03:05<00:00, 120953.67it/s]

time: 6min 9s (started: 2024-10-25 09:10:03 +07:00)





In [115]:
data_for_pred[['user_id', 'item_id', 'svd_score', 'avg_neighbor_rating', 'avg_similar_item_rating']].to_csv('surprise_scores.csv', index = False)

time: 50.9 s (started: 2024-10-25 09:16:12 +07:00)
