## Импорт библиотек и константы

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import json
import pandas as pd
import numpy as np
import optuna
from datetime import timedelta
from tqdm.auto import tqdm
from lightfm import LightFM
from implicit.nearest_neighbours import TFIDFRecommender, BM25Recommender, CosineRecommender
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel, LightFMWrapperModel,ImplicitItemKNNWrapperModel
from rectools.metrics import MAP, NDCG, Recall, Serendipity, calc_metrics

In [3]:
K_RECOMMENDATIONS = 50
RANDOM_STATE = 0
NUM_LOGICAL_PROCESSORS = 10

## Загрузка данных

In [4]:
interactions_df = pd.read_csv("new_data/base_models_data.csv")

In [5]:
test_df = pd.read_csv("new_data/ranker_test_data.csv")

In [6]:
test_users = test_df[Columns.User].values

In [7]:
test_without_cold_users = list(set(test_df[Columns.User].values).intersection(set(interactions_df[Columns.User].values)))

## Подготовка датасета и метрик

In [8]:
dataset = Dataset.construct(interactions_df)

In [9]:
interactions = Interactions(interactions_df)

In [10]:
splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=1,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True
)

In [11]:
metrics = {
    'ndcg@10': NDCG(k = 10),
    'map@10': MAP(k = 10),
    'recall@10': Recall(k = 10),
    'serendipity@10': Serendipity(k=10)
}

In [12]:
models_results_df = pd.DataFrame(index=list(metrics.keys()))

In [13]:
def train_and_calculate_score(model, interactions, splitter, metrics, head_metric="map@10"):
    fold_iterator = splitter.split(interactions, collect_fold_stats=True)
    
    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        df_train = interactions.df.iloc[train_ids].copy()
        df_test = interactions.df.iloc[test_ids].copy()
        test_users = df_test[Columns.User].unique()
        catalog = df_train[Columns.Item].unique()
        
        dataset = Dataset.construct(df_train)
        model.fit(dataset)
        recos = model.recommend(test_users, dataset, k=K_RECOMMENDATIONS, filter_viewed=True)
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
        
    return metric_values[head_metric]

## Обучение LightFM

In [14]:
LIGHTFM_K_RECOMMENDATIONS = 100
LIGHTFM_N_TRIALS = 40

In [15]:
def lightfm_objective(trial: optuna.Trial):
    global interactions, splitter, metrics

    loss = trial.suggest_categorical("loss", ['bpr', 'warp'])
    no_components = trial.suggest_categorical("no_components", [100, 128, 256, 512])
    learning_rate = trial.suggest_categorical("learning_rate", [1e-2, 5e-2, 1e-1])
    epochs = trial.suggest_categorical("epochs", [3, 6, 9, 12])
    learning_schedule = trial.suggest_categorical("learning_schedule", ['adagrad', 'adadelta'])
    max_sampled = trial.suggest_categorical("max_sampled", [5, 10, 20, 40])
    
    lightfm_model = LightFMWrapperModel(
        model=LightFM(
            no_components=no_components,
            loss=loss,
            learning_rate=learning_rate,
            learning_schedule=learning_schedule,
            max_sampled=max_sampled,
            random_state=RANDOM_STATE,
        ),
        num_threads=NUM_LOGICAL_PROCESSORS,
        epochs=epochs,
        verbose=False
    )
    
    score = train_and_calculate_score(lightfm_model, interactions, splitter, metrics)
    return score

In [16]:
lightfm_study = optuna.create_study(direction="maximize", study_name="LightFM", sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.NopPruner())
lightfm_study.optimize(lightfm_objective, n_trials=LIGHTFM_N_TRIALS) 

[I 2023-12-27 08:15:16,080] A new study created in memory with name: LightFM
[I 2023-12-27 08:16:13,498] Trial 0 finished with value: 9.082256573297112e-05 and parameters: {'loss': 'bpr', 'no_components': 512, 'learning_rate': 0.1, 'epochs': 6, 'learning_schedule': 'adagrad', 'max_sampled': 20}. Best is trial 0 with value: 9.082256573297112e-05.
[I 2023-12-27 08:17:28,312] Trial 1 finished with value: 0.12028451260820126 and parameters: {'loss': 'warp', 'no_components': 512, 'learning_rate': 0.01, 'epochs': 12, 'learning_schedule': 'adadelta', 'max_sampled': 10}. Best is trial 1 with value: 0.12028451260820126.
[I 2023-12-27 08:18:53,509] Trial 2 finished with value: 0.027582407980469358 and parameters: {'loss': 'bpr', 'no_components': 512, 'learning_rate': 0.01, 'epochs': 12, 'learning_schedule': 'adagrad', 'max_sampled': 5}. Best is trial 1 with value: 0.12028451260820126.
[I 2023-12-27 08:19:57,474] Trial 3 finished with value: 0.12360985289323544 and parameters: {'loss': 'warp', 'n

In [17]:
print("Параметры лучшей модели: ", lightfm_study.best_params)
print("MAP@10 лучшей модели: ", lightfm_study.best_value)

Параметры лучшей модели:  {'loss': 'warp', 'no_components': 256, 'learning_rate': 0.1, 'epochs': 6, 'learning_schedule': 'adadelta', 'max_sampled': 5}
MAP@10 лучшей модели:  0.1367656917270343


In [18]:
best_lightfm_params = lightfm_study.best_params.copy()
epochs = best_lightfm_params["epochs"]
del best_lightfm_params["epochs"]
best_lightfm_model = LightFMWrapperModel(
    model=LightFM(**best_lightfm_params),
    num_threads=NUM_LOGICAL_PROCESSORS,
    epochs=epochs,
    verbose=False
)
best_lightfm_model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f1440c839a0>

In [19]:
lightfm_reco_df = best_lightfm_model.recommend(test_without_cold_users, dataset, k=LIGHTFM_K_RECOMMENDATIONS, filter_viewed=True)

In [20]:
lightfm_reco_df.head()

Unnamed: 0,user_id,item_id,score,rank
0,262147,15297,3.274814,1
1,262147,10440,3.196383,2
2,262147,13865,3.142883,3
3,262147,3734,2.921471,4
4,262147,9728,2.902673,5


In [21]:
lightfm_metrics = calc_metrics(
    metrics=metrics,
    reco=lightfm_reco_df,
    interactions=test_df, 
    prev_interactions=interactions_df,
    catalog=test_df['item_id'].unique()
)
models_results_df["LightFM"] = [lightfm_metrics[metric] for metric in models_results_df.index]

In [22]:
del lightfm_reco_df

## Обучение UserKNN

In [23]:
USERKNN_K_RECOMMENDATIONS = 100
USERKNN_N_TRIALS = 10

In [24]:
def userknn_objective(trial: optuna.Trial):
    global interactions, splitter, metrics

    recommender_type = trial.suggest_categorical("type", ['cosine', 'bm25', 'tfidf'])
    if recommender_type == 'cosine':
        recommender = CosineRecommender(
            K=USERKNN_K_RECOMMENDATIONS,
            num_threads=NUM_LOGICAL_PROCESSORS,
        )
    elif recommender_type == 'bm25':
        k1 = trial.suggest_categorical("k1", [0.8, 1.2, 1.5])
        b = trial.suggest_categorical("b", [0.3, 0.7, 1])
        recommender = BM25Recommender(
            K=USERKNN_K_RECOMMENDATIONS,
            num_threads=NUM_LOGICAL_PROCESSORS,
            K1=k1, 
            B=b
        )
    else:
        recommender = TFIDFRecommender(
            K=USERKNN_K_RECOMMENDATIONS,
            num_threads=NUM_LOGICAL_PROCESSORS,
        )
    
    userknn_model = ImplicitItemKNNWrapperModel(
        recommender,
        verbose=False
    )          
    score = train_and_calculate_score(userknn_model, interactions, splitter, metrics)
    return score

In [25]:
userknn_study = optuna.create_study(direction="maximize", study_name="UserKNN", sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.NopPruner())
userknn_study.optimize(userknn_objective, n_trials=USERKNN_N_TRIALS) 

[I 2023-12-27 08:42:17,725] A new study created in memory with name: UserKNN
[I 2023-12-27 08:42:58,230] Trial 0 finished with value: 0.08549164715082347 and parameters: {'type': 'tfidf'}. Best is trial 0 with value: 0.08549164715082347.
[I 2023-12-27 08:43:38,359] Trial 1 finished with value: 0.13154118191733294 and parameters: {'type': 'bm25', 'k1': 0.8, 'b': 0.3}. Best is trial 1 with value: 0.13154118191733294.
[I 2023-12-27 08:44:19,094] Trial 2 finished with value: 0.08549164715082347 and parameters: {'type': 'tfidf'}. Best is trial 1 with value: 0.13154118191733294.
[I 2023-12-27 08:44:59,680] Trial 3 finished with value: 0.08549164715082347 and parameters: {'type': 'tfidf'}. Best is trial 1 with value: 0.13154118191733294.
[I 2023-12-27 08:45:39,915] Trial 4 finished with value: 0.12016716979597941 and parameters: {'type': 'bm25', 'k1': 1.2, 'b': 0.7}. Best is trial 1 with value: 0.13154118191733294.
[I 2023-12-27 08:46:20,543] Trial 5 finished with value: 0.07885556882590268 a

In [26]:
print("Параметры лучшей модели: ", userknn_study.best_params)
print("MAP@10 лучшей модели: ", userknn_study.best_value)

Параметры лучшей модели:  {'type': 'bm25', 'k1': 0.8, 'b': 0.3}
MAP@10 лучшей модели:  0.13154118191733294


In [27]:
best_userknn_model = ImplicitItemKNNWrapperModel(
    model = BM25Recommender(
        K=USERKNN_K_RECOMMENDATIONS,
        num_threads=NUM_LOGICAL_PROCESSORS,
        K1=0.8, 
        B=0.3
    ),
    verbose=False
)
best_userknn_model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7f1440c837f0>

In [28]:
userknn_reco_df = best_userknn_model.recommend(test_without_cold_users, dataset, k=USERKNN_K_RECOMMENDATIONS, filter_viewed=True)

In [29]:
userknn_reco_df.head()

Unnamed: 0,user_id,item_id,score,rank
0,262147,10440,26818580.0,1
1,262147,15297,21079920.0,2
2,262147,4151,18060020.0,3
3,262147,13865,15028910.0,4
4,262147,4880,12807470.0,5


In [30]:
userknn_metrics = calc_metrics(
    metrics=metrics,
    reco=userknn_reco_df,
    interactions=test_df, 
    prev_interactions=interactions_df,
    catalog=test_df['item_id'].unique()
)
models_results_df["UserKNN_BM25"] = [userknn_metrics[metric] for metric in models_results_df.index]

In [31]:
del userknn_reco_df

## Обучение Популярной модели

In [47]:
POPULAR_K_RECOMMENDATIONS = 50
POPULAR_N_TRIALS = 10

In [33]:
def popular_objective(trial: optuna.Trial):
    global interactions, splitter, metrics

    popularity = trial.suggest_categorical("popularity", ['n_users', 'mean_weight', 'sum_weight'])
    period = trial.suggest_categorical("period", [timedelta(days=7), timedelta(days=14), timedelta(days=30)])
    
    popular_model = PopularModel(
        popularity = popularity,
        period = period
    )        
    score = train_and_calculate_score(popular_model, interactions, splitter, metrics)
    return score

In [34]:
popular_study = optuna.create_study(direction="maximize", study_name="Popular", sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.NopPruner())
popular_study.optimize(popular_objective, n_trials=POPULAR_N_TRIALS) 

[I 2023-12-27 08:54:08,858] A new study created in memory with name: Popular
[I 2023-12-27 08:54:30,582] Trial 0 finished with value: 0.13105024810721633 and parameters: {'popularity': 'n_users', 'period': datetime.timedelta(days=30)}. Best is trial 0 with value: 0.13105024810721633.
[I 2023-12-27 08:54:52,018] Trial 1 finished with value: 8.270188438967508e-05 and parameters: {'popularity': 'mean_weight', 'period': datetime.timedelta(days=30)}. Best is trial 0 with value: 0.13105024810721633.
[I 2023-12-27 08:55:13,656] Trial 2 finished with value: 0.13721913676170075 and parameters: {'popularity': 'sum_weight', 'period': datetime.timedelta(days=30)}. Best is trial 2 with value: 0.13721913676170075.
[I 2023-12-27 08:55:35,206] Trial 3 finished with value: 8.145136775705791e-05 and parameters: {'popularity': 'mean_weight', 'period': datetime.timedelta(days=14)}. Best is trial 2 with value: 0.13721913676170075.
[I 2023-12-27 08:55:56,875] Trial 4 finished with value: 0.13721913676170075

In [35]:
print("Параметры лучшей модели: ", popular_study.best_params)
print("MAP@10 лучшей модели: ", popular_study.best_value)

Параметры лучшей модели:  {'popularity': 'n_users', 'period': datetime.timedelta(days=14)}
MAP@10 лучшей модели:  0.13764120892454246


In [36]:
best_popular_model = PopularModel(
    **popular_study.best_params
)
best_popular_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7f1440c84940>

In [37]:
popular_reco_df = best_popular_model.recommend(test_without_cold_users, dataset, k=POPULAR_K_RECOMMENDATIONS, filter_viewed=True)

In [38]:
popular_reco_df.head()

Unnamed: 0,user_id,item_id,score,rank
0,262147,15297,31879.0,1
1,262147,10440,23793.0,2
2,262147,9728,22085.0,3
3,262147,13865,19712.0,4
4,262147,3734,16244.0,5


In [39]:
popular_metrics = calc_metrics(
    metrics=metrics,
    reco=popular_reco_df,
    interactions=test_df, 
    prev_interactions=interactions_df,
    catalog=test_df['item_id'].unique()
)
models_results_df["Popular"] = [popular_metrics[metric] for metric in models_results_df.index]

In [40]:
del popular_reco_df

### Оценки лучших моделей на тестовых данных

In [41]:
models_results_df

Unnamed: 0,LightFM,UserKNN_BM25,Popular
ndcg@10,0.032116,0.032472,0.035194
map@10,0.042491,0.042748,0.047337
recall@10,0.083713,0.081155,0.096176
serendipity@10,0.000253,0.00015,7.5e-05


## Генерация рекомендаций для моделей второго уровня и сохранение

In [42]:
lightfm_df = best_lightfm_model.recommend(interactions_df[Columns.User].unique(), dataset, k=LIGHTFM_K_RECOMMENDATIONS, filter_viewed=True)
lightfm_df.to_csv("new_data/lightfm_recommendations.csv", index=False)
del lightfm_df

In [43]:
userknn_df = best_userknn_model.recommend(interactions_df[Columns.User].unique(), dataset, k=USERKNN_K_RECOMMENDATIONS, filter_viewed=True)
userknn_df.to_csv("new_data/userknn_recommendations.csv", index=False)
del userknn_df

In [None]:
popular_df = best_popular_model.recommend(interactions_df[Columns.User].unique()[:1], dataset, k=POPULAR_K_RECOMMENDATIONS, filter_viewed=True)
popular_df.to_csv("new_data/popular_recommendations.csv", index=False)

Популярные генерируем только для одного пользователя, так как они не особо отличаются

In [46]:
models_results_df.to_csv("new_data/first_stage_models.csv")