# Preparations

In [1]:
%%capture

!pip install implicit
!pip install rectools==0.4.2
!pip install lightfm
!pip install nmslib
!pip install optuna

In [2]:
import implicit

implicit.gpu.HAS_CUDA

True

In [4]:
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"
import warnings

warnings.filterwarnings("ignore")

In [5]:
import typing as tp
import requests
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import (
    MAP,
    NDCG,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel, PopularModel
from rectools.tools import UserToItemAnnRecommender
from tqdm import tqdm

In [6]:
import optuna
from optuna.samplers import TPESampler

optuna.logging.set_verbosity(optuna.logging.INFO)

# Loading data

In [7]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...',
                        total=total_size_in_bytes,
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

Downloading the kion dataset...:  92%|█████████▏| 72.4M/78.8M [00:01<00:00, 128MiB/s] 

In [8]:
!unzip kion_train.zip -x '__MACOSX/*'

Archive:  kion_train.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: data_original/users.csv  
  inflating: data_original/items.csv  


In [9]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
#                                 'total_dur': Columns.Weight}, inplace=True)
# # will cast types and save new pd.DataFrame inside in Interactions.df
# interactions = Interactions(interactions_df)

In [10]:
interactions.sample(5)

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
4751625,524111,4155,2021-05-21,957,1.0
1117627,40078,10440,2021-06-16,19042,30.0
217633,1059465,7102,2021-06-28,1605,30.0
4449840,307752,15751,2021-05-12,77,1.0
1011364,965155,2993,2021-05-23,15,0.0


In [11]:
Columns.Datetime = "last_watch_dt"

In [12]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

In [13]:
# разделим датасет на три части: на валидации будем подбирать гиперпараметры, на тесте финально сравнивать модели
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

In [14]:
train.drop(train.query("total_dur < 300").index, inplace=True)

# отфильтруем пользователей, которых нет в тесте, но которые встречаются в трейне (cold_users)
test[Columns.User]

cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

TEST_USERS = test[Columns.User].unique()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 6)
test: (333026, 6)


In [15]:
len(cold_users)

72930

In [16]:
len(TEST_USERS)

112977

## Prepare features

#Users

In [17]:
users.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0


In [18]:
def get_user_features(users, interactions, features):
    users.fillna("Unknown", inplace=True) #заполним пропуски

    users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    user_features_frames = []

    for feature in features:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features = pd.concat(user_features_frames)
    return user_features

In [19]:
user_features_list = ["sex", "age", "income"]

In [20]:
user_features = get_user_features(users, train, user_features_list)

In [21]:
user_features

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex
...,...,...,...
840184,529394,income_40_60,income
840186,80113,income_40_60,income
840188,312839,income_60_90,income
840189,191349,income_40_60,income


In [22]:
user_features[user_features['id'] == 973171]

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


In [23]:
items.head(3)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


#Items

In [24]:
def get_item_features(items, interactions):
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"

    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features = pd.concat((genre_feature, content_feature))
    return item_features

In [25]:
item_features = get_item_features(items, train)

In [26]:
item_features.head(5)

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [27]:
item_features[item_features['id'] == 10711]

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
0,10711,film,content_type


In [28]:
%%time

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 917 ms, sys: 42.2 ms, total: 959 ms
Wall time: 977 ms


## Metrics

In [29]:
metrics_name = {
    "precision": Precision,
    "recall": Recall,
    "MAP": MAP,
    # "NDCG": NDCG,
    # "novelty": MeanInvUserFreq,
    # "serendipity": Serendipity,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in [1, 5, 10]:
        metrics[f"{metric_name}@{k}"] = metric(k=k)

In [30]:
metrics

{'precision@1': Precision(k=1),
 'precision@5': Precision(k=5),
 'precision@10': Precision(k=10),
 'recall@1': Recall(k=1),
 'recall@5': Recall(k=5),
 'recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

# Models

In [31]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = (32,)
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

metric = MAP(k=10)

В качестве модели беру ALS

Подберем гиперпараметры с помощью Optuna:

In [None]:
def optuna_objective_als(trial):
    test_users = test[Columns.User].unique()

    # зададим параметры
    factors = trial.suggest_categorical("n_factors", [8, 16, 32])
    num_threads = trial.suggest_int("num_threads", 1, 3)
    regularization = trial.suggest_float('regularization', 0.001, 0.1, log=True)
    iterations = trial.suggest_int('iterations', 5, 20)
    fit_features_together = trial.suggest_categorical("fit_features_together", [True, False])

    model_als = AlternatingLeastSquares(factors=factors,
                                    random_state=RANDOM_STATE,
                                    num_threads=num_threads,
                                    iterations=iterations,
                                    regularization=regularization
                                    )

    model = ImplicitALSWrapperModel(model_als,
                                    verbose=1,
                                    fit_features_together=fit_features_together,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values["MAP@10"]

In [None]:
sampler = TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(optuna_objective_als, n_trials=20, n_jobs=-1, gc_after_trial=True)

In [None]:
print(f'Лучшее значение MAP@10: {study.best_value}')
print(f'Лучшие параметры: {study.best_params}')

Лучшее значение MAP@10: 0.07705951189824024
Лучшие параметры: {'n_factors': 32, 'num_threads': 2, 'regularization': 0.004152179888032442, 'iterations': 14, 'fit_features_together': True}


In [None]:
def optuna_objective_lf(trial):
    test_users = test[Columns.User].unique()

    # зададим параметры
    no_components = trial.suggest_categorical("n_factors", [8, 16, 32, 64])
    loss = trial.suggest_categorical("loss", ["logistic", "bpr", "warp"])
    learning_rate = trial.suggest_float("lr", 1e-3, 1e-1, log=True)
    num_threads = trial.suggest_int("num_threads", 1, 3)
    user_alpha = trial.suggest_float("user_alpha", 0, 1)
    item_alpha = trial.suggest_float("item_alpha", 0, 1)

    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            loss=loss,
            random_state=RANDOM_STATE,
            learning_rate=learning_rate,
            user_alpha=user_alpha,
            item_alpha=item_alpha,
        ),
        epochs=N_EPOCHS,
        num_threads=num_threads,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=test_users,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)

    return metric_values["MAP@10"]

In [None]:
sampler = TPESampler(seed=1)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(optuna_objective_lf, n_trials=20, n_jobs=-1, gc_after_trial=True)

[I 2023-12-12 11:19:20,159] A new study created in memory with name: no-name-eed9b4e7-702c-4104-93fd-6cc6b1230a44
[I 2023-12-12 11:20:24,546] Trial 0 finished with value: 0.0 and parameters: {'n_factors': 8, 'loss': 'bpr', 'lr': 0.002023911547026897, 'num_threads': 1, 'user_alpha': 0.22917740957467525, 'item_alpha': 0.17663591990698913}. Best is trial 0 with value: 0.0.
[I 2023-12-12 11:20:53,605] Trial 1 finished with value: 6.064876061329331e-07 and parameters: {'n_factors': 32, 'loss': 'bpr', 'lr': 0.014024858750701568, 'num_threads': 2, 'user_alpha': 0.943136891202081, 'item_alpha': 0.8700960356580814}. Best is trial 1 with value: 6.064876061329331e-07.
[I 2023-12-12 11:21:47,255] Trial 3 finished with value: 0.00018003579082063824 and parameters: {'n_factors': 32, 'loss': 'logistic', 'lr': 0.002636659855303529, 'num_threads': 3, 'user_alpha': 0.02547919604957638, 'item_alpha': 0.9311540291721854}. Best is trial 3 with value: 0.00018003579082063824.
[I 2023-12-12 11:23:02,184] Tria

In [None]:
print(f'Лучшее значение MAP@10: {study.best_value}')
print(f'Лучшие параметры: {study.best_params}')

Лучшее значение MAP@10: 0.0749323687314942
Лучшие параметры: {'n_factors': 64, 'loss': 'warp', 'lr': 0.003970848900031132, 'num_threads': 1, 'user_alpha': 0.1381916442865091, 'item_alpha': 0.829459703354334}


# Cross-validation

Сравниваем ALS, LightFM и Popular:

In [32]:
models = {
    "popular": PopularModel(),
    "ALS": ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=32,
            random_state=RANDOM_STATE,
            num_threads=2,
            regularization=0.004152179888032442,
            iterations=14
        ),
        fit_features_together=True,
    ),
    "LF": LightFMWrapperModel(
        LightFM(
            no_components=64,
            loss="warp",
            random_state=RANDOM_STATE,
            learning_rate=0.003970848900031132,
            user_alpha=0.1381916442865091,
            item_alpha=0.829459703354334,
        ),
        epochs=N_EPOCHS,
        num_threads=1,
    ),
}

## Splitter

In [None]:
TEST_SIZE = "7D"
N_SPLITS = 5

In [None]:
splitter = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [None]:
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-07-11 00:00:00', freq='7D'),
  Timestamp('2021-07-18 00:00:00', freq='7D')),
 (Timestamp('2021-07-18 00:00:00', freq='7D'),
  Timestamp('2021-07-25 00:00:00', freq='7D')),
 (Timestamp('2021-07-25 00:00:00', freq='7D'),
  Timestamp('2021-08-01 00:00:00', freq='7D')),
 (Timestamp('2021-08-01 00:00:00', freq='7D'),
  Timestamp('2021-08-08 00:00:00', freq='7D')),
 (Timestamp('2021-08-08 00:00:00', freq='7D'),
  Timestamp('2021-08-15 00:00:00', freq='7D'))]

## Cross-val

In [None]:
results = cross_validate(dataset, splitter, metrics, models, k=10, filter_viewed=True)

In [None]:
df_quality = (
    pd.DataFrame.from_dict(results["metrics"]).groupby("model").mean().drop("i_split", axis=1).T
)

In [None]:
df_quality.style.highlight_max(color="lightgreen", axis=1)

model,ALS,LF,popular
precision@1,0.088242,0.083193,0.081874
recall@1,0.054418,0.053097,0.050963
precision@5,0.053496,0.053088,0.056047
recall@5,0.153975,0.153566,0.161825
precision@10,0.034597,0.034977,0.036284
recall@10,0.19335,0.196863,0.20379
MAP@1,0.054418,0.053097,0.050963
MAP@5,0.092473,0.089525,0.092119
MAP@10,0.098844,0.096646,0.099115


Как сложно побить популярное)))

#Тренируем

In [33]:
user_features = get_user_features(users, interactions, ["sex", "age", "income"])
item_features = get_item_features(items, interactions)

In [34]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [35]:
model = LightFMWrapperModel(
        LightFM(
            no_components=64,
            loss="warp",
            random_state=RANDOM_STATE,
            learning_rate=0.003970848900031132,
            user_alpha=0.1381916442865091,
            item_alpha=0.829459703354334,
        ),
        epochs=N_EPOCHS,
        num_threads=1,
    )

In [36]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7da18924a5c0>

Запишем рекомендации в файл, если вдруг с моделью что-то пойдет не так

In [None]:
ALL_USERS = interactions[Columns.User].unique()

In [44]:
interactions[interactions[Columns.User] == 176549][Columns.User].unique()

array([176549])

In [None]:
all_recos = model.recommend(
    users=ALL_USERS,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)[[Columns.User, Columns.Item]]

In [None]:
all_recos[all_recos['user_id'] == 176549]

Unnamed: 0,user_id,item_id
0,176549,10440
1,176549,15297
2,176549,4151
3,176549,2657
4,176549,3734
5,176549,13865
6,176549,8636
7,176549,4880
8,176549,6809
9,176549,4740


In [None]:
all_recos.to_csv("/content/LF_recos.csv")

# ANN

In [None]:
user_vectors, item_vectors = model.get_vectors(dataset)
ann_lf = UserToItemAnnRecommender(
    user_vectors=user_vectors,
    item_vectors=item_vectors,
    user_id_map=dataset.user_id_map,
    item_id_map=dataset.item_id_map,
)
ann_lf.fit()

In [None]:
ann_lf.get_item_list_for_user(176549, top_n=10).tolist()

In [37]:
import pickle

pickle.dump(model, open("/content/LF_model.pkl", "wb"))

In [39]:
model_lf = pickle.load(open("/content/LF_model.pkl", "rb"))

In [76]:
user_list = [176549]

In [None]:
reco = model_lf.recommend(users=user_list,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)['item_id'].values.tolist()

In [72]:
reco

[10440, 15297, 4151, 2657, 3734, 13865, 8636, 4880, 6809, 4740]

In [None]:
model_lf.get_item_list_for_user(176549, top_n=10).tolist()