# Imports

In [35]:
%pip install rectools[lightfm] >>None
%pip install --upgrade rectools[all] >>None
%pip install Jinja2 >>None
%pip install optuna >>None
%pip install nmslib >>None
%pip install faiss-cpu >>None


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;4

In [36]:
import pandas as pd
import numpy as np

from rectools.metrics import MAP, calc_metrics
from rectools.models import PopularModel, RandomModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import LightFMWrapperModel

from pathlib import Path
from tqdm import tqdm

from lightfm import LightFM

# Получение и предобработка всех данных

In [37]:
interactions = pd.read_csv('../data_original/interactions.csv')
users = pd.read_csv('../data_original/users.csv')
items = pd.read_csv('../data_original/items.csv')

interactions.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [38]:
# Преобразование столбца 'last_watch_dt' в формат datetime
interactions['last_watch_dt'] = pd.to_datetime(interactions['last_watch_dt'], errors='coerce')
# Удаление строк с некорректными значениями даты
interactions.dropna(subset=['last_watch_dt'], inplace=True)
interactions = interactions.rename(columns={'last_watch_dt': 'datetime'})
# Присвоение веса в зависимости от условия
interactions['weight'] = np.where(interactions['watched_pct'] > 10, 3, 1)
# Разделение на обучающий и тестовый наборы
max_date = interactions['datetime'].max()
train = interactions[interactions['datetime'] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions['datetime'] >= max_date - pd.Timedelta(days=7)].copy()

# Фильтрация "холодных" пользователей
cold_users = test[~test[Columns.User].isin(train[Columns.User])]['user_id'].unique()
test = test[~test['user_id'].isin(cold_users)]

In [39]:
# user features
users.fillna('Unknown', inplace=True)
users = users[users[Columns.User].isin(train[Columns.User])].copy()
user_features = pd.melt(users, id_vars=Columns.User, value_vars=["sex", "age", "income"], var_name='feature', value_name='value')

# item features
items = items[items[Columns.Item].isin(train[Columns.Item])].copy()
items.nunique()

item_id         15565
content_type        2
title           14937
title_orig      10377
release_year      105
genres           2720
countries         676
for_kids            2
age_rating          6
studios            38
directors        7809
actors          12671
description     15225
keywords        15123
dtype: int64

In [40]:
genre_feature = items.assign(genre=items['genres'].str.lower().str.replace(", ", ",", regex=False).str.split(",")).explode('genre')[['item_id', 'genre']]
genre_feature.columns = ['id', 'value']
genre_feature['feature'] = 'genre'

content_feature = items[[Columns.Item, 'content_type']].copy()
content_feature.columns = ['id', 'value']
content_feature['feature'] = 'content_type'

item_features = pd.concat([genre_feature, content_feature])
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# LightFM 

In [41]:
from rectools.model_selection import TimeRangeSplitter
from rectools.model_selection.cross_validate import cross_validate
from rectools.metrics import MeanInvUserFreq, Serendipity

In [42]:
K = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
# For Lightfm train
N_EPOCHS = 1 
USER_ALPHA = 0 
ITEM_ALPHA = 0 
LEARNING_RATE = 0.05 

In [43]:
n_splits = 3

splitter = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True, 
)
dataset = Dataset.construct(
    interactions_df=interactions,  
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [44]:
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "lightfm": LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS, 
                loss='warp', 
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            )),
}

metrics_name = {
    'MAP': MAP,
    "novelty": MeanInvUserFreq,
    "serendipity": Serendipity,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    metrics[f'{metric_name}@{K}'] = metric(k=K)

In [45]:
cv_results = cross_validate(
    dataset=dataset,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=K,
    filter_viewed=True,
)

In [46]:
pd.DataFrame(cv_results["metrics"])

Unnamed: 0,model,i_split,MAP@10,novelty@10,serendipity@10
0,random,0,0.000183,15.504172,6e-06
1,popular,0,0.113264,3.753619,3e-06
2,lightfm,0,0.112527,4.238316,2.5e-05
3,random,1,0.000149,15.56027,7e-06
4,popular,1,0.096352,3.711794,3e-06
5,lightfm,1,0.096972,4.206342,2.8e-05
6,random,2,0.000163,15.609173,7e-06
7,popular,2,0.086029,3.703142,3e-06
8,lightfm,2,0.089114,4.210284,3.1e-05


In [47]:
pivot_results = (
    pd.DataFrame(cv_results["metrics"]).drop(columns="i_split")
    .groupby(["model"], sort=False).agg(["mean", "std"])
)
mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=0)
    .highlight_max(subset=mean_metric_subset, color='green', axis=0)
)

Unnamed: 0_level_0,MAP@10,MAP@10,novelty@10,novelty@10,serendipity@10,serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
random,0.000165,1.7e-05,15.557872,0.052541,7e-06,1e-06
popular,0.098549,0.01375,3.722852,0.026994,3e-06,0.0
lightfm,0.099537,0.011916,4.218314,0.017434,2.8e-05,3e-06


lightfm по map совсем немного выше чем у popular модели, зато novelty у него лучше.  
К тому же serendipity наилучший среди всех моделей, т.к. и map и novelty имеют хорошие значения.

## Train

In [48]:
models = {}
loss = 'warp'

models[f"LightFM_{loss}_{N_FACTORS}"] = LightFMWrapperModel(
        LightFM(
            no_components=N_FACTORS, 
            loss=loss, 
            random_state=RANDOM_STATE,
            learning_rate=LEARNING_RATE,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS)

In [49]:
metrics_name = {'MAP': MAP}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [50]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"])

In [51]:
splitter.get_test_fold_borders(dataset.interactions)

[(Timestamp('2021-07-04 00:00:00', freq='14D'),
  Timestamp('2021-07-18 00:00:00', freq='14D')),
 (Timestamp('2021-07-18 00:00:00', freq='14D'),
  Timestamp('2021-08-01 00:00:00', freq='14D')),
 (Timestamp('2021-08-01 00:00:00', freq='14D'),
  Timestamp('2021-08-15 00:00:00', freq='14D'))]

In [52]:
123 in dataset.user_id_map.external_ids

True

In [53]:
TEST_USERS = test[Columns.User].unique()

In [54]:
models

{'LightFM_warp_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7473fd917bb0>}

In [55]:
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K,
        filter_viewed=True)
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model LightFM_warp_32...


In [56]:
model.recommend(
    users=[1],
    dataset=dataset,
    k=10,
    filter_viewed=False)

Unnamed: 0,user_id,item_id,score,rank
0,1,15297,-234.298111,1
1,1,10440,-234.300827,2
2,1,9728,-234.519989,3
3,1,4151,-234.621445,4
4,1,13865,-234.641357,5
5,1,3734,-234.711838,6
6,1,2657,-234.854706,7
7,1,4880,-235.121262,8
8,1,7571,-235.132324,9
9,1,142,-235.146667,10


In [57]:
metric_values = calc_metrics(metrics, recos, test, train)["MAP@10"]
metric_values

0.08122910925562059

## Tuning

In [58]:
import logging
import optuna
logging.basicConfig(filename="optuna.log",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG
                    )

In [59]:
def objective(trial):
    param = {
        "no_components": trial.suggest_int("no_components", 4, 64),
        "loss": trial.suggest_categorical("loss", ['logistic', 'bpr', 'warp']),
        "random_state": RANDOM_STATE,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "user_alpha": trial.suggest_float("user_alpha", 0, 0.3),
        "item_alpha": trial.suggest_float("item_alpha", 0, 0.3),
    }
    model = LightFMWrapperModel(
        LightFM(
            **param,
        ),
        epochs=N_EPOCHS,
        num_threads=NUM_THREADS,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    metric_value = calc_metrics({"MAP@10": MAP(k=10)}, recos, test, train)["MAP@10"]
    logging.info(f"{metric_value=}: {param=}")
    return metric_value

In [60]:
study = optuna.create_study(direction="maximize")
study.enqueue_trial(
    {'no_components': 61, 'loss': 'warp', 'random_state': 42, 'learning_rate': 0.22958684083647646, 'user_alpha': 0.10227312715865738, 'item_alpha': 0.049429867705118334}
)
study.optimize(objective, n_trials=10, show_progress_bar=True, n_jobs=-1)
logging.info(f"Number of finished trials: {len(study.trials)}")

logging.info("Best trial:")
trial = study.best_trial

logging.info(f"  Value: {trial.value}")

logging.info("  Params: ")
for key, value in trial.params.items():
    logging.info(f"    {key}: {value}")

[I 2024-03-12 21:48:48,303] A new study created in memory with name: no-name-dbf4f94d-bbfe-4a47-ae01-25d24406cb4e
Best trial: 4. Best value: 0.0787852:  10%|█         | 1/10 [01:10<10:32, 70.29s/it]

[I 2024-03-12 21:49:58,594] Trial 4 finished with value: 0.07878523986540956 and parameters: {'no_components': 22, 'loss': 'warp', 'learning_rate': 0.06017342156239966, 'user_alpha': 0.1470177880800501, 'item_alpha': 0.274150906686091}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  20%|██        | 2/10 [01:37<05:57, 44.70s/it]

[I 2024-03-12 21:50:25,294] Trial 9 finished with value: 0.07481687128599925 and parameters: {'no_components': 29, 'loss': 'warp', 'learning_rate': 0.06665908642766347, 'user_alpha': 0.11803659202804068, 'item_alpha': 0.2931751415673471}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  30%|███       | 3/10 [01:52<03:37, 31.12s/it]

[I 2024-03-12 21:50:40,343] Trial 6 finished with value: 0.0002245066441358701 and parameters: {'no_components': 6, 'loss': 'logistic', 'learning_rate': 0.03646279349276402, 'user_alpha': 0.19406057542844107, 'item_alpha': 0.18584404575328253}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  40%|████      | 4/10 [02:21<03:01, 30.32s/it]

[I 2024-03-12 21:51:09,440] Trial 2 finished with value: 1.3829078125993964e-06 and parameters: {'no_components': 22, 'loss': 'bpr', 'learning_rate': 0.05897978470425043, 'user_alpha': 0.007998804570789064, 'item_alpha': 0.1443879844380104}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  50%|█████     | 5/10 [03:06<02:59, 35.86s/it]

[I 2024-03-12 21:51:55,128] Trial 3 finished with value: 2.3493552067445346e-06 and parameters: {'no_components': 23, 'loss': 'bpr', 'learning_rate': 0.06570528865781643, 'user_alpha': 0.10354102235047499, 'item_alpha': 0.2766366073420747}. Best is trial 4 with value: 0.07878523986540956.
[I 2024-03-12 21:51:55,163] Trial 0 finished with value: 0.07514689658500993 and parameters: {'no_components': 61, 'loss': 'warp', 'learning_rate': 0.22958684083647646, 'user_alpha': 0.10227312715865738, 'item_alpha': 0.049429867705118334}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  70%|███████   | 7/10 [03:40<01:18, 26.23s/it]

[I 2024-03-12 21:52:28,870] Trial 1 finished with value: 2.1467997471781108e-07 and parameters: {'no_components': 38, 'loss': 'bpr', 'learning_rate': 0.14773169202255307, 'user_alpha': 0.02219021723539999, 'item_alpha': 0.15719779404003423}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852:  80%|████████  | 8/10 [04:46<01:13, 36.69s/it]

[I 2024-03-12 21:53:34,436] Trial 7 finished with value: 6.752012760766325e-05 and parameters: {'no_components': 22, 'loss': 'logistic', 'learning_rate': 0.13453675724016892, 'user_alpha': 0.1883213460591968, 'item_alpha': 0.02741704246911152}. Best is trial 4 with value: 0.07878523986540956.


Best trial: 4. Best value: 0.0787852: 100%|██████████| 10/10 [04:46<00:00, 28.64s/it]

[I 2024-03-12 21:53:34,664] Trial 5 finished with value: 0.0001226123978619921 and parameters: {'no_components': 28, 'loss': 'logistic', 'learning_rate': 0.1826603294074766, 'user_alpha': 0.05986678062431716, 'item_alpha': 0.20249366971597252}. Best is trial 4 with value: 0.07878523986540956.
[I 2024-03-12 21:53:34,673] Trial 8 finished with value: 6.447312489218796e-05 and parameters: {'no_components': 31, 'loss': 'logistic', 'learning_rate': 0.11247531813761556, 'user_alpha': 0.21370803776756472, 'item_alpha': 0.14496522950252635}. Best is trial 4 with value: 0.07878523986540956.





In [61]:
trial.value

0.07878523986540956

In [62]:
trial.params

{'no_components': 22,
 'loss': 'warp',
 'learning_rate': 0.06017342156239966,
 'user_alpha': 0.1470177880800501,
 'item_alpha': 0.274150906686091}

In [63]:
params = trial.params
params['random_state'] = RANDOM_STATE

In [64]:
tuned_model = LightFMWrapperModel(
    LightFM(
        **params
    ),
    epochs=N_EPOCHS,
    num_threads=NUM_THREADS,
)

In [65]:
tuned_model.fit(dataset)
recos = tuned_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K,
    filter_viewed=True,
)
metric_values = calc_metrics(metrics, recos, test, train)

In [66]:
metric_values

{'MAP@1': 0.04088702956835708,
 'MAP@2': 0.055101776969101325,
 'MAP@3': 0.0641351516300865,
 'MAP@4': 0.07138785197404317,
 'MAP@5': 0.07427079859856509,
 'MAP@6': 0.07629739205944286,
 'MAP@7': 0.0775346985629397,
 'MAP@8': 0.07867103678951617,
 'MAP@9': 0.0798433563099741,
 'MAP@10': 0.08094489383062142}

In [67]:
pd.DataFrame(metric_values.items(), columns=['Metric', 'Value'])

Unnamed: 0,Metric,Value
0,MAP@1,0.040887
1,MAP@2,0.055102
2,MAP@3,0.064135
3,MAP@4,0.071388
4,MAP@5,0.074271
5,MAP@6,0.076297
6,MAP@7,0.077535
7,MAP@8,0.078671
8,MAP@9,0.079843
9,MAP@10,0.080945


# ANN

## nmslib

In [68]:
import nmslib

In [69]:
model = tuned_model
user_embeddings, item_embeddings = model.get_vectors(dataset)
item_embeddings.shape

(15565, 24)

In [70]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [71]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15565, 24)


(15565, 25)

In [72]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(896791, 25)

In [73]:
M = 48
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [74]:
K=10
space_name='negdotprod'
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}


In [75]:
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [76]:
augmented_user_embeddings.shape

(896791, 25)

In [77]:
query_matrix = augmented_user_embeddings[:1000, :]

In [78]:
query_matrix.shape

(1000, 25)

In [79]:
user_id = 10973
one_user_matrix = augmented_user_embeddings[user_id, :]


In [80]:
nbrs = index.knnQueryBatch([one_user_matrix], k = K, num_threads = num_threads)

In [81]:
nbrs

[(array([ 44,  23,  15,  30, 162,  78, 350, 113, 132,  86], dtype=int32),
  array([0.00651326, 0.0065137 , 0.00651461, 0.00651531, 0.00651575,
         0.00651682, 0.00651693, 0.0065172 , 0.00651746, 0.00651751],
        dtype=float32))]

## faiss

In [82]:
import faiss
import numpy as np

efC = 200  # Размер приоритетной очереди для "обучения", "создания пространства"
efS = 200  # Размер приоритетной очереди для поиска ближайших элементов
M = 48

num_threads = 4

# Создаем FAISS index типа HNSW на 48 разбиений
index = faiss.index_factory(augmented_user_embeddings.shape[1], f"HNSW{M}", faiss.METRIC_L2)

# Добавляем эмбединги объектов фильмов
index.add(augmented_item_embeddings)

# Устанавливаем параметры efC и efS
index.hnsw.efConstruction = efC
index.hnsw.efSearch = efS

In [83]:
D, I = index.search(np.array([one_user_matrix]), K)


In [84]:
print("Indices of Nearest Neighbors:", I)
print("Distances to Nearest Neighbors:", D)

Indices of Nearest Neighbors: [[ 44  23  15  30 162  78 350 113 132  86]]
Distances to Nearest Neighbors: [[2.0130692 2.01307   2.0130718 2.0130732 2.0130742 2.0130763 2.0130763
  2.013077  2.0130775 2.0130777]]


In [85]:
import dill

path_model = '../models/model_lightfm.dill'
with open(path_model, 'wb') as f:
    dill.dump(model, f)