# Обучение LightFM

In [1]:
!pip -q install rectools==0.4.2
!pip -q install lightfm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.5/102.5 kB[0m [31m891.0 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone


In [2]:
import warnings

# Install a global "ignore" filter: every Python warning raised after this cell is suppressed.
# NOTE(review): no category or module is given, so library deprecation warnings are hidden as well.
warnings.filterwarnings("ignore")

In [3]:
import pickle
import typing as tp
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    NDCG,
    MeanInvUserFreq,
    Precision,
    Recall,
    Serendipity,
    calc_metrics,
)
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.models import LightFMWrapperModel, PopularModel
from tqdm import tqdm
import requests

# Загружаем данные

In [None]:
# Archive of the KION dataset, pinned to a specific commit of the upstream repository.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# The response is used as a context manager so the connection is released even if the
# download is interrupted.
with requests.get(url, stream=True) as req:
    # Fail on HTTP errors instead of silently saving an error page as the zip file.
    req.raise_for_status()
    # Falls back to 0 when the server does not send Content-Length.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    # Both the output file and the progress bar are closed on exit; closing the bar makes it
    # render its final state here rather than leaking into the output of a later cell.
    with open('kion_train.zip', "wb") as fd, tqdm(
        desc='Downloading the kion dataset...',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        # Stream in 1 MiB chunks so the archive is never held in memory as a whole.
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

Downloading the kion dataset...:  98%|█████████▊| 77.6M/78.8M [00:01<00:00, 130MiB/s]

In [None]:
!unzip kion_train.zip -x '__MACOSX/*'

Archive:  kion_train.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: data_original/users.csv  
  inflating: data_original/items.csv  


In [None]:
# Read the three extracted CSV tables in one pass: interactions, users, items.
interactions, users, items = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_original/interactions.csv',
        'data_original/users.csv',
        'data_original/items.csv',
    )
)

In [None]:
# Repoint the shared `Columns.Datetime` name at this dataset's date column.
Columns.Datetime = "last_watch_dt"

# Drop rows whose date string is not exactly 10 characters long (the length of a "%Y-%m-%d" date).
bad_date_rows = interactions.loc[interactions[Columns.Datetime].str.len() != 10].index
interactions.drop(index=bad_date_rows, inplace=True)

# Parse the remaining strings and remember the latest date in the data.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format="%Y-%m-%d")
max_date = interactions[Columns.Datetime].max()

# Interaction weight: 3 where watched_pct exceeds 10, otherwise 1.
interactions[Columns.Weight] = np.where(interactions["watched_pct"].gt(10), 3, 1)

In [None]:
# Time-based split: rows dated within 7 days of max_date (boundary included) go to test.
split_date = max_date - pd.Timedelta(days=7)
watch_dates = interactions[Columns.Datetime]
train = interactions.loc[watch_dates < split_date].copy()
test = interactions.loc[watch_dates >= split_date].copy()

# Remove train rows whose total_dur is below 300.
short_view_rows = train.loc[train["total_dur"] < 300].index
train.drop(index=short_view_rows, inplace=True)

# Remove test rows of cold users, i.e. users that have no rows left in train.
train_users = set(train[Columns.User])
cold_users = set(test[Columns.User]) - train_users
cold_user_rows = test.loc[test[Columns.User].isin(cold_users)].index
test.drop(index=cold_user_rows, inplace=True)

TEST_USERS = test[Columns.User].unique()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 6)
test: (333026, 6)


# Готовим фичи

In [None]:
def get_user_features(users: pd.DataFrame, interactions: pd.DataFrame, features: tp.List[str]):
    """Build a long-format ("id", "value", "feature") table of user features.

    Parameters
    ----------
    users : pd.DataFrame
        Table with a ``Columns.User`` column and one column per feature.
        The frame is not modified.
    interactions : pd.DataFrame
        Only users whose id occurs in its ``Columns.User`` column are kept.
    features : list of str
        Names of the ``users`` columns to unpivot.

    Returns
    -------
    pd.DataFrame
        One row per (kept user, feature) with columns "id", "value" and "feature".
        Missing values in ``users`` are reported as "Unknown".
    """
    # fillna without inplace returns a new frame, so the caller's `users` is left untouched.
    filled_users = users.fillna("Unknown")
    filled_users = filled_users.loc[filled_users[Columns.User].isin(interactions[Columns.User])]
    user_features_frames = []
    for feature in features:
        # reindex yields a fresh two-column frame that is safe to relabel and extend.
        feature_frame = filled_users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features = pd.concat(user_features_frames)
    return user_features

In [None]:
# Long-format user features, restricted to users that occur in train.
user_features = get_user_features(users=users, interactions=train, features=["sex", "age", "income"])

In [None]:
def get_item_features(items: pd.DataFrame, interactions: pd.DataFrame):
    """Build a long-format ("id", "value", "feature") table of item features.

    Two features are produced: "genre" (one row per genre of an item) and
    "content_type" (one row per item).

    Parameters
    ----------
    items : pd.DataFrame
        Table with ``Columns.Item``, "genres" and "content_type" columns.
        The frame is not modified.
    interactions : pd.DataFrame
        Only items whose id occurs in its ``Columns.Item`` column are kept.

    Returns
    -------
    pd.DataFrame
        Genre rows followed by content-type rows, with columns "id", "value" and "feature".
    """
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    # Lower-case the genre string, normalise ", " to "," and split it into a list per item.
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    # The item column is addressed through Columns.Item, consistently with the rest of the function.
    genre_feature = items[[Columns.Item, "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features = pd.concat((genre_feature, content_feature))
    return item_features

In [None]:
# Long-format item features, restricted to items that occur in train.
item_features = get_item_features(items=items, interactions=train)

# Формируем датасет и делаем разбиение как в ноутбуке с семинара

In [None]:
# RecTools dataset over the train interactions; every listed feature is declared categorical.
# NOTE(review): `dataset` is rebuilt from the truncated `interactions` further down before any
# model is fit, so this instance appears to be unused - confirm.
dataset = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
)

In [None]:
# Date range covered by the interactions table.
min_date = interactions[Columns.Datetime].min()
max_date = interactions[Columns.Datetime].max()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")
print(f"Продолжительность: {max_date - min_date}")

# Number of trailing days cut off from the data used below.
ranker_days_count = 30

# Keep only interactions strictly older than the last `ranker_days_count` days.
# NOTE(review): `interactions` is overwritten here, so re-running this cell recomputes max_date
# from the already truncated frame and removes another `ranker_days_count` days.
ranker_period_start = max_date - pd.Timedelta(days=ranker_days_count)
interactions = interactions[interactions[Columns.Datetime] < ranker_period_start]

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [None]:
# Rebuild both feature tables against the truncated `interactions` frame.
user_features = get_user_features(users=users, interactions=interactions, features=["sex", "age", "income"])
item_features = get_item_features(items=items, interactions=interactions)

Downloading the kion dataset...: 100%|██████████| 78.8M/78.8M [00:20<00:00, 130MiB/s]

In [None]:
# RecTools dataset over the truncated interactions; every listed feature is declared categorical.
dataset = Dataset.construct(
    interactions_df=interactions,
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
)

In [None]:
# Underlying LightFM estimator: 8 latent components, WARP loss, fixed seed.
lightfm_model = LightFM(
    no_components=8,
    loss="warp",
    learning_rate=0.05,
    user_alpha=0.3,
    item_alpha=0.2,
    random_state=42,
)

# RecTools wrapper that trains the estimator for one epoch on two threads.
model = LightFMWrapperModel(lightfm_model, epochs=1, num_threads=2)

In [None]:
# Train the wrapped LightFM model on `dataset`; the returned model object is what the cell displays.
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7ab67f2331f0>

In [None]:
# Number of candidates requested per user.
top_N = 100

# Recommend for every user known to the dataset, excluding items the user already interacted with.
candidates = model.recommend(
    users=dataset.user_id_map.external_ids,
    dataset=dataset,
    k=top_N,
    filter_viewed=True,
)

In [None]:
# Prefix the model outputs with "lfm_" to mark them as LightFM's rank and score.
candidates = candidates.rename(columns={"rank": "lfm_rank", "score": "lfm_score"})

In [None]:
# Display the candidates table (pandas elides the middle rows of a long frame).
candidates

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank
0,176549,10440,0.001207,1
1,176549,2150,0.001033,2
2,176549,12138,0.001013,3
3,176549,4240,0.000903,4
4,176549,10843,0.000795,5
...,...,...,...,...
72087495,805174,9714,0.000305,96
72087496,805174,3841,0.000302,97
72087497,805174,10958,0.000301,98
72087498,805174,8639,0.000300,99


# Сохраняем кандидатов и модель

In [None]:
# Colab-only import: this cell works only inside a Google Colab runtime.
from google.colab import drive

# Mount Google Drive at /content/drive so the notebook can write under /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Write the candidates table to the mounted Google Drive as CSV, without the DataFrame index.
candidates.to_csv("/content/drive/MyDrive/candidates_lfm_feats.csv", index=False)

In [None]:
# Destination of the pickled, fitted model wrapper.
MODEL_PATH = "/content/LightFM_warp_8.pkl"

# The context manager flushes and closes the file handle as soon as the model is written;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(model, model_file)