In [1]:
%%capture
!pip install rectools lightfm

In [2]:
import os
import threadpoolctl
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel, ImplicitItemKNNWrapperModel, EASEModel, DSSMModel
from rectools import Columns
from rectools.dataset import Dataset

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k,train_test_split
from implicit.cpu.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender,TFIDFRecommender
from rectools.models import ImplicitItemKNNWrapperModel, RandomModel, PopularModel
from implicit.approximate_als import FaissAlternatingLeastSquares
from implicit.cpu.bpr import BayesianPersonalizedRanking
from implicit.als import AlternatingLeastSquares
from rectools.models import LightFMWrapperModel
from lightfm import LightFM
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter, cross_validate, LastNSplitter

warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid")

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x78ac4b8b4490>

## DatasetPrepare

In [3]:
train_data = pd.read_csv('/kaggle/input/bdimo-ds/train-11.csv')
items_data = pd.read_csv('/kaggle/input/bdimo-ds/video.csv')
users_data = pd.read_csv('/kaggle/input/users-bdimo/user.csv')
owners_data = pd.read_csv('/kaggle/input/bdimo-ds/owner.csv')

In [4]:
train_data['interaction_type'] = train_data['interaction_type'].map({'view':1,'like':10})
train_data['timestamp'] = pd.to_datetime(train_data['timestamp'])

In [5]:
train_data = train_data.rename({
    'interaction_type':Columns.Weight,
    'timestamp':Columns.Datetime,
    'user_id':Columns.User,
    'video_id':Columns.Item
},axis=1)

### User Features

In [6]:
users_data['city_id'] = users_data['city_id'].fillna('NaN')
users_data['birth_city_id'] = users_data['birth_city_id'].fillna('NaN')
users_data['create_date'] = pd.to_datetime(users_data['create_date']).apply(lambda x: x.year * 366 + x.dayofyear)
users_data['create_date'] = users_data['create_date'] - users_data['create_date'].min()

user_features_frames = []
for feature in ["gender", "age", "language","create_date"]:
    feature_frame = users_data.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

### Item Features

In [7]:
items_data['upload_timestamp'] -= items_data['upload_timestamp'].min()
items_data = items_data.merge(owners_data,how='left')
items_data['last_active_date'] -= items_data['last_active_date'].min()
items_features_frames = []

for feature in ["duration", "upload_timestamp", "subscribers_count", "last_active_date"]:
    feature_frame = items_data.reindex(columns=['video_id', feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    items_features_frames.append(feature_frame)
items_features = pd.concat(items_features_frames)

### BuidDataset

In [8]:
dataset_full_features = Dataset.construct(
    interactions_df=train_data,
    user_features_df=user_features,
    cat_user_features=["gender", "age", "language"],
    item_features_df=items_features,
    #cat_item_features=["genre", "content_type"],
)

## Model Development

In [9]:
def make_base_model(factors: int, regularization: float, alpha: float, fit_features_together: bool=False):
    return ImplicitALSWrapperModel(
        AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            alpha=alpha,
            random_state=RANDOM_STATE,
            use_gpu=False,
            num_threads = NUM_THREADS,
            iterations=ITERATIONS),
        fit_features_together = fit_features_together,
        )

In [10]:
model = ImplicitItemKNNWrapperModel(model=BM25Recommender(K=20, K1=0.05, B=0.25))
# model = ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=10))
# model = PopularModel()
# model = LightFMWrapperModel(LightFM(no_components=256, loss="bpr"))

In [11]:
model.fit(dataset_full_features)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x78ab54401ae0>

## Predict

In [12]:
recos = model.recommend(
    users=train_data['user_id'].unique(),
    dataset=dataset_full_features,
    k=10,
    filter_viewed=True,
)

In [13]:
recos # https://colab.research.google.com/drive/1WvCzgj0MNUf07bPUCVgdXT8xFF-bmQmx

Unnamed: 0,user_id,item_id,score,rank
0,126492,13009,55957.839844,1
1,126492,6,52052.812500,2
2,126492,89274,50208.113281,3
3,126492,14803,45282.414062,4
4,126492,126229,41504.066406,5
...,...,...,...,...
1528231,146066,2699,25975.398438,6
1528232,146066,81000,25502.791016,7
1528233,146066,228660,24940.341797,8
1528234,146066,225449,24859.593750,9


## CrossVal

In [14]:
splitter = LastNSplitter(#TimeRangeSplitter(
    n=10,
    n_splits=1,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [27]:
# Take few simple models to compare
models = {
    "random": RandomModel(random_state=42),
    "popular": PopularModel(),
    "most_rated": PopularModel(popularity="sum_weight"),
    "tfidf_k=10": ImplicitItemKNNWrapperModel(model=TFIDFRecommender(K=100)),
    "bm25_k=10_k1=0.05_b=0.1": ImplicitItemKNNWrapperModel(model=BM25Recommender(K=100, K1=0.05, B=0.1)),
}
#models = {
#    'dssm': DSSMModel(n_factors=128,max_epochs=5,batch_size=128,trainer_accelerator='gpu')
#}
#models = {
#    'lightfm': LightFMWrapperModel(LightFM(no_components=128,k=5,n=10,loss='warp',learning_rate=0.05,random_state=56))
#}

# We will calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "prec@1": Precision(k=1),
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@10": Serendipity(k=10),
}


In [29]:
cv_results = cross_validate(
    dataset=dataset_full_features,
    splitter=splitter,
    models=models,
    metrics=metrics,
    k=10,
    filter_viewed=True,
)

In [23]:
pd.DataFrame(cv_results["metrics"])

Unnamed: 0,model,i_split,prec@1,prec@10,recall@10,novelty@10,serendipity@10
0,lightfm,0,0.002282,0.000656,0.000674,11.063274,2e-06


In [30]:
pd.DataFrame(cv_results["metrics"])

Unnamed: 0,model,i_split,prec@1,prec@10,recall@10,novelty@10,serendipity@10
0,random,0,2.6e-05,3.5e-05,3.6e-05,16.095535,1.354057e-07
1,popular,0,0.016817,0.021659,0.022467,3.809385,9.793607e-08
2,most_rated,0,0.021879,0.022802,0.023622,4.170235,5.039565e-07
3,tfidf_k=10,0,0.038886,0.030845,0.032318,11.190119,9.515151e-05
4,bm25_k=10_k1=0.05_b=0.1,0,0.053978,0.041253,0.042956,5.003779,4.636136e-05
