In [None]:
%pip install -U implicit catboost

In [None]:
import json
from pathlib import Path
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from implicit.als import AlternatingLeastSquares

from catboost import (
    CatBoostClassifier, 
    Pool
)
from catboost.metrics import (
    BalancedAccuracy, 
    Logloss
)

### Load datasets

In [None]:
# Root directory of the input data; every file loaded below is read relative to it.
data_path = Path('/kaggle/input/neymark-ml-recsys')

def meta_to_df(meta_raw):
    """Flatten the raw catalogue mapping into a tabular frame.

    Parameters
    ----------
    meta_raw : mapping
        Maps an element id (anything ``int()`` accepts) to a dict holding at
        least the ``'duration'`` and ``'type'`` entries.

    Returns
    -------
    pd.DataFrame
        One row per catalogue entry with the columns ``element_uid`` (int),
        ``duration`` (raw duration converted to float and multiplied by 60)
        and ``type`` (copied unchanged).
    """
    entries = list(meta_raw.items())
    return pd.DataFrame({
        'element_uid': [int(uid) for uid, _ in entries],
        # x60 rescales the catalogue duration - presumably minutes -> seconds
        # so that it is comparable with `watched_time`; TODO confirm units.
        'duration': [float(info['duration']) * 60 for _, info in entries],
        'type': [info['type'] for _, info in entries],
    })

# Interaction logs: explicit ratings, watch transactions and bookmarks.
ratings = pd.read_csv(data_path / 'train_ratings.csv')
transactions = pd.read_csv(data_path / 'train_transactions.csv')
# Every bookmark row gets a constant flag so that a later left-merge onto the
# transactions can distinguish "bookmarked" (1) from "no match" (NaN).
bookmarks = pd.read_csv(data_path / 'train_bookmarks.csv').assign(bookmark=1)

# Item catalogue: parsed from JSON, then flattened into a DataFrame.
with open(data_path / 'catalogue.json', 'r') as f:
    meta_raw = json.load(f)
meta = meta_to_df(meta_raw)

### Functions to train/predict ALS

In [None]:
def add_bookmarks_data(transactions, bookmarks):
    """Attach a ``bookmark`` flag to every transaction row.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``element_uid`` and ``user_uid``.
    bookmarks : pd.DataFrame
        Must contain ``element_uid``, ``user_uid`` and ``bookmark``; any other
        column is ignored.

    Returns
    -------
    pd.DataFrame
        ``transactions`` left-joined with the bookmark flag on the
        (element, user) pair; rows without a matching bookmark get ``0``.
    """
    bookmark_flags = bookmarks[['element_uid', 'user_uid', 'bookmark']]
    flagged = transactions.merge(
        bookmark_flags,
        on=['element_uid', 'user_uid'],
        how='left',
    )
    # Only the flag column is filled; NaNs in other columns are left untouched.
    return flagged.fillna(value={"bookmark": 0})

In [None]:
def add_meta_to_transactions(transactions: pd.DataFrame, meta:pd.DataFrame) -> pd.DataFrame:
    """Join catalogue metadata onto transactions and derive an implicit score.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``element_uid`` and ``watched_time``. An optional
        ``bookmark`` column (> 0 means bookmarked) enables the bookmark bonus;
        when the column is absent every row is treated as not bookmarked.
    meta : pd.DataFrame
        Must contain ``element_uid``, ``duration`` and ``type``.

    Returns
    -------
    pd.DataFrame
        The left-joined frame with two extra columns:

        * ``watched_ratio`` = ``watched_time / duration``;
        * ``score`` (int64 in 1..10), computed as follows. The base score is 1.
          When ``duration > 0``: a ``'movie'`` watched for more than 30 % gets
          ``int(ratio * 9) + 1``; any other type watched for more than one
          full duration gets ``int(ratio / 2) + 1``. A bookmarked row with
          ``ratio <= 0.3`` gets +10. The result is capped at 10.

    Notes
    -----
    The score is computed with vectorised masks instead of a row-wise
    ``DataFrame.apply``; the arithmetic is the same as the per-row rule above.
    """
    transactions_with_meta = transactions.merge(
        meta[['element_uid', 'duration', 'type']],
        on='element_uid',
        how='left',
    )

    transactions_with_meta['watched_ratio'] = (
        transactions_with_meta['watched_time'] /
        transactions_with_meta['duration']
    )

    ratio = transactions_with_meta['watched_ratio']
    ratio_values = ratio.to_numpy(dtype='float64')
    # Comparisons against NaN are False, so rows without metadata keep score 1.
    has_duration = transactions_with_meta['duration'] > 0
    is_movie = transactions_with_meta['type'] == 'movie'

    score = np.ones(len(transactions_with_meta), dtype='int64')

    movie_watched = (has_duration & is_movie & (ratio > 0.3)).to_numpy()
    # astype truncates toward zero, matching int() for these positive ratios.
    score[movie_watched] = (ratio_values[movie_watched] * 9).astype('int64') + 1

    other_watched = (has_duration & ~is_movie & (ratio > 1)).to_numpy()
    score[other_watched] = (ratio_values[other_watched] / 2).astype('int64') + 1

    # The bookmark bonus is independent of the duration check above.
    if 'bookmark' in transactions_with_meta.columns:
        bookmarked_barely_watched = (
            (transactions_with_meta['bookmark'] > 0) & (ratio <= 0.3)
        ).to_numpy()
        score[bookmarked_barely_watched] += 10

    transactions_with_meta['score'] = np.minimum(score, 10)

    return transactions_with_meta


def add_ratings_to_transactions(transactions, ratings):
    """Outer-join explicit ratings onto transactions and back-fill the score.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``score``.
    ratings : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``rating``.

    Returns
    -------
    pd.DataFrame
        Union of both interaction sets keyed by (user, element). Where
        ``score`` is missing (pairs that only appear in ``ratings``) it is
        replaced by ``rating``; existing scores are kept as they are.
    """
    transactions_with_ratings = transactions.merge(
        ratings[['user_uid', 'element_uid', 'rating']],
        on=['user_uid', 'element_uid'],
        how='outer',
    )

    # Assign the result back instead of `df['score'].fillna(..., inplace=True)`:
    # the chained in-place form works on a temporary column object and does not
    # update the frame when pandas Copy-on-Write is active.
    transactions_with_ratings['score'] = (
        transactions_with_ratings['score']
        .fillna(transactions_with_ratings['rating'])
    )

    return transactions_with_ratings


def encode_tfidf_coo(transactions: pd.DataFrame) -> pd.DataFrame:
    """Compute a TF-IDF style weight for every (user, element) row.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``score``.

    Returns
    -------
    pd.DataFrame
        Copy of the ``user_uid`` / ``element_uid`` columns plus ``value``,
        where ``value = tf * idf`` with

        * ``tf``  = row score / sum of scores of the same user;
        * ``idf`` = log(number of rows / number of rows of the same element).
    """
    user_score_total = (
        transactions.groupby('user_uid')['score'].transform('sum').values
    )
    element_row_count = (
        transactions.groupby('element_uid')['user_uid'].transform('size').values
    )

    term_frequency = transactions['score'].values / user_score_total
    # NOTE(review): the numerator is the total number of rows, not the number
    # of distinct users - confirm this is the intended "document count".
    inverse_frequency = np.log(len(transactions) / element_row_count)

    weights = transactions[['user_uid', 'element_uid']].copy()
    weights['value'] = term_frequency * inverse_frequency
    return weights


def encode_tfidf(transactions: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    """Build the sparse user x element TF-IDF matrix and its id encoders.

    Parameters
    ----------
    transactions : pd.DataFrame
        Must contain ``user_uid``, ``element_uid`` and ``score``.

    Returns
    -------
    (user_encoder, element_encoder, tfidf_csr)
        Two fitted ``LabelEncoder`` objects mapping raw ids to row / column
        positions, and a float32 CSR matrix with one row per distinct user and
        one column per distinct element.
    """
    weights = encode_tfidf_coo(transactions)

    user_encoder = LabelEncoder()
    element_encoder = LabelEncoder()
    row_positions = user_encoder.fit_transform(transactions['user_uid'].values)
    col_positions = element_encoder.fit_transform(transactions['element_uid'].values)

    matrix_shape = (
        weights['user_uid'].nunique(),
        weights['element_uid'].nunique(),
    )
    cell_values = weights['value'].astype('float32').values

    # Coordinate-style construction: repeated (row, col) pairs are summed by
    # scipy when the matrix is built.
    tfidf_csr = csr_matrix(
        (cell_values, (row_positions, col_positions)),
        shape=matrix_shape,
    )

    return user_encoder, element_encoder, tfidf_csr


def als_fit_predict(
    transactions_csr: csr_matrix,
    factors: int = 128,
    iterations: int = 100,
    alpha: float = 40.0,
    n_recommendations: int = 100,
    random_state=42,
):
    """Fit implicit ALS on a user x item matrix and recommend for every user.

    Parameters
    ----------
    transactions_csr : csr_matrix
        User x item interaction weights (rows are users).
    factors, iterations, alpha
        Forwarded to ``AlternatingLeastSquares``; the defaults are the values
        that used to be hard-coded here.
    n_recommendations : int
        Number of items requested per user (``N`` of ``recommend``).
    random_state
        Seed forwarded to ``AlternatingLeastSquares`` so that factor
        initialisation is repeatable between runs. Pass ``None`` for the
        library's unseeded behaviour.

    Returns
    -------
    (recommendations_matrix, recommendations_scores)
        The two arrays returned by ``als.recommend`` for user rows
        ``0 .. n_users - 1``: recommended item positions and their scores.
    """
    als = AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        alpha=alpha,
        calculate_training_loss=True,
        random_state=random_state,
    )
    als.fit(transactions_csr)

    all_user_rows = np.arange(0, transactions_csr.shape[0])
    recommendations_matrix, recommendations_scores = als.recommend(
        all_user_rows,
        transactions_csr,
        N=n_recommendations,
        # Items the user already interacted with are not useful as candidates.
        filter_already_liked_items=True,
    )

    return recommendations_matrix, recommendations_scores


def als_recommendations_to_df(
    recommendations_matrix: np.ndarray, 
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder, 
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    """Convert batched ALS output into a long (user, item, score) frame.

    Parameters
    ----------
    recommendations_matrix : np.ndarray
        Item positions; row ``i`` holds the recommendations of user row ``i``.
    recommendations_scores : np.ndarray
        Scores with the same shape as ``recommendations_matrix``.
    user_encoder, item_encoder
        Fitted encoders; only their ``classes_`` arrays are used to translate
        positions back to the original ids.
    user_key, item_key : str
        Names of the id columns in the result.

    Returns
    -------
    pd.DataFrame
        One row per recommendation with columns ``score``, ``user_key`` and
        ``item_key`` (in that order). Rows are grouped by user and keep the
        per-user order of the input. An item position without an entry in
        ``item_encoder.classes_`` yields a missing item id.

    Raises
    ------
    ValueError
        If the two input arrays have different shapes.

    Notes
    -----
    The arrays are flattened with numpy rather than wrapped in per-user
    Python lists and exploded, so ``score`` keeps its numeric dtype instead of
    becoming an ``object`` column.
    """
    item_positions = np.asarray(recommendations_matrix)
    scores = np.asarray(recommendations_scores)
    if item_positions.shape != scores.shape:
        raise ValueError(
            'recommendations_matrix and recommendations_scores must have the '
            f'same shape, got {item_positions.shape} and {scores.shape}'
        )

    n_users = item_positions.shape[0]
    recs_per_user = item_positions.shape[1] if item_positions.ndim > 1 else 1

    recommendations_indices = pd.DataFrame({
        'user_index': np.repeat(np.arange(0, n_users), recs_per_user),
        'item_index': item_positions.reshape(-1),
        'score': scores.reshape(-1),
    })

    user_mapping = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })

    item_mapping = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    recommendations = (
        recommendations_indices
        .merge(user_mapping, on='user_index', how='left')
        .merge(item_mapping, on='item_index', how='left')
        # Drop the positional helper columns; keep the established column order.
        [['score', user_key, item_key]]
    )

    return recommendations


def run_als(
    transactions: pd.DataFrame, 
    meta: pd.DataFrame, 
    ratings: pd.DataFrame,
    bookmarks: pd.DataFrame
) -> pd.DataFrame:
    """Run the full ALS candidate-generation stage.

    Steps: attach bookmark flags, join metadata and score each transaction,
    add explicit ratings, drop rows without a known duration, build the TF-IDF
    matrix, fit ALS and convert its output to a long frame.

    Parameters
    ----------
    transactions : pd.DataFrame
        Watch log with at least ``user_uid``, ``element_uid``, ``watched_time``.
    meta : pd.DataFrame
        Catalogue frame with ``element_uid``, ``duration``, ``type``.
    ratings : pd.DataFrame
        Explicit ratings with ``user_uid``, ``element_uid``, ``rating``.
    bookmarks : pd.DataFrame
        Bookmarks with ``user_uid``, ``element_uid``, ``bookmark``.

    Returns
    -------
    pd.DataFrame
        ALS candidates with columns ``score``, ``user_uid``, ``element_uid``.
    """
    print('Preprocess transactions')
    # Bookmark flags have to be attached before scoring: the score computed in
    # add_meta_to_transactions depends on the 'bookmark' column.
    if 'bookmark' not in transactions.columns:
        transactions = add_bookmarks_data(transactions, bookmarks)
    transactions = add_meta_to_transactions(transactions, meta)
    transactions = add_ratings_to_transactions(transactions, ratings)
    # NOTE(review): pairs that exist only in `ratings` have no duration after
    # the outer merge, so this filter removes them as well - confirm intended.
    transactions = transactions[transactions["duration"].notna()]

    print('Compute TF-IDF')
    user_encoder, element_encoder, transactions_csr = \
        encode_tfidf(transactions)

    print('Run ALS')
    recommendations_item_indices, recommendations_scores = \
        als_fit_predict(transactions_csr)

    print('Postprocess ALS prediction')
    recommendations = als_recommendations_to_df(
        recommendations_item_indices,
        recommendations_scores,
        user_encoder,
        element_encoder,
        user_key='user_uid',
        item_key='element_uid',
    )

    return recommendations

### Functions to prepare features for CatBoost and train/predict CatBoost 

In [None]:
def merge_item_feature(item_features, new_feature):
    """Left-join ``new_feature`` onto ``item_features`` by ``element_uid``.

    Every row of ``item_features`` is kept; items missing from
    ``new_feature`` get NaN in the added columns. Returns a new frame.
    """
    return item_features.merge(new_feature, how='left', on='element_uid')


def merge_user_feature(iser_features, new_feature):
    """Left-join ``new_feature`` onto the user feature frame by ``user_uid``.

    Every row of ``iser_features`` is kept; users missing from
    ``new_feature`` get NaN in the added columns. Returns a new frame.
    """
    return iser_features.merge(new_feature, how='left', on='user_uid')


def add_item_popularity_feature(item_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add the ``element_popularity`` column to the item feature frame.

    Popularity is the number of transaction rows of an element divided by the
    number of distinct elements present in ``transactions``.

    Returns the item feature frame with the new column left-joined on
    ``element_uid``.
    """
    distinct_item_count = transactions['element_uid'].nunique()
    popularity = (
        transactions
        .groupby('element_uid')
        .size()
        .div(distinct_item_count)
        .reset_index(name='element_popularity')
    )
    return merge_item_feature(item_features, popularity)


def add_item_bookmark_count_feature(item_features: pd.DataFrame, bookmarks: pd.DataFrame):
    """Add the ``element_bookmark_count`` column to the item feature frame.

    Parameters
    ----------
    item_features : pd.DataFrame
        Item feature frame keyed by ``element_uid``.
    bookmarks : pd.DataFrame
        Bookmark log; only ``element_uid`` is used.

    Returns
    -------
    pd.DataFrame
        ``item_features`` with the number of bookmark rows per element; items
        that were never bookmarked get ``0``.
    """
    bookmarks_per_item = (
        bookmarks
        .groupby('element_uid')
        .size()
        .reset_index(name='element_bookmark_count')
    )

    item_features = merge_item_feature(item_features, bookmarks_per_item)
    # Assign back rather than calling fillna(inplace=True) on the selected
    # column: the chained in-place form does not update the frame when pandas
    # Copy-on-Write is active.
    item_features['element_bookmark_count'] = (
        item_features['element_bookmark_count'].fillna(0)
    )

    return item_features


def add_user_watch_count_feature(user_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add ``user_watch_count``: the number of transaction rows per user.

    Returns the user feature frame with the new column left-joined on
    ``user_uid``.
    """
    rows_per_user = transactions.groupby('user_uid').size()
    watch_count = rows_per_user.rename('user_watch_count').reset_index()
    return merge_user_feature(user_features, watch_count)


def add_user_watch_time_std(user_features: pd.DataFrame, transactions: pd.DataFrame):
    """Add ``user_watch_time_std``: per-user std of ``watched_time``.

    Uses the pandas default (sample) standard deviation, so a user with a
    single transaction gets NaN.

    Returns the user feature frame with the new column left-joined on
    ``user_uid``.
    """
    std_per_user = transactions.groupby('user_uid')['watched_time'].std()
    watch_time_std = std_per_user.rename('user_watch_time_std').reset_index()
    return merge_user_feature(user_features, watch_time_std)


def generate_item_features(transactions: pd.DataFrame, bookmarks: pd.DataFrame) -> pd.DataFrame:
    """Build the per-item feature table.

    Starts from the distinct ``element_uid`` values seen in ``transactions``
    and adds the popularity and bookmark-count features.
    """
    distinct_items = transactions['element_uid'].unique()
    features = pd.DataFrame({'element_uid': distinct_items})
    features = add_item_popularity_feature(features, transactions)
    return add_item_bookmark_count_feature(features, bookmarks)


def generate_user_features(transactions: pd.DataFrame) -> pd.DataFrame:
    """Build the per-user feature table.

    Starts from the distinct ``user_uid`` values seen in ``transactions`` and
    adds the watch-count and watch-time-std features, in that order.
    """
    distinct_users = transactions['user_uid'].unique()
    features = pd.DataFrame({'user_uid': distinct_users})
    for add_feature in (add_user_watch_count_feature, add_user_watch_time_std):
        features = add_feature(features, transactions)
    return features


def enrich_interactions(
    interactions: pd.DataFrame, 
    transactions: pd.DataFrame, 
    bookmarks: pd.DataFrame
) -> pd.DataFrame:
    """Attach item-level and user-level features to (user, element) pairs.

    Parameters
    ----------
    interactions : pd.DataFrame
        Pairs to enrich; must contain ``user_uid`` and ``element_uid``.
    transactions, bookmarks : pd.DataFrame
        Logs from which the features are computed.

    Returns
    -------
    pd.DataFrame
        ``interactions`` with the item features left-joined on ``element_uid``
        followed by the user features left-joined on ``user_uid``.
    """
    print('Generating item features')
    per_item = generate_item_features(transactions, bookmarks)

    print('Generating user features')
    per_user = generate_user_features(transactions)

    print('Merging features to interactions')
    with_item_features = interactions.merge(per_item, on='element_uid', how='left')
    return with_item_features.merge(per_user, on='user_uid', how='left')


def score_interactions(
    interactions_featurized: pd.DataFrame,
    catboost_model: CatBoostClassifier, 
) -> pd.DataFrame:
    """Score featurized (user, element) pairs with a fitted classifier.

    Every column except ``user_uid`` and ``element_uid`` is passed to
    ``catboost_model.predict_proba`` as a feature.

    Returns
    -------
    pd.DataFrame
        New frame with ``user_uid``, ``element_uid`` and ``catboost_score``
        (second column of ``predict_proba``); the input frame is not modified.
    """
    id_columns = ['user_uid', 'element_uid']
    feature_frame = interactions_featurized.drop(columns=id_columns)
    positive_proba = catboost_model.predict_proba(feature_frame)[:, 1].flatten()
    return interactions_featurized[id_columns].assign(catboost_score=positive_proba)


def fit_catboost(
    candidates: pd.DataFrame,
    test_transactions: pd.DataFrame,
    transactions: pd.DataFrame, 
    bookmarks: pd.DataFrame,
):
    """Train the CatBoost re-ranker on labelled ALS candidates.

    Parameters
    ----------
    candidates : pd.DataFrame
        Candidate pairs with ``user_uid``, ``element_uid`` and any candidate
        level features (everything except the two ids is used as a feature).
    test_transactions : pd.DataFrame
        Held-out interactions; a candidate is positive when its
        (user, element) pair appears here.
    transactions, bookmarks : pd.DataFrame
        Logs used to compute item and user features.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    print('Labeling candidates')
    # De-duplicate the positives first: a repeated (user, element) pair would
    # otherwise multiply the matching candidate row in the left merge.
    positive_pairs = (
        test_transactions[['user_uid', 'element_uid']]
        .drop_duplicates()
        .assign(label=1)
    )
    candidates_labeled = candidates.merge(
        positive_pairs,
        on=['user_uid', 'element_uid'],
        how='left',
    )

    # Candidates without a match in the held-out set are negatives.
    candidates_labeled['label'] = (
        candidates_labeled['label']
        .fillna(0)
        .astype('int32')
    )    
    
    positive_classes_rate = candidates_labeled['label'].sum() / len(candidates_labeled)
    print(f'Positive classes = {positive_classes_rate * 100:.2f}%')

    print('Extracting features for candidates')
    cb_features = enrich_interactions(candidates_labeled, transactions, bookmarks)
    del candidates_labeled
    
    cb_pool = Pool(
        cb_features.drop(columns=['user_uid', 'element_uid', 'label']), 
        cb_features['label']
    )
    del cb_features
    
    print('Training CatBoost')
    cb_cls = CatBoostClassifier(
        iterations=200,
        # Weights are listed for class 0 then class 1: each class is weighted
        # by the frequency of the other one, which up-weights the rarer class.
        class_weights=[positive_classes_rate, 1 - positive_classes_rate],
        eval_metric=BalancedAccuracy(),
        objective=Logloss()
    )
    
    cb_cls.fit(cb_pool, verbose=1)
    
    return cb_cls
    
def predict_catboost(
    candidates: pd.DataFrame, 
    transactions: pd.DataFrame, 
    bookmarks: pd.DataFrame,
    catboost_model: CatBoostClassifier
) -> pd.DataFrame:
    """Featurize candidate pairs and score them with the fitted model.

    Returns a frame with ``user_uid``, ``element_uid`` and ``catboost_score``.
    """
    featurized = enrich_interactions(candidates, transactions, bookmarks)

    print('Running CatBoost scoring')
    return score_interactions(featurized, catboost_model)

### Train/predict ALS and CatBoost

In [None]:
# Rank every user's transactions from newest (1) to oldest by `ts`;
# method='first' breaks ties by row order so ranks are unique per user.
transactions['split_rank_per_user'] = (
    transactions
    .groupby('user_uid')['ts']
    .rank(method='first', ascending=False)
    .astype('int32')
)

# The two most recent transactions of each user are held out as CatBoost
# labels; everything older is used to fit ALS.
is_holdout = transactions['split_rank_per_user'] <= 2

als_transactions_train = (
    transactions
    .loc[~is_holdout]
    .drop(columns=['split_rank_per_user'])
)

als_transactions_test = transactions.loc[is_holdout, ["element_uid", "user_uid"]]

In [None]:
# Stage 1 (training pass): generate ALS candidates from the older transactions only.
als_candidates_to_train_catboost = run_als(als_transactions_train, meta, ratings, bookmarks)

In [None]:
# Stage 2 (training pass): label the candidates with the held-out transactions and fit the ranker.
# NOTE(review): features are computed from the full `transactions`, which
# include the held-out rows - confirm this is intended.
catboost_model = fit_catboost(als_candidates_to_train_catboost, als_transactions_test, transactions, bookmarks)

In [None]:
# Release the intermediates of the training pass before the full run.
del als_transactions_train
del als_transactions_test
del als_candidates_to_train_catboost

# Inference pass: regenerate candidates from the complete transaction log
# (it still carries the `split_rank_per_user` column), then score them with
# the fitted CatBoost model.
als_candidates = run_als(transactions, meta, ratings, bookmarks)
catboost_prediction_full = predict_catboost(als_candidates, transactions, bookmarks, catboost_model)

In [None]:
# Keep the 10 highest-scored candidates per user: sort globally by score,
# then take the first 10 rows of every user group.
final_recommendations = (
    catboost_prediction_full
    .sort_values('catboost_score', ascending=False, ignore_index=True)
    .groupby('user_uid')
    .head(10)
)
# The full scored candidate set is no longer needed.
del catboost_prediction_full

In [None]:
# Display the selected recommendations (one row per user/element pair).
final_recommendations

In [None]:
# Collapse to one row per user with the recommended element ids as a list;
# rows keep their order inside each group, so lists stay sorted by score.
result_df = final_recommendations.groupby("user_uid", as_index=False).agg({"element_uid":list}).rename(columns={"element_uid":"recommended_element_uid"})

In [None]:
# Serialise each id list as a single space-separated string.
result_df["recommended_element_uid"] = [' '.join(map(str, l)) for l in result_df["recommended_element_uid"]]

In [None]:
# Display the per-user result table before writing it out.
result_df

In [None]:
# Write the result as a comma-separated file without the index column.
result_df.to_csv("my_recomendations.csv", sep=",", index=False)