In [92]:
%pip install -U implicit seaborn==0.12.0

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [93]:
import json
from pathlib import Path
from typing import Tuple, Sequence, Set

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix

In [94]:
data_path = Path('./neymark-ml-recsys')

ratings = pd.read_csv(data_path / 'train_ratings.csv')
trainsactions = pd.read_csv(data_path / 'train_transactions.csv')
bookmarks = pd.read_csv(data_path / 'train_bookmarks.csv')

with open(data_path / 'catalogue.json', 'r') as f:
    meta_raw = json.load(f)

In [95]:
bookmarks["bookmark"] = 1

In [96]:
trainsactions = trainsactions.merge(bookmarks[[
        'element_uid', 
        'user_uid',
        'bookmark',
    ]], on=['element_uid', 'user_uid'], how='left')

In [97]:
trainsactions.fillna(value={"bookmark":0}, inplace=True)

In [98]:
trainsactions['split_rank_per_user'] = (
    trainsactions
    .groupby('user_uid')['ts']
    .rank('first', ascending=False)
    .astype('int32')
)

In [99]:
def meta_to_df(meta_raw):
    element_uid = []
    duration = []
    type_ = []
    attributes = []
    availability = []
    feature_1 = []
    feature_2 = []
    feature_3 = []
    feature_4 = []
    feature_5 = []
    for k, v in meta_raw.items():
        element_uid.append(int(k))
        duration.append(float(v['duration']) * 60)
        type_.append(v['type'])
        attributes.append(v["attributes"])
        availability.append(v["availability"])
        feature_1.append(v["feature_1"])
        feature_2.append(v["feature_2"])
        feature_3.append(v["feature_3"])
        feature_4.append(v["feature_4"])
        feature_5.append(v["feature_5"])
    meta = pd.DataFrame({
        'element_uid': element_uid,
        'duration': duration,
        'type': type_,
        # "attributes": attributes,
        # "availability": attributes,
        "feature_1": feature_1,
        "feature_2": feature_2,
        "feature_3": feature_3,
        "feature_4": feature_4,
        "feature_5": feature_5,
    })
    return meta

meta = meta_to_df(meta_raw)

In [100]:
def feature_normilize(feature:pd.Series) -> pd.Series:
    return (feature - feature.mean())/feature.std()

In [101]:
trainsactions_with_meta = (
    trainsactions
    .merge(meta[[
        'element_uid',
        'duration',
        'type',
        "feature_1",
        "feature_2",
        "feature_3",
        "feature_4",
        "feature_5",
    ]], on='element_uid', how='left')
)

trainsactions_with_meta["feature_1"] = feature_normilize(
    trainsactions_with_meta["feature_1"])
trainsactions_with_meta["feature_1"] = feature_normilize(
    trainsactions_with_meta["feature_2"])
trainsactions_with_meta["feature_3"] = feature_normilize(
    trainsactions_with_meta["feature_3"])
trainsactions_with_meta["feature_4"] = feature_normilize(
    trainsactions_with_meta["feature_4"])
trainsactions_with_meta["feature_5"] = feature_normilize(
    trainsactions_with_meta["feature_5"])

trainsactions_with_meta["main_feature"] = trainsactions_with_meta[["feature_1", "feature_2", "feature_3", "feature_4", "feature_5"]].mean(axis=1).round()


trainsactions_with_meta['watched_ratio'] = (
    trainsactions_with_meta['watched_time'] /
    trainsactions_with_meta['duration']
)


def score_transaction(t):
    score = 1
    if t['duration'] > 0:
        if t['type'] == 'movie':
            if t['watched_ratio'] > 0.3:
                score = int(t['watched_ratio'] * 9) + 1
        else:
            if t['watched_ratio'] > 1:
                score = int(t['watched_ratio'] / 2) + 1
    if t['bookmark'] > 0 and t['watched_ratio'] <= 0.3:
        score += 10
    return min(10, max(int(score+t["main_feature"]), 1))
    # return min(10, score)


trainsactions_with_meta['score'] = trainsactions_with_meta.apply(
    score_transaction, axis=1)

In [102]:
trainsactions_with_meta_and_ratings = (
    trainsactions_with_meta
    .merge(
        ratings[['user_uid', 'element_uid', 'rating']],
        on=['user_uid', 'element_uid'],
        how='outer'
    )
)

trainsactions_with_meta_and_ratings['score'].fillna(
    trainsactions_with_meta_and_ratings['rating'], inplace=True)

In [103]:
trainsactions_with_meta_and_ratings.drop(index=trainsactions_with_meta_and_ratings[trainsactions_with_meta_and_ratings["duration"].isna()].index, inplace=True)

In [104]:
from sklearn.preprocessing import LabelEncoder

def encode_tfidf_coo(transactions: pd.DataFrame) -> pd.DataFrame:
    score_sum_per_user = (
        transactions
        .groupby('user_uid')['score']
        .transform('sum')
    )
    user_count_per_element = (
        transactions
        .groupby('element_uid')['user_uid']
        .transform('size')
    )
    tf = transactions['score'].values / score_sum_per_user.values
    idf = np.log(len(score_sum_per_user) / user_count_per_element.values)
    
    tfidf = transactions[['user_uid', 'element_uid']].copy()
    tfidf['value'] = tf * idf

    return tfidf


def encode_tfidf(transactions: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    tfidf = encode_tfidf_coo(transactions)
    
    n_users = tfidf['user_uid'].nunique()
    n_elements = tfidf['element_uid'].nunique()

    user_encoder = LabelEncoder()
    element_encoder = LabelEncoder()
    user_index = user_encoder.fit_transform(transactions['user_uid'].values)
    element_index = element_encoder.fit_transform(transactions['element_uid'].values)

    tfidf_csr = csr_matrix(
        (
            tfidf['value'].astype('float32').values,
            (user_index, element_index)
        ),
        shape=(n_users, n_elements)
    )

    return user_encoder, element_encoder, tfidf_csr


user_encoder, element_encoder, transactions_csr = \
    encode_tfidf(trainsactions_with_meta_and_ratings)

In [105]:
from implicit.als import AlternatingLeastSquares

als = AlternatingLeastSquares(factors=128, iterations=100, alpha=40.0, calculate_training_loss=True)
als.fit(transactions_csr)

100%|██████████| 100/100 [20:57<00:00, 12.58s/it, loss=0.0104]


In [106]:
recommendations_matrix, recommendations_scores = als.recommend(
    np.arange(0, transactions_csr.shape[0]), 
    transactions_csr, 
    N=10, 
    filter_already_liked_items=True
)

In [107]:
def als_recommendations_to_df(
    recommendations_matrix: np.ndarray, 
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder, 
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    recommendations_indices = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    user_mapping = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })

    item_mapping = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    recommendations = (
        recommendations_indices
        .merge(
            user_mapping,
            on='user_index',
            how='left',
        )
        .drop(columns=['user_index'])
        .explode(['item_index', 'score'], ignore_index=True)
        .merge(
            item_mapping,
            on='item_index',
            how='left',
        )
        .drop(columns=['item_index'])
    )

    return recommendations


recommendations = als_recommendations_to_df(
    recommendations_matrix,
    recommendations_scores,
    user_encoder,
    element_encoder,
    user_key='user_uid',
    item_key='element_uid',
)

In [108]:
result_df = recommendations.groupby("user_uid", as_index=False).agg({"element_uid":list}).rename(columns={"element_uid":"recommended_element_uid"})

In [109]:
result_df["recommended_element_uid"] = [' '.join(map(str, l)) for l in result_df["recommended_element_uid"]]

In [110]:
result_df.to_csv("my_recomendations.csv", sep=",", index=False)