In [36]:
import json
from pathlib import Path
from typing import Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from implicit.als import AlternatingLeastSquares

from catboost import (
    CatBoostRegressor, 
    Pool
)
from catboost.metrics import (
    MAE
)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

## Чтение модели

In [37]:
catboost = CatBoostRegressor().load_model('model/catboost.cbm')

## Чтение данных

In [38]:
def read_raw_data(dir_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, dict]:

    ratings = pd.read_csv(Path(dir_path) / 'train_ratings.csv')
    trainsactions = pd.read_csv(Path(dir_path) / 'train_transactions.csv')
    bookmarks = pd.read_csv(Path(dir_path) / 'train_bookmarks.csv')

    with open(Path(dir_path) / 'catalogue.json', 'r') as f:
        meta_data = json.load(f)

    return trainsactions, ratings, bookmarks, meta_data  

# convert meta dict representation to pd.DataFrame with 
# parsing of attribute and availability value list
def create_availability_columns(meta: pd.DataFrame) -> pd.DataFrame:
    for availability in ['purchase', 'rent', 'subscription']:
        bynary_list = []
        for i in range(len(meta)):
            if availability in meta['availability'][i]:
                bynary_list.append(1)
            else:
                bynary_list.append(0)
        meta[availability] = bynary_list
    meta = meta.drop(columns='availability')
    return meta 

def create_attributes_columns(meta: pd.DataFrame) -> pd.DataFrame:
    max_length_attributes = 0
    for i in meta['attributes']:
        if max_length_attributes < len(i):
            max_length_attributes = len(i)
    
    for i in range(1, max_length_attributes + 1):
        attribute_list = []
        for j in meta['attributes']:
            if i <= len(j):
                attribute_list.append(j[i - 1])
            else:
                attribute_list.append(0)
        meta[f'attribute{i}'] = attribute_list
    meta = meta.drop(columns=['attributes'])
    return meta

def meta_to_df(meta: dict) -> pd.DataFrame:
    meta = pd.DataFrame.from_dict(meta).T
    meta = meta.reset_index(names='element_uid')
    meta = create_availability_columns(meta)
    meta = create_attributes_columns(meta)
    return meta 

In [39]:
trainsactions, ratings, bookmarks, meta = read_raw_data(dir_path='data')
meta = meta_to_df(meta)

## Предобработка данных

In [40]:
def consumption_mode_encoder(trainsactions: pd.DataFrame) -> pd.DataFrame:
    encoder = LabelEncoder()
    numeric_consumption_mode = encoder.fit_transform(trainsactions['consumption_mode'].values)
    trainsactions['consumption_mode'] = numeric_consumption_mode.astype(np.int8)
    return trainsactions

def change_trainsaction_dtype(trainsactions: pd.DataFrame) -> pd.DataFrame:
    trainsactions = consumption_mode_encoder(trainsactions)
    trainsactions['ts'] = trainsactions['ts'].astype(np.float32)
    trainsactions['watched_time'] = trainsactions['watched_time'].astype(np.int32)
    trainsactions[['device_type', 'device_manufacturer']] = trainsactions[['device_type', 'device_manufacturer']].astype(np.int8)
    return trainsactions

def change_ratings_dtype(ratings: pd.DataFrame) -> pd.DataFrame:
    ratings['rating'] = ratings['rating'].astype(np.int8)
    ratings['ts'] = ratings['ts'].astype(np.float32)
    return ratings

def meta_type_encoder(meta: pd.DataFrame) -> pd.DataFrame:
    encoder = LabelEncoder()
    numeric_type = encoder.fit_transform(meta['type'].values)
    meta['type'] = numeric_type.astype(np.int8)
    return meta

def change_meta_dtype(meta: pd.DataFrame) -> pd.DataFrame:
    meta['element_uid'] = meta['element_uid'].astype(np.int64)
    meta = meta_type_encoder(meta)
    meta['duration'] = meta['duration'].astype(np.int16)
    meta[['feature_1', 'feature_2', 'feature_4', 'feature_5']] \
         = meta[['feature_1', 'feature_2', 'feature_4', 'feature_5']].astype(np.float32)
    meta['feature_3'] = meta['feature_3'].astype(np.int8)
    meta[['purchase', 'rent', 'subscription']] = meta[['purchase', 'rent', 'subscription']].astype(np.int8)
    meta.loc[:, 'attribute1':] = meta.loc[:, 'attribute1':].astype(np.int32)
    return meta

def preprocess_raw_data(trainsactions: pd.DataFrame, 
                        ratings: pd.DataFrame, 
                        meta: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    trainsactions = change_trainsaction_dtype(trainsactions)
    ratings = change_ratings_dtype(ratings)
    meta = change_meta_dtype(meta)

    return trainsactions, ratings, meta

In [41]:
trainsactions, ratings, meta = preprocess_raw_data(trainsactions, ratings, meta)

  meta.loc[:, 'attribute1':] = meta.loc[:, 'attribute1':].astype(np.int32)


## Создание user-features и item-features

In [42]:
def add_meta(meta: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    item_features = (
        item_features
        .merge(
            meta,
            how='inner',
            on='element_uid'
        )
    )
    return item_features

def add_content_popularity(trainsactions: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    item_features = (
        item_features
        .merge(
            trainsactions.groupby('element_uid').size().reset_index(name='element_occurences'),
            how='left',
            on='element_uid'
        )
    )
    item_features['popularity'] = (
        (item_features['element_occurences'] / item_features['element_uid'].nunique()).astype(np.float32)
    )
    item_features = item_features.drop(columns=['element_occurences'])
    
    return item_features

def add_count_content_bookmark(bookmarks: pd.DataFrame, item_features: pd.DataFrame) -> pd.DataFrame:
    bookmarks_per_item = (
        bookmarks
        .groupby('element_uid')
        .size()
        .reset_index(name='element_bookmark_count')
    )
    item_features = (
        item_features
        .merge(
            bookmarks_per_item[['element_uid', 'element_bookmark_count']],
            on='element_uid',
            how='left'
        )
    )
    item_features['element_bookmark_count'].fillna(0, inplace=True)
    item_features['element_bookmark_count'] = item_features['element_bookmark_count'].astype(np.int32)

    return item_features

def create_item_features(meta: pd.DataFrame, trainsactions: pd.DataFrame, bookmarks: pd.DataFrame) -> pd.DataFrame:
    item_features = pd.DataFrame(trainsactions['element_uid'].unique(), columns=['element_uid'])
    print("add meta")
    item_features = add_meta(meta, item_features)
    print("add content popularity")
    item_features = add_content_popularity(trainsactions, item_features)
    print("add count content bookmark")
    item_features = add_count_content_bookmark(bookmarks, item_features)
    return item_features

def add_favorite_device_type(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    favorite_device_type_per_user = (
        trainsactions
        .groupby(['user_uid', 'device_type'])
        .size()
        .groupby('user_uid')
        .idxmax()
    )
    favorite_device_type_per_user_df = pd.DataFrame([[i[0], i[1]] for i in favorite_device_type_per_user], 
                                                    columns=['user_uid', 'favorite_device_type'])
    user_features = (
        user_features
        .merge(
            favorite_device_type_per_user_df,
            how='left',
            on='user_uid'
        )
    )

    return user_features

def add_favorite_consumption_mode(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    favorite_consumption_mode_per_user = (
        trainsactions
        .groupby(['user_uid', 'consumption_mode'])
        .size()
        .groupby('user_uid')
        .idxmax()
    )
    favorite_consumption_mode_per_user_df = pd.DataFrame([[i[0], i[1]] for i in favorite_consumption_mode_per_user], 
                                                        columns=['user_uid', 'favorite_consumption_mode'])
    
    user_features = (
        user_features
        .merge(
            favorite_consumption_mode_per_user_df,
            how='left',
            on='user_uid'
        )
    )

    return user_features

def add_amount_watched_item(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    user_features = (
        user_features
        .merge(
            trainsactions.groupby('user_uid').size().reset_index(name='user_watch_count'),
            on='user_uid',
            how='left'
        )
    )

    return user_features

def add_mean_watched_time(trainsactions: pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame: 
    user_watch_time_mean = (
        trainsactions
        .groupby('user_uid')['watched_time']
        .mean()
        .reset_index(name='user_watch_time_mean')
    )

    user_features = (
        user_features
        .merge(
            user_watch_time_mean,
            on='user_uid',
            how='left'
        )
    )

    return user_features

def change_dtype(user_features: pd.DataFrame) -> pd.DataFrame:
    user_features[['favorite_device_type', 'favorite_consumption_mode']] = \
        user_features[['favorite_device_type', 'favorite_consumption_mode']].astype(np.int8)
    user_features['user_watch_count'] = user_features['user_watch_count'].astype(np.int32)
    user_features['user_watch_time_mean'] = user_features['user_watch_time_mean'].astype(np.float32)

    return user_features
    

def create_user_features(trainsactions: pd.DataFrame) -> pd.DataFrame:
    user_features = pd.DataFrame(trainsactions['user_uid'].unique(), columns=['user_uid'])
    print("add favorite device type")
    user_features = add_favorite_device_type(trainsactions, user_features)
    print("add favorite consumption mode")
    user_features = add_favorite_consumption_mode(trainsactions, user_features)
    print("add amount watched item")
    user_features = add_amount_watched_item(trainsactions, user_features)
    print("add mean watched time")
    user_features = add_mean_watched_time(trainsactions, user_features)
    print("change dtype")
    user_features = change_dtype(user_features)

    return user_features

def create_item_user_features(trainsactions: pd.DataFrame, meta: pd.DataFrame, bookmarks: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    user_features = create_user_features(trainsactions)
    item_features = create_item_features(meta, trainsactions, bookmarks)

    return user_features, item_features

In [43]:
user_features, item_features = create_item_user_features(trainsactions, meta, bookmarks)

add favorite device type
add favorite consumption mode
add amount watched item
add mean watched time
change dtype
add meta
add content popularity
add count content bookmark


## Обучение ALS

In [44]:
def score_transaction(t):
    score = 1
    if t['duration'] > 0:
        if t['type'] == 0:
            if t['watched_ratio'] > 0.3:
                score = int(t['watched_ratio'] * 9) + 1
        else:
            if t['watched_ratio'] > 1:
                score = int(t['watched_ratio'] / 2) + 1
    return min(score, 10)

def trainsactions_ratings_approximation(trainsactions: pd.DataFrame, meta: pd.DataFrame) -> pd.DataFrame:
    trainsactions_with_meta = (
        trainsactions
        .merge(meta[[
            'element_uid', 
            'duration',
            'type',
        ]], on='element_uid', how='left')
    )
    trainsactions_with_meta['watched_ratio'] = (
        trainsactions_with_meta['watched_time'] / 
        trainsactions_with_meta['duration']
    )
    print("score trainsaction")
    trainsactions_with_meta['score'] = trainsactions_with_meta.apply(score_transaction, axis=1)
    
    return trainsactions_with_meta    

def score_matrix(als_train_score: pd.DataFrame) -> pd.DataFrame:
    return als_train_score[['element_uid', 'user_uid', 'score']]

def merge_trainsactions_ratings(trainsactions_ratings: pd.DataFrame, ratings: pd.DataFrame) -> pd.DataFrame:
    trainsactions_ratings = (
        trainsactions_ratings
        .merge(
            ratings[['user_uid', 'element_uid', 'rating']],
            on=['user_uid', 'element_uid'],
            how='outer'
        )
    )

    trainsactions_ratings['score'].fillna(
        trainsactions_ratings['rating'], inplace=True)
    
    return trainsactions_ratings

def change_als_train_dtype(als_train: pd.DataFrame) -> pd.DataFrame:
    als_train['score'] = als_train['score'].astype(np.int8)
    als_train.drop(columns='rating', inplace=True)
    return als_train

def encode_tfidf_coo(als_train: pd.DataFrame) -> pd.DataFrame:
    score_sum_per_user = (
        als_train
        .groupby('user_uid')['score']
        .transform('sum')
    )
    user_count_per_element = (
        als_train
        .groupby('element_uid')['user_uid']
        .transform('size')
    )
    tf = als_train['score'].values / score_sum_per_user.values
    idf = np.log(len(score_sum_per_user) / user_count_per_element.values)
    
    tfidf = als_train[['user_uid', 'element_uid']].copy()
    tfidf['value'] = tf * idf

    return tfidf

def encode_tfidf(als_train: pd.DataFrame) -> Tuple[LabelEncoder, LabelEncoder, csr_matrix]:
    tfidf = encode_tfidf_coo(als_train)
    
    n_users = tfidf['user_uid'].nunique()
    n_elements = tfidf['element_uid'].nunique()

    user_encoder = LabelEncoder()
    element_encoder = LabelEncoder()
    user_index = user_encoder.fit_transform(als_train['user_uid'].values)
    element_index = element_encoder.fit_transform(als_train['element_uid'].values)

    tfidf_csr = csr_matrix(
        (
            tfidf['value'].astype('float32').values,
            (user_index, element_index)
        ),
        shape=(n_users, n_elements)
    )

    return user_encoder, element_encoder, tfidf_csr

def als_fit_predict(als_csr: csr_matrix):

    als = AlternatingLeastSquares(factors=128, iterations=30, calculate_training_loss=True)
    als.fit(als_csr)

    recommendations_matrix, recommendations_scores = als.recommend(
        np.arange(0, als_csr.shape[0]), 
        als_csr, 
        N=50, 
        filter_already_liked_items=True
    )

    return recommendations_matrix, recommendations_scores

def als_recommendations_to_df(
    recommendations_matrix: np.ndarray, 
    recommendations_scores: np.ndarray,
    user_encoder: LabelEncoder, 
    item_encoder: LabelEncoder,
    user_key = 'user_id',
    item_key = 'item_id'
) -> pd.DataFrame:
    recommendations_indices = pd.DataFrame({
        'user_index': np.arange(0, len(recommendations_matrix)),
        'item_index': list(recommendations_matrix),
        'score': list(recommendations_scores),
    })

    user_mapping = pd.DataFrame({
        'user_index': np.arange(0, len(user_encoder.classes_)),
        user_key: user_encoder.classes_,
    })

    item_mapping = pd.DataFrame({
        'item_index': np.arange(0, len(item_encoder.classes_)),
        item_key: item_encoder.classes_,
    })

    recommendations = (
        recommendations_indices
        .merge(
            user_mapping,
            on='user_index',
            how='left',
        )
        .drop(columns=['user_index'])
        .explode(['item_index', 'score'], ignore_index=True)
        .merge(
            item_mapping,
            on='item_index',
            how='left',
        )
        .drop(columns=['item_index'])
    )

    return recommendations

def run_als(trainsactions: pd.DataFrame, 
            meta: pd.DataFrame, 
            ratings: pd.DataFrame) -> pd.DataFrame:
    print("preprocess trainsactions")
    trainsactions = trainsactions_ratings_approximation(trainsactions, meta)
    trainsactions = score_matrix(trainsactions)
    trainsactions = merge_trainsactions_ratings(trainsactions, ratings)
    trainsactions = change_als_train_dtype(trainsactions)

    print("run TF-IDF")
    user_encoder, element_encoder, trainsactions_csr = encode_tfidf(trainsactions)

    print("run als")
    recommendations_matrix, recommendations_scores = als_fit_predict(trainsactions_csr)

    print("postprocess als predictions")
    recommendations = als_recommendations_to_df(
        recommendations_matrix,
        recommendations_scores,
        user_encoder,
        element_encoder,
        user_key='user_uid',
        item_key='element_uid',
    )

    return recommendations

In [45]:
als_prediction_full = run_als(trainsactions, meta, ratings)

preprocess trainsactions
score trainsaction
run TF-IDF
run als


  0%|          | 0/30 [00:00<?, ?it/s]

postprocess als predictions


## Инференс CatBoostRegressor

In [46]:
def change_recommendation_dtype(recommendations: pd.DataFrame) -> pd.DataFrame:
    recommendations['score'] = recommendations['score'].astype(np.float32)
    return recommendations
    
def get_important_feature(item_features:pd.DataFrame, user_features: pd.DataFrame) -> Tuple[list, list]:
    important_features = ['attribute26',
                        'attribute11',
                        'user_watch_count',
                        'element_bookmark_count',
                        'user_watch_time_mean',
                        'popularity',
                        'favorite_consumption_mode',
                        'attribute25',
                        'favorite_device_type',
                        'feature_5',
                        'attribute7',
                        'attribute1',
                        'feature_4',
                        'feature_3',
                        'attribute19',
                        'duration',
                        'feature_1',
                        'attribute5',
                        'attribute23']

    item_important_features = list(set(important_features) & set(item_features.columns))
    item_important_features.append('element_uid')

    user_important_feature = list(set(important_features) & set(user_features.columns))
    user_important_feature.append('user_uid')

    return user_important_feature, item_important_features

def merge_als_prediction_item_user_features(als_prediction_full: pd.DataFrame, 
                                            item_features: pd.DataFrame, 
                                            user_features: pd.DataFrame,
                                            user_important_feature: list,
                                            item_important_features: list) -> pd.DataFrame:

    gbt_features = (
        als_prediction_full
        .merge(
            item_features[item_important_features],
            on='element_uid',
            how='left'
        )
        .merge(
            user_features[user_important_feature],
            on='user_uid',
            how='left'
        )
    )
    return gbt_features

def run_catboost(als_predictions: pd.DataFrame, user_features: pd.DataFrame, item_features: pd.DataFrame, catboost: CatBoostRegressor) -> pd.DataFrame:
    print("preprocess")
    als_predictions = change_recommendation_dtype(als_predictions)

    print("important features")
    user_important_feature, item_important_features = get_important_feature(item_features, user_features)

    print("merge prediction, user and item features")
    gbt_features = merge_als_prediction_item_user_features(als_predictions, item_features, user_features, user_important_feature, item_important_features)

    interactions = gbt_features[['user_uid', 'element_uid']].copy()
    gbt_features_X = gbt_features.drop(columns=['score', 'user_uid', 'element_uid'])

    print("catboost scores")
    catboost_scores = catboost.predict(gbt_features_X)
    interactions['catboost_score'] = catboost_scores

    return interactions

In [47]:
catboost_prediction_full = run_catboost(als_prediction_full, user_features, item_features, catboost)

preprocess
important features
merge prediction, user and item features
catboost scores


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [48]:
catboost_prediction_full

Unnamed: 0,user_uid,element_uid,catboost_score
0,0,1650,0.037810
1,0,506,0.021798
2,0,603,0.035030
3,0,3757,0.037561
4,0,5665,0.038164
...,...,...,...
12742445,593489,6769,0.014798
12742446,593489,5612,0.022869
12742447,593489,214,0.012417
12742448,593489,7498,0.018434


In [49]:
final_recommendations = (
    catboost_prediction_full
    .sort_values('catboost_score', ascending=False, ignore_index=True)
    .groupby('user_uid')
    .head(10)
)

In [50]:
final_recommendations

Unnamed: 0,user_uid,element_uid,catboost_score
0,591854,143,0.383180
1,470776,143,0.383180
2,59211,143,0.367527
3,269045,143,0.363574
4,126480,143,0.363574
...,...,...,...
11915693,213478,7176,0.009953
11935463,96478,7176,0.009900
11953117,55046,2567,0.009851
11974384,55046,1612,0.009792


In [51]:
pre_solution = final_recommendations.copy()

In [52]:
pre_solution['element_uid'] = pre_solution['element_uid'].astype('string')

In [53]:
pre_solution = (
    pre_solution
    .groupby('user_uid')
    .agg({'element_uid': lambda x: " ".join(x)})
    .rename({'element_uid': 'recommended_element_uid'}, axis=1)
)

In [54]:
pre_solution

Unnamed: 0_level_0,recommended_element_uid
user_uid,Unnamed: 1_level_1
0,1277 2269 5812 1577 1326 4519 2694 5665 1650 6040
1,2880 4621 944 8919 2570 5266 5798 1349 7119 2694
3,5739 1017 5798 1799 3045 4026 1577 1570 3757 1326
7,6321 8011 3147 460 944 5904 2183 5612 8101 1326
8,3757 5328 702 5798 8888 1326 388 5201 8658 5665
...,...
593477,2855 402 6656 30 1217 944 3753 3329 3757 9992
593478,4621 452 944 8919 8756 2880 3502 3329 9992 6040
593482,2531 6656 6040 1650 5665 5904 10029 8632 3905 ...
593486,1364 2855 71 402 793 7677 8632 452 4978 1217


In [55]:
users_for_submission = pd.read_csv('data/users_for_submission.csv')

In [56]:
users = users_for_submission['user_uid'].values

In [57]:
solutions = pre_solution['recommended_element_uid'][users]

In [58]:
solutions

user_uid
5177       10029 452 813 8919 3757 2070 3502 5798 5328 5580
593316    2531 6656 6040 9788 4473 6321 5739 1577 5665 3757
262355    1364 8944 8101 9657 1577 3905 1326 2694 1017 9835
74296         2531 6656 2855 793 2024 143 8944 402 3911 452
340623      3329 2880 4473 9788 3837 4205 57 4519 4026 3905
                                ...                        
399709     3094 2014 6321 944 6040 3753 3502 3329 2070 7967
495866      7677 1535 6321 3753 4363 4507 9992 944 460 5266
409943     7677 2014 6321 4621 6040 944 8919 5266 2183 5798
348798    3256 7677 6606 10029 9992 4978 3753 236 3329 2276
167104     6321 4363 944 3147 6040 6388 3623 2183 7777 3578
Name: recommended_element_uid, Length: 254849, dtype: string

In [59]:
solutions.to_csv('solution/solution.csv') 