In [1]:
from typing import Tuple
from pathlib import Path
import pandas as pd
import numpy as np

# Обучение CatBoostRegressor

## Предобработка данных

In [2]:
def read_data(dir_path: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    item_features = pd.read_parquet(Path(dir_path) / 'item_features.parquet')
    user_features = pd.read_parquet(Path(dir_path) / 'user_features.parquet')
    recommendations = pd.read_parquet(Path(dir_path) / 'recommendations.parquet')

    return item_features, user_features, recommendations 

In [3]:
item_features, user_features, recommendations = read_data('cb_data')

### Изменение типов данных матрицы recommendations

In [4]:
recommendations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12742450 entries, 0 to 12742449
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   score        float32
 1   user_uid     int64  
 2   element_uid  int64  
dtypes: float32(1), int64(2)
memory usage: 340.3 MB


In [5]:
def change_recommendation_dtype(recommendations: pd.DataFrame) -> pd.DataFrame:
    recommendations['score'] = recommendations['score'].astype(np.float32)
    return recommendations

In [6]:
recommendations = change_recommendation_dtype(recommendations)

### Feature Importance

In [7]:
from sklearn.model_selection import train_test_split
from catboost import (
    CatBoostRegressor, 
    Pool
)
from catboost.metrics import (
    MAE, 
)

def create_small_dataset(recommendations: pd.DataFrame, item_features:pd.DataFrame, user_features: pd.DataFrame) -> pd.DataFrame:
    recommendations['user_rank'] = (
        recommendations
        .groupby('user_uid')['element_uid']
        .rank('first', ascending=False)
        .astype(np.int8)
    )
    small_recommendations = (
        recommendations
        .query('user_rank >= 48')
        .drop(columns='user_rank')
    )
    small_dataset = (
        small_recommendations
        .merge(
            item_features,
            on='element_uid',
            how='inner'
        )
        .merge(
            user_features,
            on='user_uid',
            how='inner'
        )
    )  
    recommendations.drop(columns='user_rank', inplace=True)

    return small_dataset

def get_list_features_importance(small_dataset: pd.DataFrame) -> list:
    features_importance_train, features_importance_test = train_test_split(
        small_dataset.drop(columns=['user_uid', 'element_uid']), 
        test_size=0.2, 
        random_state=777
    )

    features_importance_train_X = features_importance_train.drop(columns=['score'])
    features_importance_train_y = features_importance_train['score']

    features_importance_test_X = features_importance_test.drop(columns=['score'])
    features_importance_test_y = features_importance_test['score']


    cb_pool_train = Pool(features_importance_train_X, features_importance_train_y)
    cb_pool_test = Pool(features_importance_test_X, features_importance_test_y)

    cb_reg = CatBoostRegressor(
        iterations=550,
        eval_metric=MAE(),
    )

    cb_reg.fit(cb_pool_train, eval_set=cb_pool_test, verbose=1)
    
    list_feature_importance = list(sorted(zip(features_importance_train_X.columns, cb_reg.feature_importances_), key=lambda x: -x[1]))

    return list_feature_importance

    
def get_important_feature(recommendations: pd.DataFrame, item_features:pd.DataFrame, user_features: pd.DataFrame) -> Tuple[list, list]:
    small_dataset = create_small_dataset(recommendations, item_features, user_features)
    list_feature_importance = get_list_features_importance(small_dataset)

    important_features = [i[0] for i in list_feature_importance if i[1] > 0.5]
    item_important_features = list(set(important_features) & set(item_features.columns))
    item_important_features.append('element_uid')

    user_important_feature = list(set(important_features) & set(user_features.columns))
    user_important_feature.append('user_uid')

    return user_important_feature, item_important_features, list_feature_importance
    

In [8]:
user_important_feature, item_important_features, l = get_important_feature(recommendations, item_features, user_features)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.200796
0:	learn: 0.0148674	test: 0.0147313	best: 0.0147313 (0)	total: 213ms	remaining: 1m 57s
1:	learn: 0.0143688	test: 0.0142301	best: 0.0142301 (1)	total: 328ms	remaining: 1m 29s
2:	learn: 0.0139656	test: 0.0138339	best: 0.0138339 (2)	total: 434ms	remaining: 1m 19s
3:	learn: 0.0137477	test: 0.0136224	best: 0.0136224 (3)	total: 540ms	remaining: 1m 13s
4:	learn: 0.0135722	test: 0.0134513	best: 0.0134513 (4)	total: 631ms	remaining: 1m 8s
5:	learn: 0.0133983	test: 0.0132816	best: 0.0132816 (5)	total: 745ms	remaining: 1m 7s
6:	learn: 0.0132491	test: 0.0131389	best: 0.0131389 (6)	total: 850ms	remaining: 1m 5s
7:	learn: 0.0131701	test: 0.0130618	best: 0.0130618 (7)	total: 963ms	remaining: 1m 5s
8:	learn: 0.0130793	test: 0.0129748	best: 0.0129748 (8)	total: 1.07s	remaining: 1m 4s
9:	learn: 0.0130426	test: 0.0129387	best: 0.0129387 (9)	total: 1.18s	remaining: 1m 3s
10:	learn: 0.0130132	test: 0.0129098	best: 0.0129098 (10)	total: 1.27s	remaining: 1m 2s
11:	learn: 0.01296

In [9]:
l

[('attribute26', 25.37755139287477),
 ('attribute11', 21.35552414892559),
 ('user_watch_count', 13.823938389118634),
 ('element_bookmark_count', 11.373007583153779),
 ('user_watch_time_mean', 5.316022906406965),
 ('popularity', 3.4152188991066343),
 ('favorite_consumption_mode', 1.8327077191723746),
 ('attribute25', 1.815406916227496),
 ('favorite_device_type', 1.610934949980072),
 ('feature_5', 1.2621646811433074),
 ('attribute7', 1.2506118875147285),
 ('attribute1', 1.0984442445880769),
 ('feature_4', 1.0771067147331002),
 ('feature_3', 1.0498176729681656),
 ('attribute19', 0.925567555842694),
 ('duration', 0.7955863080835943),
 ('feature_1', 0.7597805084030292),
 ('attribute5', 0.5850904468022486),
 ('attribute23', 0.5792619938088328),
 ('attribute15', 0.49469413985870886),
 ('feature_2', 0.48369491504058976),
 ('attribute9', 0.384467791787618),
 ('attribute24', 0.37233515240226883),
 ('attribute4', 0.30321585068205975),
 ('attribute16', 0.29768923067014924),
 ('attribute27', 0.2758

In [19]:
[i[0] for i in l if i[1] > 0.5]

['attribute26',
 'attribute11',
 'user_watch_count',
 'element_bookmark_count',
 'user_watch_time_mean',
 'popularity',
 'favorite_consumption_mode',
 'attribute25',
 'favorite_device_type',
 'feature_5',
 'attribute7',
 'attribute1',
 'feature_4',
 'feature_3',
 'attribute19',
 'duration',
 'feature_1',
 'attribute5',
 'attribute23']

In [11]:
user_important_feature

['user_watch_count',
 'favorite_consumption_mode',
 'user_watch_time_mean',
 'favorite_device_type',
 'user_uid']

In [12]:
item_important_features

['attribute7',
 'attribute25',
 'attribute26',
 'element_bookmark_count',
 'feature_5',
 'attribute1',
 'feature_1',
 'attribute23',
 'duration',
 'attribute19',
 'feature_3',
 'popularity',
 'attribute5',
 'attribute11',
 'feature_4',
 'element_uid']

### Соединение матриц user_features, item_features, recommendations

In [13]:
gbt_features = (
    recommendations
    .merge(
        item_features[item_important_features],
        on='element_uid',
        how='left'
    )
    .merge(
        user_features[user_important_feature],
        on='user_uid',
        how='left'
    )
)

In [14]:
gbt_features

Unnamed: 0,score,user_uid,element_uid,attribute7,attribute25,attribute26,element_bookmark_count,feature_5,attribute1,feature_1,...,attribute19,feature_3,popularity,attribute5,attribute11,feature_4,user_watch_count,favorite_consumption_mode,user_watch_time_mean,favorite_device_type
0,0.089123,0,1650,42,0,0,340,0.680410,143,41661080.0,...,0,13,1.307730,7,131,1.140273,92,2,4227.402344,0
1,0.071904,0,603,1581,0,0,1713,0.680410,4865,42346732.0,...,21,26,1.112551,437,42,1.141929,92,2,4227.402344,0
2,0.066893,0,5665,22136,0,0,1409,0.654707,214,42798304.0,...,0,16,1.406348,7,131,1.141929,92,2,4227.402344,0
3,0.057513,0,3790,308,0,0,556,0.680410,549,42318744.0,...,0,17,0.862733,7,131,1.141929,92,2,4227.402344,0
4,0.056760,0,3757,42,0,0,789,0.654707,144,41241096.0,...,0,16,1.711049,7,17,1.140273,92,2,4227.402344,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12742445,0.012662,593489,7726,246,0,0,165,0.449667,9937,16859362.0,...,21,24,0.885389,9941,1320,1.131807,25,2,3480.040039,0
12742446,0.012458,593489,1692,10,0,0,89,0.449667,10052,42962180.0,...,0,15,0.202568,270,17,1.140273,25,2,3480.040039,0
12742447,0.012314,593489,719,1383,0,0,151,0.654707,10043,38036868.0,...,0,20,0.600315,270,17,1.140273,25,2,3480.040039,0
12742448,0.012259,593489,5515,43,0,0,115,0.654707,18163,36077908.0,...,0,30,0.521323,270,18,1.140273,25,2,3480.040039,0


### Разбиение gbt_features на test и train

In [15]:
gbt_features_train, gbt_features_test = train_test_split(
    gbt_features.drop(columns=['user_uid', 'element_uid']), 
    test_size=0.2, 
    random_state=777
)

gbt_features_train_X = gbt_features_train.drop(columns=['score'])
gbt_features_train_y = gbt_features_train['score']

gbt_features_test_X = gbt_features_test.drop(columns=['score'])
gbt_features_test_y = gbt_features_test['score']

In [16]:
gbt_features_train_X.shape

(10193960, 19)

## Обучение CatBoostRegressor

In [20]:
cb_pool_train = Pool(gbt_features_train_X, gbt_features_train_y)
cb_pool_test = Pool(gbt_features_test_X, gbt_features_test_y)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [21]:
cb_reg = CatBoostRegressor(
    iterations=550,
    eval_metric=MAE(),
)

cb_reg.fit(cb_pool_train, eval_set=cb_pool_test, verbose=1)

Learning rate set to 0.312309
0:	learn: 0.0124507	test: 0.0124302	best: 0.0124302 (0)	total: 1.89s	remaining: 17m 15s
1:	learn: 0.0122209	test: 0.0122010	best: 0.0122010 (1)	total: 3.2s	remaining: 14m 36s
2:	learn: 0.0119279	test: 0.0119087	best: 0.0119087 (2)	total: 4.55s	remaining: 13m 49s
3:	learn: 0.0117610	test: 0.0117432	best: 0.0117432 (3)	total: 5.74s	remaining: 13m 2s
4:	learn: 0.0116260	test: 0.0116097	best: 0.0116097 (4)	total: 6.93s	remaining: 12m 35s
5:	learn: 0.0115051	test: 0.0114886	best: 0.0114886 (5)	total: 8.3s	remaining: 12m 32s
6:	learn: 0.0114307	test: 0.0114146	best: 0.0114146 (6)	total: 9.59s	remaining: 12m 23s
7:	learn: 0.0113462	test: 0.0113304	best: 0.0113304 (7)	total: 10.9s	remaining: 12m 16s
8:	learn: 0.0112996	test: 0.0112841	best: 0.0112841 (8)	total: 12.4s	remaining: 12m 23s
9:	learn: 0.0112561	test: 0.0112412	best: 0.0112412 (9)	total: 13.5s	remaining: 12m 10s
10:	learn: 0.0112200	test: 0.0112051	best: 0.0112051 (10)	total: 14.9s	remaining: 12m 11s
11:

<catboost.core.CatBoostRegressor at 0x7f75bffed3a0>

In [22]:
cb_reg.save_model('model/catboost.cbm')

## Обучение CatBoostRegressor на всем датасете

In [23]:
gbt_features

Unnamed: 0,score,user_uid,element_uid,attribute7,attribute25,attribute26,element_bookmark_count,feature_5,attribute1,feature_1,...,attribute19,feature_3,popularity,attribute5,attribute11,feature_4,user_watch_count,favorite_consumption_mode,user_watch_time_mean,favorite_device_type
0,0.089123,0,1650,42,0,0,340,0.680410,143,41661080.0,...,0,13,1.307730,7,131,1.140273,92,2,4227.402344,0
1,0.071904,0,603,1581,0,0,1713,0.680410,4865,42346732.0,...,21,26,1.112551,437,42,1.141929,92,2,4227.402344,0
2,0.066893,0,5665,22136,0,0,1409,0.654707,214,42798304.0,...,0,16,1.406348,7,131,1.141929,92,2,4227.402344,0
3,0.057513,0,3790,308,0,0,556,0.680410,549,42318744.0,...,0,17,0.862733,7,131,1.141929,92,2,4227.402344,0
4,0.056760,0,3757,42,0,0,789,0.654707,144,41241096.0,...,0,16,1.711049,7,17,1.140273,92,2,4227.402344,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12742445,0.012662,593489,7726,246,0,0,165,0.449667,9937,16859362.0,...,21,24,0.885389,9941,1320,1.131807,25,2,3480.040039,0
12742446,0.012458,593489,1692,10,0,0,89,0.449667,10052,42962180.0,...,0,15,0.202568,270,17,1.140273,25,2,3480.040039,0
12742447,0.012314,593489,719,1383,0,0,151,0.654707,10043,38036868.0,...,0,20,0.600315,270,17,1.140273,25,2,3480.040039,0
12742448,0.012259,593489,5515,43,0,0,115,0.654707,18163,36077908.0,...,0,30,0.521323,270,18,1.140273,25,2,3480.040039,0


In [24]:
cb_reg_full = CatBoostRegressor(
    iterations=550,
    eval_metric=MAE(),
)

gbt_features_X = gbt_features.drop(columns=['user_uid', 'element_uid', 'score'])
gbt_features_y = gbt_features['score']

cb_pool = Pool(
    gbt_features_X,
    gbt_features_y
)

cb_reg_full.fit(cb_pool)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.296411
0:	learn: 0.0124693	total: 3.25s	remaining: 29m 45s
1:	learn: 0.0122579	total: 4.83s	remaining: 22m 4s
2:	learn: 0.0119581	total: 6.8s	remaining: 20m 40s
3:	learn: 0.0118153	total: 8.68s	remaining: 19m 44s
4:	learn: 0.0116549	total: 10.8s	remaining: 19m 34s
5:	learn: 0.0115348	total: 12.7s	remaining: 19m 13s
6:	learn: 0.0114289	total: 14.9s	remaining: 19m 16s
7:	learn: 0.0113587	total: 16.7s	remaining: 18m 54s
8:	learn: 0.0113046	total: 18.5s	remaining: 18m 34s
9:	learn: 0.0112588	total: 20.4s	remaining: 18m 22s
10:	learn: 0.0112172	total: 22.4s	remaining: 18m 17s
11:	learn: 0.0111695	total: 24.3s	remaining: 18m 8s
12:	learn: 0.0111461	total: 25.9s	remaining: 17m 48s
13:	learn: 0.0111136	total: 27.7s	remaining: 17m 42s
14:	learn: 0.0110906	total: 29.2s	remaining: 17m 22s
15:	learn: 0.0110672	total: 31.4s	remaining: 17m 29s
16:	learn: 0.0110460	total: 33.3s	remaining: 17m 25s
17:	learn: 0.0110260	total: 35.4s	remaining: 17m 26s
18:	learn: 0.0110098	total: 3

<catboost.core.CatBoostRegressor at 0x7f75bffed2e0>

In [26]:
cb_reg_full.save_model('model/catboost_full.cbm')

: 