# Импорт данных

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from catboost import Pool, CatBoostClassifier
import numpy as np
import re
from string import punctuation
from sklearn.metrics import roc_curve, auc

# Загрузка данных

In [2]:
engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )

# Чтение данных таблицы user_data
query = "SELECT * FROM user_data"
user_data = pd.read_sql(query, engine)

# Чтение данных таблицы post_text_df
query = "SELECT * FROM post_text_df"
post_text_df = pd.read_sql(query, engine)

# Чтение ограниченного количества данных таблицы feed_data
query = "SELECT * FROM feed_data LIMIT 1000000"
feed_data = pd.read_sql(query, engine)

# Переименование столбцов идентификаторов
user_data = user_data.rename(columns={'id': 'user_id'})
post_text_df = post_text_df.rename(columns={'id': 'post_id'})

# Объединение таблиц
data = feed_data.merge(user_data, on='user_id', how='left')
data = data.merge(post_text_df, on='post_id', how='left')

In [3]:
data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-12-27 20:33:09,76148,2932,view,0,0,17,Russia,Khabarovsk,3,iOS,ads,Save your family from financial instability in...,covid
1,2021-12-27 20:35:50,76148,3170,view,0,0,17,Russia,Khabarovsk,3,iOS,ads,#CoronaInfoCH #COVID19 #corona \nYour saved po...,covid
2,2021-12-27 20:38:47,76148,5835,view,0,0,17,Russia,Khabarovsk,3,iOS,ads,I presume you are here for damage to your ment...,movie
3,2021-12-27 20:39:54,76148,9,view,0,0,17,Russia,Khabarovsk,3,iOS,ads,WorldCom director admits lying\n\nThe former c...,business
4,2021-12-27 20:41:33,76148,1370,view,0,0,17,Russia,Khabarovsk,3,iOS,ads,Peace demo appeal rejected\n\nPeace protestors...,politics


# Обработка временных меток

In [4]:
# Преобразование формата временных меток в объект datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Извлечение признаков из временных меток
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour_of_day'] = data['timestamp'].dt.hour

# Расчет времени с момента последнего действия для каждого пользователя
data = data.sort_values(['user_id', 'timestamp'])
data['time_since_last_action'] = data.groupby('user_id')['timestamp'].diff().dt.total_seconds()
data['time_since_last_action'].fillna(0, inplace=True)

# Удаление столбца временных меток
data = data.drop('timestamp', axis=1)

In [5]:
data.head()

Unnamed: 0,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic,day_of_week,hour_of_day,time_since_last_action
557,18358,2748,view,0,0,35,Russia,Bataysk,4,iOS,ads,👀 We see you! More and more of you are wearing...,covid,4,9,0.0
558,18358,1775,view,0,0,35,Russia,Bataysk,4,iOS,ads,Hewitt fights back to reach final\n\nLleyton H...,sport,4,9,87.0
559,18358,1639,view,0,0,35,Russia,Bataysk,4,iOS,ads,South Africa sweep top awards\n\nSouth Africas...,sport,4,9,128.0
560,18358,1211,view,0,0,35,Russia,Bataysk,4,iOS,ads,Jowell confirms casino climbdown\n\nTessa Jowe...,politics,4,9,26.0
561,18358,1116,view,0,0,35,Russia,Bataysk,4,iOS,ads,Blairs hope for Blunkett return\n\nThe events ...,politics,4,9,81.0


# Text features

In [8]:
# Количество слов в тексте
data['word_count'] = data['text'].apply(lambda x: len(x.split()))

# Длина текста
data['text_length'] = data['text'].apply(len)

# Средняя длина слова
data['avg_word_length'] = data['text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))

# Количество уникальных слов
data['unique_word_count'] = data['text'].apply(lambda x: len(set(x.split())))

# Количество предложений
data['sentence_count'] = data['text'].apply(lambda x: x.count('.') + x.count('?') + x.count('!'))


# Обучение модели CatBoost

## Train-test split

Этот код формирует выборку данных с заданными признаками, выбирая топ-k признаков с использованием взаимной информации, без утечки данных, временной метки, 'action' и 'text'. Затем данные разбиваются на обучающую и тестовую выборки с заданным отношением размеров, и рандомным состоянием генератора псевдослучайных чисел.

In [9]:
X = data.drop(['target', 'action', 'text'], axis=1)

In [10]:
X

Unnamed: 0,user_id,post_id,gender,age,country,city,exp_group,os,source,topic,day_of_week,hour_of_day,time_since_last_action,word_count,text_length,avg_word_length,unique_word_count,sentence_count
557,18358,2748,0,35,Russia,Bataysk,4,iOS,ads,covid,4,9,0.0,26,138,4.346154,26,3
558,18358,1775,0,35,Russia,Bataysk,4,iOS,ads,sport,4,9,87.0,416,2355,4.651442,251,24
559,18358,1639,0,35,Russia,Bataysk,4,iOS,ads,sport,4,9,128.0,213,1266,4.934272,152,10
560,18358,1211,0,35,Russia,Bataysk,4,iOS,ads,politics,4,9,26.0,389,2296,4.892031,240,17
561,18358,1116,0,35,Russia,Bataysk,4,iOS,ads,politics,4,9,81.0,249,1423,4.702811,164,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884027,160563,5692,1,15,Russia,Nazyvayevsk,0,Android,organic,movie,2,21,2.0,524,2926,4.585878,314,35
884028,160563,1440,1,15,Russia,Nazyvayevsk,0,Android,organic,sport,2,21,110.0,373,1983,4.308311,213,29
884029,160563,1440,1,15,Russia,Nazyvayevsk,0,Android,organic,sport,2,21,158.0,373,1983,4.308311,213,29
884030,160563,5072,1,15,Russia,Nazyvayevsk,0,Android,organic,movie,2,21,2.0,138,710,4.152174,85,9


In [11]:
# Выборка целевой переменной
y = data['target']

# Убедиться, что каждый столбец уникален
X = X.loc[:,~X.columns.duplicated()]

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Целевой кодировщик с предварительным сглаживанием.
Источник: https://towardsdatascience.com/dealing-with-categorical-variables-by-using-target-encoder-a0f1733a4c69

## Обучение модели на Precision@5 

Мы создаем группы данных на основе идентификатора пользователя 'user_id', чтобы иметь возможность проводить обучение с учетом группировки данных. Затем мы сортируем данные по группам и создаем объекты Pool для обучения и тестирования с колонкой 'group_id', которые затем будут использоваться для обучения модели и оценки ее производительности.

In [13]:
from catboost import CatBoostClassifier, Pool

# Замените названия столбцов на соответствующие вашим данным
categorical_columns = ['country', 'topic', 'city', 'gender', 'os', 'source']
cat_features = [X_train.columns.get_loc(col) for col in categorical_columns]

# Создание train_pool и test_pool с указанием категориальных столбцов
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

precision_model = CatBoostClassifier(iterations=1000,
                                     learning_rate=0.1,
                                     depth=6,
                                     custom_metric='AUC',
                                     eval_metric='AUC',
                                     random_seed=42,
                                     verbose=100)

precision_model.fit(train_pool, eval_set=test_pool)


0:	test: 0.5507996	best: 0.5507996 (0)	total: 295ms	remaining: 4m 54s
100:	test: 0.6490722	best: 0.6490722 (100)	total: 12.5s	remaining: 1m 51s
200:	test: 0.6524111	best: 0.6524252 (197)	total: 26.3s	remaining: 1m 44s
300:	test: 0.6535627	best: 0.6535819 (287)	total: 40.7s	remaining: 1m 34s
400:	test: 0.6541463	best: 0.6541631 (374)	total: 54.6s	remaining: 1m 21s
500:	test: 0.6545997	best: 0.6546233 (498)	total: 1m 8s	remaining: 1m 8s
600:	test: 0.6548925	best: 0.6549070 (593)	total: 1m 23s	remaining: 55.3s
700:	test: 0.6549918	best: 0.6550738 (665)	total: 1m 37s	remaining: 41.7s
800:	test: 0.6552291	best: 0.6552667 (782)	total: 1m 52s	remaining: 27.9s
900:	test: 0.6554919	best: 0.6555436 (861)	total: 2m 6s	remaining: 13.9s
999:	test: 0.6556446	best: 0.6556809 (993)	total: 2m 20s	remaining: 0us

bestTest = 0.6556808818
bestIteration = 993

Shrink model to first 994 iterations.


<catboost.core.CatBoostClassifier at 0x1800513a1a0>

In [14]:
# Создание ID группы на основе столбца 'user_id'
unique_user_ids = X_train['user_id'].unique()
group_id_dict = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
X_train['group_id'] = X_train['user_id'].map(group_id_dict)
X_test['group_id'] = X_test['user_id'].map(group_id_dict)

# Сортировка наборов данных для обучения и тестирования по 'group_id'
X_train = X_train.sort_values(by='group_id')
y_train = y_train.loc[X_train.index]

X_test = X_test.sort_values(by='group_id')
y_test = y_test.loc[X_test.index]

# Убедитесь, что категориальные переменные представлены в виде строк
categorical_columns = ['country', 'topic', 'city', 'gender', 'os', 'source']
X_train[categorical_columns] = X_train[categorical_columns].astype(str)
X_test[categorical_columns] = X_test[categorical_columns].astype(str)

# Получение индексов категориальных столбцов
cat_features = [X_train.drop(columns=['user_id']).columns.get_loc(col) for col in categorical_columns]

# Создание объектов Pool для обучающей и тестовой выборок с колонкой 'group_id' и категориальными признаками
train_pool = Pool(X_train.drop(columns=['user_id']), y_train, cat_features=cat_features, group_id=X_train['group_id'])
test_pool = Pool(X_test.drop(columns=['user_id']), y_test, cat_features=cat_features, group_id=X_test['group_id'])


In [15]:
# Обучение модели CatBoost с использованием метрики PrecisionAt:top=5
from catboost import CatBoostClassifier

precision_model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           custom_metric='PrecisionAt:top=5',
                           eval_metric='PrecisionAt:top=5',
                           random_seed=42,
                           verbose=100)

precision_model.fit(train_pool, eval_set=test_pool)


0:	learn: 0.0281716	test: 0.0472235	best: 0.0472235 (0)	total: 144ms	remaining: 2m 24s
100:	learn: 0.2057836	test: 0.1748950	best: 0.1779748 (77)	total: 13.5s	remaining: 2m
200:	learn: 0.2319963	test: 0.1773215	best: 0.1782548 (161)	total: 28.3s	remaining: 1m 52s
300:	learn: 0.2522388	test: 0.1789081	best: 0.1797480 (297)	total: 43.2s	remaining: 1m 40s
400:	learn: 0.2688433	test: 0.1790014	best: 0.1797480 (297)	total: 59.1s	remaining: 1m 28s
500:	learn: 0.2833955	test: 0.1808679	best: 0.1808679 (500)	total: 1m 15s	remaining: 1m 14s
600:	learn: 0.2996269	test: 0.1801213	best: 0.1814279 (559)	total: 1m 30s	remaining: 1m
700:	learn: 0.3096082	test: 0.1765749	best: 0.1814279 (559)	total: 1m 45s	remaining: 45.1s
800:	learn: 0.3242537	test: 0.1776948	best: 0.1814279 (559)	total: 2m 1s	remaining: 30.1s
900:	learn: 0.3357276	test: 0.1784414	best: 0.1814279 (559)	total: 2m 16s	remaining: 15s
999:	learn: 0.3429104	test: 0.1786281	best: 0.1814279 (559)	total: 2m 31s	remaining: 0us

bestTest = 0.1

<catboost.core.CatBoostClassifier at 0x180051382e0>

# Сохранение и загрузка модели CatBoost

In [16]:
precision_model.save_model('catboost_precision_model.cbm')

In [17]:
from catboost import CatBoostClassifier

# Загрузка сохраненной модели
loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_precision_model.cbm')

# Предсказание с использованием загруженной модели на тестовом наборе данных
predictions = loaded_model.predict(test_pool)

# Оценка загруженной модели на тестовом наборе данных
score = loaded_model.score(test_pool)
print("Accuracy:", score)

# Вычисление других метрик, если это необходимо
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.89395
Precision: 0.7991804651093022
Recall: 0.89395
F1-score: 0.8439129665513874


## Обучение модели на Recall@5 

In [18]:
recall_model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           custom_metric='RecallAt:top=5',
                           eval_metric='RecallAt:top=5',
                           random_seed=42,
                           verbose=100)

recall_model.fit(train_pool, eval_set=test_pool)


0:	learn: 0.0054494	test: 0.0370958	best: 0.0370958 (0)	total: 151ms	remaining: 2m 30s
100:	learn: 0.0338957	test: 0.1229819	best: 0.1247014 (87)	total: 13.2s	remaining: 1m 57s
200:	learn: 0.0375115	test: 0.1240621	best: 0.1250621 (175)	total: 27.8s	remaining: 1m 50s
300:	learn: 0.0409376	test: 0.1243282	best: 0.1253606 (280)	total: 42.6s	remaining: 1m 39s
400:	learn: 0.0438037	test: 0.1252105	best: 0.1253606 (280)	total: 57.7s	remaining: 1m 26s
500:	learn: 0.0461144	test: 0.1266023	best: 0.1266023 (500)	total: 1m 12s	remaining: 1m 12s
600:	learn: 0.0486939	test: 0.1259673	best: 0.1271979 (555)	total: 1m 28s	remaining: 58.7s
700:	learn: 0.0503317	test: 0.1239812	best: 0.1271979 (555)	total: 1m 43s	remaining: 44.2s
800:	learn: 0.0525207	test: 0.1244041	best: 0.1271979 (555)	total: 1m 59s	remaining: 29.7s
900:	learn: 0.0543710	test: 0.1256981	best: 0.1271979 (555)	total: 2m 15s	remaining: 14.8s
999:	learn: 0.0552448	test: 0.1261071	best: 0.1271979 (555)	total: 2m 30s	remaining: 0us

best

<catboost.core.CatBoostClassifier at 0x1800513aad0>

In [19]:
recall_model.save_model('catboost_recall_model.cbm')

## Обучение модели на MAP@5 

In [20]:
model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           custom_metric='PFound:top=5',
                           eval_metric='PFound:top=5',
                           random_seed=42,
                           verbose=100)

model.fit(train_pool, eval_set=test_pool)


0:	test: 0.1555964	best: 0.1555964 (0)	total: 134ms	remaining: 2m 14s
100:	test: 0.4820267	best: 0.4876080 (77)	total: 12.9s	remaining: 1m 54s
200:	test: 0.4850190	best: 0.4888809 (193)	total: 26.9s	remaining: 1m 46s
300:	test: 0.4907975	best: 0.4928924 (297)	total: 40.9s	remaining: 1m 34s
400:	test: 0.4931006	best: 0.4932268 (386)	total: 55.2s	remaining: 1m 22s
500:	test: 0.4959336	best: 0.4961388 (498)	total: 1m 9s	remaining: 1m 9s
600:	test: 0.4932541	best: 0.4965973 (501)	total: 1m 24s	remaining: 56.1s
700:	test: 0.4904980	best: 0.4965973 (501)	total: 1m 39s	remaining: 42.3s
800:	test: 0.4900045	best: 0.4965973 (501)	total: 1m 53s	remaining: 28.3s
900:	test: 0.4917793	best: 0.4965973 (501)	total: 2m 8s	remaining: 14.1s
999:	test: 0.4922730	best: 0.4965973 (501)	total: 2m 23s	remaining: 0us

bestTest = 0.4965973052
bestIteration = 501

Shrink model to first 502 iterations.


<catboost.core.CatBoostClassifier at 0x18005139f90>

In [21]:
model.save_model('catboost_MAP_model.cbm')

- bestTest = 0.4451217663 for MAP@5
- bestTest = 0.1603712149 for Recall@5
- bestTest = 0.1660869565 for Precision@5

# Сравнение моделей

In [22]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

models = {
    'recall': 'catboost_recall_model.cbm',
    'MAP': 'catboost_MAP_model.cbm',
    'precision': 'catboost_precision_model.cbm',
}

metrics = {}

for model_name, model_path in models.items():
    loaded_model = CatBoostClassifier()
    loaded_model.load_model(model_path)

    predictions = loaded_model.predict(test_pool)

    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    metrics[model_name] = {
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
    }

# Вывод метрик для каждой модели
for model_name, model_metrics in metrics.items():
    print(f"{model_name} model:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
    print()


recall model:
  Precision: 0.7992
  Recall: 0.8940
  F1-score: 0.8439

MAP model:
  Precision: 0.7992
  Recall: 0.8940
  F1-score: 0.8439

precision model:
  Precision: 0.7992
  Recall: 0.8940
  F1-score: 0.8439



Полюбому можно обработку текста опустить, она выдаёт хуже результаты


Для выбора модели, которая будет оцениваться по Hitrate@5, нужно посмотреть на метрику PrecisionAt:top=5 для каждой модели. Чем выше PrecisionAt:top=5, тем лучше модель справляется с задачей рекомендации топ-5 элементов.

Из предоставленных результатов, мы видим следующую картину:

- recall model: Precision: 0.8522
- MAP model: Precision: 0.8257
- precision model: Precision: 0.7992

Исходя из этой информации, лучшей моделью для оценки по метрике Hitrate@5 будет модель recall, так как у нее наивысшая точность (Precision) среди всех моделей. Мы будем использовать эту модель для рекомендации топ-5 элементов в вашей задаче.