# Импорт библиотек

In [1]:
import os
import pickle
import re
from string import punctuation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from catboost import Pool, CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine

# Загрузка данных

In [2]:
# The connection string embeds the database password, so it is read from the
# environment instead of being committed with the notebook. Expected format:
#   postgresql://<user>:<password>@postgres.lab.karpov.courses:6432/startml
database_url = os.environ.get("STARTML_DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the STARTML_DATABASE_URL environment variable to the PostgreSQL "
        "connection string before running this notebook."
    )
engine = create_engine(database_url)

# Read the whole user_data table
query = "SELECT * FROM user_data"
user_data = pd.read_sql(query, engine)

# Read the whole post_text_df table
query = "SELECT * FROM post_text_df"
post_text_df = pd.read_sql(query, engine)

# Read a limited number of rows from feed_data.
# NOTE(review): LIMIT without ORDER BY does not guarantee which rows are
# returned, so the sample may differ between runs.
query = "SELECT * FROM feed_data LIMIT 1000000"
feed_data = pd.read_sql(query, engine)

# Rename the identifier columns so they match the keys used in feed_data
user_data = user_data.rename(columns={'id': 'user_id'})
post_text_df = post_text_df.rename(columns={'id': 'post_id'})

# Attach user attributes and post attributes to every feed event
data = feed_data.merge(user_data, on='user_id', how='left')
data = data.merge(post_text_df, on='post_id', how='left')

In [3]:
# Preview the first rows of the merged feed/user/post frame
data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic
0,2021-12-21 16:02:35,11408,4547,view,0,1,20,Russia,Ivanovo,4,iOS,ads,"To be honest, I thought this movie would be a ...",movie
1,2021-12-21 16:05:27,11408,2580,view,0,1,20,Russia,Ivanovo,4,iOS,ads,Cop brags about the overtime money in a boarde...,covid
2,2021-12-21 16:06:53,11408,6318,view,0,1,20,Russia,Ivanovo,4,iOS,ads,Ive never really considered myself much of stu...,movie
3,2021-12-21 16:08:19,11408,3234,view,0,1,20,Russia,Ivanovo,4,iOS,ads,The total number of #COVID19 samples tested up...,covid
4,2021-12-21 16:09:31,11408,4521,view,0,1,20,Russia,Ivanovo,4,iOS,ads,This movie is just crap. Even though the direc...,movie


# Обработка временных меток

In [4]:
# Parse the raw timestamps into datetime values
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Calendar features derived from the event time
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour_of_day'] = data['timestamp'].dt.hour

# Seconds elapsed since the same user's previous event. The first event of each
# user has no predecessor (diff() yields NaT), so it is filled with 0.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and does
# not modify `data` when copy-on-write is enabled.
data = data.sort_values(['user_id', 'timestamp'])
data['time_since_last_action'] = (
    data.groupby('user_id')['timestamp'].diff().dt.total_seconds().fillna(0)
)

# The raw timestamp is no longer needed once the features are extracted.
# Note: this makes the cell non-re-runnable without re-running the load cell.
data = data.drop('timestamp', axis=1)

In [5]:
# Preview the frame after the time-based features were added
data.head()

Unnamed: 0,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic,day_of_week,hour_of_day,time_since_last_action
0,11408,4547,view,0,1,20,Russia,Ivanovo,4,iOS,ads,"To be honest, I thought this movie would be a ...",movie,1,16,0.0
1,11408,2580,view,0,1,20,Russia,Ivanovo,4,iOS,ads,Cop brags about the overtime money in a boarde...,covid,1,16,172.0
2,11408,6318,view,0,1,20,Russia,Ivanovo,4,iOS,ads,Ive never really considered myself much of stu...,movie,1,16,86.0
3,11408,3234,view,0,1,20,Russia,Ivanovo,4,iOS,ads,The total number of #COVID19 samples tested up...,covid,1,16,86.0
4,11408,4521,view,0,1,20,Russia,Ivanovo,4,iOS,ads,This movie is just crap. Even though the direc...,movie,1,16,72.0


# Кодирование категориальных признаков

In [6]:
# One LabelEncoder per categorical column; le_topic is created here but is not
# fitted in this cell.
le_gender, le_os, le_source, le_action, le_topic = (LabelEncoder() for _ in range(5))

# Replace 'gender', 'os', 'source' and 'action' with their integer codes
for column_name, encoder in (
    ('gender', le_gender),
    ('os', le_os),
    ('source', le_source),
    ('action', le_action),
):
    data[column_name] = encoder.fit_transform(data[column_name])

# 'topic' is left as text; show it as the cell output
data['topic']

0         movie
1         covid
2         movie
3         covid
4         movie
          ...  
935919    covid
935920    sport
935921    covid
935922    sport
935923    movie
Name: topic, Length: 1000000, dtype: object

In [7]:
# Preview the frame after label encoding
data.head()

Unnamed: 0,user_id,post_id,action,target,gender,age,country,city,exp_group,os,source,text,topic,day_of_week,hour_of_day,time_since_last_action
0,11408,4547,1,0,1,20,Russia,Ivanovo,4,1,0,"To be honest, I thought this movie would be a ...",movie,1,16,0.0
1,11408,2580,1,0,1,20,Russia,Ivanovo,4,1,0,Cop brags about the overtime money in a boarde...,covid,1,16,172.0
2,11408,6318,1,0,1,20,Russia,Ivanovo,4,1,0,Ive never really considered myself much of stu...,movie,1,16,86.0
3,11408,3234,1,0,1,20,Russia,Ivanovo,4,1,0,The total number of #COVID19 samples tested up...,covid,1,16,86.0
4,11408,4521,1,0,1,20,Russia,Ivanovo,4,1,0,This movie is just crap. Even though the direc...,movie,1,16,72.0


# Обучение модели CatBoost

## Train-test split

Этот код формирует матрицу признаков, исключая целевую переменную 'target', а также столбцы 'action' и 'text' (временная метка была удалена ранее), чтобы избежать утечки данных. Отбор топ-k признаков по взаимной информации в этой ячейке не выполняется. Затем данные разбиваются на обучающую и тестовую выборки в соотношении 80/20 с фиксированным состоянием генератора псевдослучайных чисел (random_state=42).

In [8]:
# Feature frame: everything except the target, the raw action and the post text
X = data.drop(columns=['target', 'action', 'text'])

In [9]:
# Display the feature frame (the notebook renders a truncated view)
X

Unnamed: 0,user_id,post_id,gender,age,country,city,exp_group,os,source,topic,day_of_week,hour_of_day,time_since_last_action
0,11408,4547,1,20,Russia,Ivanovo,4,1,0,movie,1,16,0.0
1,11408,2580,1,20,Russia,Ivanovo,4,1,0,covid,1,16,172.0
2,11408,6318,1,20,Russia,Ivanovo,4,1,0,movie,1,16,86.0
3,11408,3234,1,20,Russia,Ivanovo,4,1,0,covid,1,16,86.0
4,11408,4521,1,20,Russia,Ivanovo,4,1,0,movie,1,16,72.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
935919,166351,3210,0,21,Russia,Omsk,4,0,1,covid,4,15,597359.0
935920,166351,1512,0,21,Russia,Omsk,4,0,1,sport,4,15,173.0
935921,166351,3647,0,21,Russia,Omsk,4,0,1,covid,4,15,41.0
935922,166351,1463,0,21,Russia,Omsk,4,0,1,sport,4,15,24.0


In [10]:
# Drop the target and the columns that are not used as features
X = data.drop(columns=['target', 'action', 'text'])

# Target variable
y = data['target']

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


# Целевой кодировщик с предварительным сглаживанием.
Источник: https://towardsdatascience.com/dealing-with-categorical-variables-by-using-target-encoder-a0f1733a4c69

## Обучение модели на Precision@5 

Мы создаем группы данных на основе идентификатора пользователя 'user_id', чтобы иметь возможность проводить обучение с учетом группировки данных. Затем мы сортируем данные по группам и создаем объекты Pool для обучения и тестирования с колонкой 'group_id', которые затем будут использоваться для обучения модели и оценки ее производительности.

In [11]:
# Rebuild the feature frame and the split from `data`
# (same statements as the earlier split cell).
X = data.drop(columns=['target', 'action', 'text'])

# Target variable
y = data['target']

# 80/20 row-wise split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Columns that CatBoost should treat as categorical features
categorical_columns = ['country', 'topic', 'city', 'gender', 'os', 'source']

# Work on explicit copies: the frames returned by train_test_split may be
# flagged as slices of X, and assigning new columns to them can raise
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Build the group id from 'user_id'. The mapping covers users from BOTH splits:
# a mapping built from the training users only would give NaN group ids to any
# user that appears only in the test split. Training users come first, so their
# ids are the same as with a train-only mapping.
unique_user_ids = pd.concat([X_train['user_id'], X_test['user_id']]).unique()
group_id_dict = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
X_train['group_id'] = X_train['user_id'].map(group_id_dict)
X_test['group_id'] = X_test['user_id'].map(group_id_dict)

# Sort both splits by 'group_id' so that the rows of one group are adjacent,
# and realign the targets with the reordered rows
X_train = X_train.sort_values(by='group_id')
y_train = y_train.loc[X_train.index]

X_test = X_test.sort_values(by='group_id')
y_test = y_test.loc[X_test.index]

# Categorical features are passed to CatBoost as strings
X_train[categorical_columns] = X_train[categorical_columns].astype(str)
X_test[categorical_columns] = X_test[categorical_columns].astype(str)

# Positions of the categorical columns in the frame that is actually passed to
# Pool (i.e. after 'user_id' has been removed)
cat_features = [X_train.drop(columns=['user_id']).columns.get_loc(col) for col in categorical_columns]

# Pools for training and evaluation, grouped by user.
# NOTE(review): 'group_id' is still part of the feature frame here, so the model
# sees a relabelled user id as a numeric feature even though 'user_id' is
# dropped — confirm whether that is intended.
train_pool = Pool(X_train.drop(columns=['user_id']), y_train, cat_features=cat_features, group_id=X_train['group_id'])
test_pool = Pool(X_test.drop(columns=['user_id']), y_test, cat_features=cat_features, group_id=X_test['group_id'])

In [13]:
# Train a CatBoost classifier while tracking PrecisionAt:top=5 on the test pool
from catboost import CatBoostClassifier

precision_metric = 'PrecisionAt:top=5'
precision_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    custom_metric=precision_metric,
    eval_metric=precision_metric,
    random_seed=42,
    verbose=100,  # log every 100th iteration
)
precision_model = CatBoostClassifier(**precision_params)

precision_model.fit(train_pool, eval_set=test_pool)

0:	learn: 0.0376284	test: 0.0521242	best: 0.0521242 (0)	total: 303ms	remaining: 5m 3s
100:	learn: 0.1925303	test: 0.1818161	best: 0.1834034 (81)	total: 13s	remaining: 1m 55s
200:	learn: 0.2171802	test: 0.1852708	best: 0.1857376 (162)	total: 26.9s	remaining: 1m 47s
300:	learn: 0.2350140	test: 0.1843371	best: 0.1857376 (162)	total: 41.6s	remaining: 1m 36s
400:	learn: 0.2518207	test: 0.1809757	best: 0.1857376 (162)	total: 56.8s	remaining: 1m 24s
500:	learn: 0.2653595	test: 0.1819094	best: 0.1857376 (162)	total: 1m 11s	remaining: 1m 11s
600:	learn: 0.2766573	test: 0.1813492	best: 0.1857376 (162)	total: 1m 26s	remaining: 57.6s
700:	learn: 0.2858077	test: 0.1807890	best: 0.1857376 (162)	total: 1m 41s	remaining: 43.4s
800:	learn: 0.2974790	test: 0.1822829	best: 0.1857376 (162)	total: 1m 56s	remaining: 28.9s
900:	learn: 0.3059757	test: 0.1812558	best: 0.1857376 (162)	total: 2m 11s	remaining: 14.4s
999:	learn: 0.3126984	test: 0.1816293	best: 0.1857376 (162)	total: 2m 26s	remaining: 0us

bestTes

<catboost.core.CatBoostClassifier at 0x245050fc550>

# Сохранение и загрузка модели CatBoost

In [14]:
# Save the trained precision_model to the file catboost_precision_model.cbm
precision_model.save_model('catboost_precision_model.cbm')

In [15]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the saved model back from disk
loaded_model = CatBoostClassifier()
loaded_model.load_model('catboost_precision_model.cbm')

# Class predictions of the loaded model on the test pool
predictions = loaded_model.predict(test_pool)

# Accuracy of the loaded model on the test pool
score = loaded_model.score(test_pool)
print("Accuracy:", score)

# Class-weighted precision / recall / F1.
# zero_division=0 states explicitly how an undefined score is handled (a class
# that is never predicted contributes 0), instead of relying on the default that
# produces the same value but emits an UndefinedMetricWarning.
# NOTE(review): these thresholded, class-weighted scores are dominated by the
# majority class; they are not the ranking metric the model was tuned on.
precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.892425
Precision: 0.796422380625
Recall: 0.892425
F1-score: 0.8416950533046222


  _warn_prf(average, modifier, msg_start, len(result))


## Обучение модели на Recall@5 

In [16]:
# Train a CatBoost classifier while tracking RecallAt:top=5 on the test pool
recall_metric = 'RecallAt:top=5'
recall_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    custom_metric=recall_metric,
    eval_metric=recall_metric,
    random_seed=42,
    verbose=100,  # log every 100th iteration
)
recall_model = CatBoostClassifier(**recall_params)

recall_model.fit(train_pool, eval_set=test_pool)


0:	learn: 0.0072233	test: 0.0536633	best: 0.0536633 (0)	total: 150ms	remaining: 2m 30s
100:	learn: 0.0323096	test: 0.1305029	best: 0.1314942 (81)	total: 12.9s	remaining: 1m 55s
200:	learn: 0.0358339	test: 0.1333219	best: 0.1333219 (200)	total: 26.7s	remaining: 1m 46s
300:	learn: 0.0381929	test: 0.1331771	best: 0.1339299 (231)	total: 41.2s	remaining: 1m 35s
400:	learn: 0.0410799	test: 0.1307907	best: 0.1339299 (231)	total: 56s	remaining: 1m 23s
500:	learn: 0.0435274	test: 0.1315092	best: 0.1339299 (231)	total: 1m 10s	remaining: 1m 10s
600:	learn: 0.0459674	test: 0.1307759	best: 0.1339299 (231)	total: 1m 26s	remaining: 57.1s
700:	learn: 0.0473400	test: 0.1314657	best: 0.1339299 (231)	total: 1m 40s	remaining: 43s
800:	learn: 0.0487120	test: 0.1333418	best: 0.1339299 (231)	total: 1m 55s	remaining: 28.7s
900:	learn: 0.0501606	test: 0.1323055	best: 0.1339299 (231)	total: 2m 10s	remaining: 14.3s
999:	learn: 0.0514421	test: 0.1323504	best: 0.1339299 (231)	total: 2m 25s	remaining: 0us

bestTest

<catboost.core.CatBoostClassifier at 0x245050fd270>

In [17]:
# Save the trained recall_model to the file catboost_recall_model.cbm
recall_model.save_model('catboost_recall_model.cbm')

## Обучение модели на MAP@5 (в коде используется метрика PFound:top=5)

In [18]:
# Train a CatBoost classifier while tracking PFound:top=5 on the test pool.
# NOTE(review): the surrounding notebook text and the saved file name call this
# the "MAP" model, but the metric configured here is PFound — confirm which one
# is intended.
pfound_metric = 'PFound:top=5'
pfound_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    custom_metric=pfound_metric,
    eval_metric=pfound_metric,
    random_seed=42,
    verbose=100,  # log every 100th iteration
)
model = CatBoostClassifier(**pfound_params)

model.fit(train_pool, eval_set=test_pool)


0:	test: 0.1576056	best: 0.1576056 (0)	total: 152ms	remaining: 2m 31s
100:	test: 0.4926747	best: 0.4935747 (80)	total: 12.3s	remaining: 1m 49s
200:	test: 0.5006843	best: 0.5016528 (192)	total: 25.6s	remaining: 1m 41s
300:	test: 0.4992228	best: 0.5017593 (203)	total: 39.4s	remaining: 1m 31s
400:	test: 0.4932243	best: 0.5017593 (203)	total: 53.8s	remaining: 1m 20s


In [None]:
# Save the trained `model` to the file catboost_MAP_model.cbm
model.save_model('catboost_MAP_model.cbm')

- bestTest = 0.4451217663 for MAP@5
- bestTest = 0.1603712149 for Recall@5
- bestTest = 0.1660869565 for Precision@5

# Сравнение моделей

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Saved model files to compare, keyed by the name used in the report
models = {
    'recall': 'catboost_recall_model.cbm',
    'MAP': 'catboost_MAP_model.cbm',
    'precision': 'catboost_precision_model.cbm',
}

# model name -> {metric name -> value}
metrics = {}

for model_name, model_path in models.items():
    loaded_model = CatBoostClassifier()
    loaded_model.load_model(model_path)

    # Class predictions on the shared test pool
    predictions = loaded_model.predict(test_pool)

    # Class-weighted scores. zero_division=0 makes the handling of an undefined
    # score explicit (a class that is never predicted contributes 0) instead of
    # relying on the default, which yields the same value with an
    # UndefinedMetricWarning.
    # NOTE(review): these are thresholded classification scores, not the
    # top-5 ranking metrics the models were tuned on.
    precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)

    metrics[model_name] = {
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
    }

# Print the metrics of every model
for model_name, model_metrics in metrics.items():
    print(f"{model_name} model:")
    for metric_name, metric_value in model_metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")
    print()
    print()


recall model:
  Precision: 0.8522
  Recall: 0.8940
  F1-score: 0.8440

MAP model:
  Precision: 0.8257
  Recall: 0.8940
  F1-score: 0.8440

precision model:
  Precision: 0.7992
  Recall: 0.8940
  F1-score: 0.8440



  _warn_prf(average, modifier, msg_start, len(result))


Для выбора модели, которая будет оцениваться по Hitrate@5, нужно посмотреть на метрику PrecisionAt:top=5 для каждой модели. Чем выше PrecisionAt:top=5, тем лучше модель справляется с задачей рекомендации топ-5 элементов.

Из результатов сравнения выше (это взвешенная по классам метрика Precision из sklearn, а не PrecisionAt:top=5) видим следующую картину:

- recall model: Precision: 0.8522
- MAP model: Precision: 0.8257
- precision model: Precision: 0.7992

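Исходя из этой информации, лучшей моделью для оценки по метрике Hitrate@5 будет модель recall, так как у нее наивысшая точность (Precision) среди всех моделей. Однако это взвешенная по классам Precision, а не PrecisionAt:top=5, а значения Recall и F1-score у всех трех моделей совпадают, поэтому вывод стоит перепроверить по ранжирующей метрике на тестовой выборке. Мы будем использовать эту модель для рекомендации топ-5 элементов в нашей задаче.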