Просьба сначала открыть ноутбук EDA.ipynb, так как в нем происходит обработка и анализ данных

## Import

In [None]:
!pip install catboost

In [107]:
import warnings
import pandas as pd
import numpy as np
import pickle

import optuna

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, recall_score, precision_score

from catboost import CatBoostClassifier

import seaborn as sns
import matplotlib.pyplot as plt

## Function

In [108]:
def metrics(X_test, y_test):
    """Print classification metrics for the five sex-target models.

    Relies on the notebook-level fitted estimators ``lr_model``, ``dec_tree``,
    ``rand_forest``, ``cat_boost`` and ``mlp``. For each of them balanced
    accuracy, accuracy, F1, recall and precision are printed (scikit-learn
    scorers called with their default arguments), followed by a blank
    separator.

    Args:
        X_test: Feature matrix handed to every model's ``predict``.
        y_test: True labels the predictions are scored against.

    Returns:
        None. Results are only printed.
    """
    fitted = (
        ('Logistic Regression', lr_model),
        ('Decision Tree', dec_tree),
        ('Random Forest', rand_forest),
        ('CatBoost', cat_boost),
        ('MLP', mlp),
    )
    # Every model predicts before anything is printed.
    predictions = {title: estimator.predict(X_test) for title, estimator in fitted}

    # (label suffix, scorer) pairs; the suffix spacing is part of the output.
    scorers = (
        (' - balanced accuracy:', balanced_accuracy_score),
        (' -  accuracy:', accuracy_score),
        (' -  f1:', f1_score),
        (' -  recall:', recall_score),
        (' -  precision:', precision_score),
    )

    for title, predicted in predictions.items():
        for suffix, scorer in scorers:
            print(f'{title}{suffix}', scorer(y_test, predicted))
        print('\n')

def metrics_age(X_test, y_test):
    """Print classification metrics for the five age-class models.

    Relies on the notebook-level fitted estimators ``lr_model_age``,
    ``dec_tree_age``, ``rand_forest_age``, ``cat_boost_age`` and ``mlp_age``.
    Balanced accuracy and accuracy use scikit-learn defaults; F1, recall and
    precision are computed with ``average='weighted'``.

    Args:
        X_test: Feature matrix handed to every model's ``predict``.
        y_test: True age-class labels the predictions are scored against.

    Returns:
        None. Results are only printed.
    """
    fitted = (
        ('Logistic Regression', lr_model_age),
        ('Decision Tree', dec_tree_age),
        ('Random Forest', rand_forest_age),
        ('CatBoost', cat_boost_age),
        ('MLP', mlp_age),
    )
    # Every model predicts before anything is printed.
    predictions = {title: estimator.predict(X_test) for title, estimator in fitted}

    weighted = {'average': 'weighted'}
    # (label suffix, scorer, extra keyword arguments) triples; the suffix
    # spacing is part of the output.
    scorers = (
        (' - balanced accuracy:', balanced_accuracy_score, {}),
        (' -  accuracy:', accuracy_score, {}),
        (' -  f1:', f1_score, weighted),
        (' -  recall:', recall_score, weighted),
        (' -  precision:', precision_score, weighted),
    )

    for title, predicted in predictions.items():
        for suffix, scorer, extra in scorers:
            print(f'{title}{suffix}', scorer(y_test, predicted, **extra))
        print('\n')


def optuna_optimize(trial):
    """Optuna objective: fit the model selected by ``MODEL_TYPE`` and return its F1.

    Reads the notebook-level names ``MODEL_TYPE`` (model name as a string),
    the parameter factories ``paramslog`` / ``params_dec`` / ``params_rand`` /
    ``params_cat`` (each maps a trial to a kwargs dict) and the data splits
    ``X_train``, ``y_train``, ``X_test``, ``y_test``.

    Args:
        trial: Optuna trial passed on to the selected parameter factory.

    Returns:
        F1 score of the fitted model on ``X_test`` / ``y_test``.

    Raises:
        ValueError: If ``MODEL_TYPE`` is not one of the supported model names.
    """
    if MODEL_TYPE == 'LogisticRegression':
        # LogisticRegression is imported by name; no ``linear_model`` module
        # object is in scope in this notebook.
        # NOTE(review): ``paramslog`` is not defined in the visible notebook
        # and breaks the ``params_<model>`` naming of the others - confirm.
        model = LogisticRegression(**paramslog(trial))
    elif MODEL_TYPE == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier(**params_dec(trial))
    elif MODEL_TYPE == 'RandomForestClassifier':
        model = RandomForestClassifier(**params_rand(trial))
    elif MODEL_TYPE == 'CatBoostClassifier':
        model = CatBoostClassifier(**params_cat(trial), verbose=100)
    else:
        # Fail with a clear message instead of an unbound ``model`` below.
        raise ValueError(f'Unsupported MODEL_TYPE: {MODEL_TYPE!r}')

    model.fit(X_train, y_train)
    # NOTE(review): the trial is scored on X_test/y_test, the same split used
    # for the reported metrics - consider tuning on X_val/y_val instead.
    return f1_score(y_test, model.predict(X_test))

## Config

In [109]:
# Seed passed to train_test_split and to the scikit-learn estimators.
RANDOM_STATE = 42

# NOTE(review): not referenced in the visible notebook (MLPClassifier is
# created with max_iter=30) - confirm whether it is still needed.
MAX_ITER = 500

# Number of Optuna trials (referenced by the commented-out tuning cell).
N_TRIALS = 3
# Model classes considered for hyper-parameter tuning.
MODEL = [LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, CatBoostClassifier]
# NOTE(review): not read anywhere in the visible code.
METRIC = "F1"


# optuna.logging.set_verbosity(optuna.logging.WARNING)
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

## Data loading

In [110]:
# Viewing events: one row per event, keyed by viewer_uid.
data_train = pd.read_csv(r'train_events.csv')
data_train.head(5)

Unnamed: 0,event_timestamp,region,ua_device_type,ua_client_type,ua_os,ua_client_name,total_watchtime,rutube_video_id,viewer_uid
0,2024-06-01 06:40:58+03:00,Chelyabinsk,desktop,browser,Windows,Yandex Browser,1883,video_133074,10067243
1,2024-06-01 19:33:24+03:00,Bashkortostan Republic,smartphone,mobile app,Android,Rutube,512,video_362960,10245341
2,2024-06-01 21:30:43+03:00,St.-Petersburg,desktop,browser,Windows,Chrome,5647,video_96775,10894333
3,2024-06-01 23:03:42+03:00,Moscow,smartphone,mobile app,Android,Rutube,1521,video_161610,10029092
4,2024-06-01 22:48:09+03:00,Moscow,smartphone,mobile app,Android,Rutube,71,video_116245,10452976


In [111]:
# Targets (age, sex, age_class), keyed by viewer_uid.
data_target = pd.read_csv(r'train_targets.csv')
data_target.head(5)

Unnamed: 0,viewer_uid,age,sex,age_class
0,10087154,30,male,1
1,10908708,25,female,1
2,10190464,34,male,2
3,10939673,25,male,1
4,10288257,48,male,3


Так как время просмотра указано в секундах, приводим его к часам

In [112]:
# Convert total_watchtime from seconds to hours (3600 seconds per hour).
data_train['total_watchtime'] = data_train['total_watchtime'] / 3600

## Merging data

In [121]:
# Attach the targets to every event row by viewer_uid.
# NOTE(review): how='outer' also keeps unmatched rows from either side (as NaN);
# the NaN check further down shows none in age/sex/age_class, so how='inner'
# would presumably give the same rows while stating the intent - confirm.
all_train_data = pd.merge(data_train, data_target, on='viewer_uid', how='outer')

In [103]:
# Row/column count of the merged frame.
all_train_data.shape

(1759616, 12)

## Проверим на пропуски и дубли

In [114]:
# Missing-value count per column, before imputation.
all_train_data.isna().sum()

event_timestamp         0
region                  0
ua_device_type          0
ua_client_type          0
ua_os              117671
ua_client_name          0
total_watchtime         0
rutube_video_id         0
viewer_uid              0
age                     0
sex                     0
age_class               0
dtype: int64

Пропуски в категориальной переменной, поэтому заполним пропуски модой

In [118]:
# Most frequent ua_os value, used as the fill value for missing entries.
all_train_data['ua_os'].mode()[0]

'Android'

In [123]:
# Impute only the ua_os column with its own mode. A DataFrame-wide fillna
# would also write the OS name into NaNs of any other column.
all_train_data['ua_os'] = all_train_data['ua_os'].fillna(all_train_data['ua_os'].mode()[0])

In [124]:
# Re-check missing values after imputation.
all_train_data.isna().sum()

event_timestamp    0
region             0
ua_device_type     0
ua_client_type     0
ua_os              0
ua_client_name     0
total_watchtime    0
rutube_video_id    0
viewer_uid         0
age                0
sex                0
age_class          0
dtype: int64

In [125]:
# Number of fully duplicated rows.
all_train_data.duplicated().sum()

0

Полных дубликатов нет

In [126]:
# Encode the sex target as integers: female -> 0, male -> 1.
# Values outside the mapping are left unchanged by replace().
all_train_data['sex'] = all_train_data['sex'].replace({'female': 0, 'male': 1})
# Class balance of the binary target.
all_train_data['sex'].value_counts()

sex
0    1084515
1     675101
Name: count, dtype: int64

Привели один из таргетов к бинарной классификации (0 - женщина, 1 - мужчина)

In [None]:
# Category frequencies of ua_device_type.
all_train_data['ua_device_type'].value_counts() 

In [None]:
# Category frequencies of ua_client_type.
all_train_data['ua_client_type'].value_counts() 

Для энкодинга данных параметров будем использовать OHE, так как не сильно увеличим размерность итогового датасета

## Encoding

In [127]:
# Columns to one-hot encode.
encoding_data = ['ua_client_type', 'ua_device_type']
# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
encoding_ohe = OneHotEncoder(handle_unknown='ignore')
ohe_data = encoding_ohe.fit_transform(all_train_data[encoding_data])
# Densify the encoder output and label the columns "<feature>_<category>".
# ohe_df gets a fresh default index, not the index of all_train_data.
ohe_df = pd.DataFrame(ohe_data.toarray(), columns=encoding_ohe.get_feature_names_out(encoding_data))

In [128]:
# Put the one-hot columns next to the event rows (both frames are re-indexed
# from zero so rows line up by position), then drop the source categoricals.
merged_data_ohe = pd.concat(
    [all_train_data.reset_index(drop=True), ohe_df.reset_index(drop=True)],
    axis=1,
).drop(columns=encoding_data)

Далее будем использовать LabelEncoder для кодировки оставшихся категориальных переменных 

In [131]:
# Remaining categorical columns, each mapped to integer codes.
# NOTE(review): event_timestamp is encoded as an opaque category here, so the
# code is a rank of the raw timestamp string - confirm this is intended.
encoding_data = ['event_timestamp', 'region', 'ua_os', 'ua_client_name', 'rutube_video_id']

# One fitted encoder per column. Refitting a single shared LabelEncoder would
# discard every mapping except the last, so the same encoding could not be
# re-applied to new data.
label_encoders = {}
encoded_data = {}

for col in encoding_data:
    le = LabelEncoder()
    encoded_data[col] = le.fit_transform(all_train_data[col])
    label_encoders[col] = le

encoded_df = pd.DataFrame(encoded_data)

encoded_df

Unnamed: 0,event_timestamp,region,ua_os,ua_client_name,rutube_video_id
0,5901,13,20,57,9640
1,5442,13,20,57,114914
2,42207,13,20,57,125297
3,108470,13,20,57,87989
4,196439,13,20,57,23341
...,...,...,...,...,...
1759611,1219801,54,0,11,92058
1759612,1219182,95,11,47,15378
1759613,1213996,86,20,9,23050
1759614,1201551,86,0,22,62853


In [132]:
# Mark the label-encoded columns so they do not clash with the raw ones.
encoded_df = encoded_df.add_suffix('_encoded')

# Align both frames by position and place the encoded columns first.
merged_data = pd.concat(
    [encoded_df.reset_index(drop=True), merged_data_ohe.reset_index(drop=True)],
    axis=1,
)

print("Колонки после объединения:")
print(merged_data.columns)

# Remove the raw categorical columns; names that are absent are skipped.
merged_data = merged_data.drop(columns=encoding_data, errors='ignore')

print("Колонки после удаления исходных данных:")
print(merged_data.columns)

Колонки после объединения:
Index(['event_timestamp_encoded', 'region_encoded', 'ua_os_encoded',
       'ua_client_name_encoded', 'rutube_video_id_encoded', 'event_timestamp',
       'region', 'ua_os', 'ua_client_name', 'total_watchtime',
       'rutube_video_id', 'viewer_uid', 'age', 'sex', 'age_class',
       'ua_client_type_av', 'ua_client_type_browser',
       'ua_client_type_mobile app', 'ua_device_type_desktop',
       'ua_device_type_smartphone', 'ua_device_type_tablet'],
      dtype='object')
Колонки после удаления исходных данных:
Index(['event_timestamp_encoded', 'region_encoded', 'ua_os_encoded',
       'ua_client_name_encoded', 'rutube_video_id_encoded', 'total_watchtime',
       'viewer_uid', 'age', 'sex', 'age_class', 'ua_client_type_av',
       'ua_client_type_browser', 'ua_client_type_mobile app',
       'ua_device_type_desktop', 'ua_device_type_smartphone',
       'ua_device_type_tablet'],
      dtype='object')


In [None]:
# Row/column count of the fully encoded frame.
merged_data.shape

In [136]:
# Confirm the positional concatenations introduced no missing values.
merged_data.isna().sum()

event_timestamp_encoded      0
region_encoded               0
ua_os_encoded                0
ua_client_name_encoded       0
rutube_video_id_encoded      0
total_watchtime              0
viewer_uid                   0
age                          0
sex                          0
age_class                    0
ua_client_type_av            0
ua_client_type_browser       0
ua_client_type_mobile app    0
ua_device_type_desktop       0
ua_device_type_smartphone    0
ua_device_type_tablet        0
dtype: int64

Пропусков нет

## Sampling

Делим данные на 3 выборки: обучающая — 60%, тестовая и валидационная — по 20%

In [137]:
# Features: everything except the three target columns.
# NOTE(review): viewer_uid stays in X as a numeric feature - confirm that an
# identifier is meant to be used as a model input.
X = merged_data.drop(columns=['sex', 'age', 'age_class'])
y = merged_data['sex']

# 60% train; the remaining 40% is halved into test and validation.
# NOTE(review): rows are events and the split is random per row, so events of
# one viewer_uid can land in both train and test - consider a grouped split.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=RANDOM_STATE)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE)

## Modeling

Попробуем использовать несколько моделей и сравним их результаты

In [138]:
# Baseline: logistic regression with default hyper-parameters.
# NOTE(review): MAX_ITER from the config is not passed here - confirm.
lr_model = LogisticRegression(random_state=RANDOM_STATE).fit(X_train, y_train)

In [71]:
# Shallow decision tree (depth capped at 3).
dec_tree = DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE).fit(X_train, y_train)

In [140]:
# Random forest of 70 trees, each capped at depth 2.
rand_forest = RandomForestClassifier(max_depth=2, n_estimators=70, random_state=RANDOM_STATE).fit(X_train, y_train)

In [141]:
# Gradient boosting model for the sex target.
# NOTE(review): X_test/y_test serve as eval_set here and as the reporting set
# in metrics(); if CatBoost picks its best iteration from eval_set, the
# reported test metrics are optimistic - consider X_val/y_val. Confirm.
# NOTE(review): unlike the scikit-learn models, no seed from RANDOM_STATE is
# passed to this model.
cat_boost = CatBoostClassifier(
    iterations=300,
    learning_rate=0.1,
    depth = 10,
    #loss_function='CrossEntropy'
    ).fit(X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6682615	test: 0.6684379	best: 0.6684379 (0)	total: 1.4s	remaining: 6m 57s
1:	learn: 0.6483180	test: 0.6486745	best: 0.6486745 (1)	total: 3.65s	remaining: 9m 4s
2:	learn: 0.6318130	test: 0.6321912	best: 0.6321912 (2)	total: 4.76s	remaining: 7m 50s
3:	learn: 0.6193437	test: 0.6198258	best: 0.6198258 (3)	total: 5.66s	remaining: 6m 58s
4:	learn: 0.6088519	test: 0.6093192	best: 0.6093192 (4)	total: 6.86s	remaining: 6m 44s
5:	learn: 0.6004099	test: 0.6009708	best: 0.6009708 (5)	total: 12.8s	remaining: 10m 24s
6:	learn: 0.5939511	test: 0.5945835	best: 0.5945835 (6)	total: 15.6s	remaining: 10m 51s
7:	learn: 0.5885972	test: 0.5892844	best: 0.5892844 (7)	total: 17.8s	remaining: 10m 50s
8:	learn: 0.5840254	test: 0.5847914	best: 0.5847914 (8)	total: 18.7s	remaining: 10m 4s
9:	learn: 0.5799763	test: 0.5807551	best: 0.5807551 (9)	total: 19.7s	remaining: 9m 31s
10:	learn: 0.5765921	test: 0.5774470	best: 0.5774470 (10)	total: 20.7s	remaining: 9m 2s
11:	learn: 0.5742180	test: 0.5750935	best

In [142]:
# Multi-layer perceptron limited to 30 training iterations.
mlp = MLPClassifier(random_state=RANDOM_STATE, max_iter=30).fit(X_train, y_train)

## Predicting

Считаем метрики

In [143]:
# Metrics of all sex models on the test split.
metrics(X_test, y_test)

Logistic Regression - balanced accuracy: 0.5195753990495173
Logistic Regression -  accuracy: 0.6117616637730413
Logistic Regression -  f1: 0.19504406843568836
Logistic Regression -  recall: 0.12251680137371584
Logistic Regression -  precision: 0.47802356474529284


Decision Tree - balanced accuracy: 0.6749943721019361
Decision Tree -  accuracy: 0.7033640881670138
Decision Tree -  f1: 0.5886298168807065
Decision Tree -  recall: 0.5528022026822986
Decision Tree -  precision: 0.6294233151583081


Random Forest - balanced accuracy: 0.5757673389918464
Random Forest -  accuracy: 0.6567374113087238
Random Forest -  f1: 0.3367847770469843
Random Forest -  recall: 0.2270183852917666
Random Forest -  precision: 0.6520685403290957


CatBoost - balanced accuracy: 0.7013076483733627
CatBoost -  accuracy: 0.7343310894712763
CatBoost -  f1: 0.6177089935191054
CatBoost -  recall: 0.5590712615093111
CatBoost -  precision: 0.6900883451949167


MLP - balanced accuracy: 0.5
MLP -  accuracy: 0.616086473461

По метрикам можем определить, что с задачей лучше всего справился catboost, поэтому в дальнейшем будем использовать его

Считаем метрики на трейне, чтобы оценить, есть ли переобучение

In [144]:
# Same metrics on the training split, to compare against the test split.
metrics(X_train, y_train)

Logistic Regression - balanced accuracy: 0.5209715617377889
Logistic Regression -  accuracy: 0.613088658598614
Logistic Regression -  f1: 0.19763940531398974
Logistic Regression -  recall: 0.12411373762193047
Logistic Regression -  precision: 0.4848922943472604


Decision Tree - balanced accuracy: 0.6759733226761732
Decision Tree -  accuracy: 0.7041530865179788
Decision Tree -  f1: 0.5900648345014043
Decision Tree -  recall: 0.5545695860901829
Decision Tree -  precision: 0.6304145423544391


Random Forest - balanced accuracy: 0.5778065746631325
Random Forest -  accuracy: 0.6586952259443117
Random Forest -  f1: 0.34034412625651483
Random Forest -  recall: 0.22932301149119041
Random Forest -  precision: 0.659742084755747


CatBoost - balanced accuracy: 0.7065020364024361
CatBoost -  accuracy: 0.7392232581180164
CatBoost -  f1: 0.6248034210863421
CatBoost -  recall: 0.5655328428978128
CatBoost -  precision: 0.6979521872564303


MLP - balanced accuracy: 0.5
MLP -  accuracy: 0.6160580581547

Так как метрики на тестовой и обучающей выборках находятся в одном диапазоне, переобучения не произошло

Выгрузили для проверки собственной модели

In [42]:
# Export the validation features without the index column.
# NOTE(review): only X_val is written; y_val is not saved alongside it.
X_val.to_csv('val_data.csv', index=False)

Оптимизация гиперпараметров

In [None]:
# MODEL_TYPE = 'CatBoostClassifier'

# params_cat = lambda trial: {
#             "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#             "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#             "depth": trial.suggest_int("depth", 4, 12),
#             "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#             "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
#             }

# params_cat_opt = lambda trial: optuna_optimize(trial)

# study_cat_boost = optuna.create_study(study_name='Cat Boost optimize', direction='maximize')
# study_cat_boost.optimize(
#     params_cat_opt,
#     n_trials=N_TRIALS,
#     timeout=60, 
# )

In [None]:
# study_cat_boost.best_trial.value

In [None]:
# cat_boost = CatBoostClassifier(**study_cat_boost.best_params, random_state=RANDOM_STATE, verbose=100)
# cat_boost = cat_boost.fit(X_train, y_train)

## Age predicting

In [155]:
# Same feature columns as for the sex task; the target is age_class.
X_age = merged_data.drop(columns=['age_class', 'age', 'sex'])
y_age = merged_data['age_class']

# Same proportions and seed as the sex split (60% train, 20% test, 20% validation).
X_train_age, X_temp_age, y_train_age, y_temp_age = train_test_split(X_age, y_age, test_size=0.4, random_state=RANDOM_STATE)
X_test_age, X_val_age, y_test_age, y_val_age = train_test_split(X_temp_age, y_temp_age, test_size=0.5, random_state=RANDOM_STATE)

In [156]:
# Gradient boosting model for the age_class target.
# NOTE(review): the test split is used as eval_set and is also the split the
# metrics are reported on - consider X_val_age/y_val_age. Confirm.
cat_boost_age = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=8,
    #loss_function='CrossEntropy'
    ).fit(X_train_age, y_train_age,
        eval_set=(X_test_age, y_test_age),
        verbose=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.3446844	test: 1.3446665	best: 1.3446665 (0)	total: 9.61s	remaining: 15m 51s
1:	learn: 1.3106564	test: 1.3106323	best: 1.3106323 (1)	total: 13s	remaining: 10m 38s
2:	learn: 1.2822647	test: 1.2822601	best: 1.2822601 (2)	total: 15.6s	remaining: 8m 24s
3:	learn: 1.2583107	test: 1.2583322	best: 1.2583322 (3)	total: 18.2s	remaining: 7m 17s
4:	learn: 1.2383662	test: 1.2383871	best: 1.2383871 (4)	total: 20.8s	remaining: 6m 35s
5:	learn: 1.2211925	test: 1.2212260	best: 1.2212260 (5)	total: 23.3s	remaining: 6m 5s
6:	learn: 1.2065755	test: 1.2066276	best: 1.2066276 (6)	total: 25.9s	remaining: 5m 43s
7:	learn: 1.1939600	test: 1.1940012	best: 1.1940012 (7)	total: 28.3s	remaining: 5m 26s
8:	learn: 1.1825611	test: 1.1826267	best: 1.1826267 (8)	total: 30.8s	remaining: 5m 11s
9:	learn: 1.1731872	test: 1.1732830	best: 1.1732830 (9)	total: 33.4s	remaining: 5m
10:	learn: 1.1648772	test: 1.1650017	best: 1.1650017 (10)	total: 35.9s	remaining: 4m 50s
11:	learn: 1.1577076	test: 1.1578331	best: 1.1

In [157]:
# Baseline logistic regression for age_class.
lr_model_age = LogisticRegression(random_state=RANDOM_STATE).fit(X_train_age, y_train_age)

In [158]:
# Shallow decision tree (depth capped at 3) for age_class.
dec_tree_age = DecisionTreeClassifier(max_depth=3, random_state=RANDOM_STATE).fit(X_train_age, y_train_age)

In [159]:
# Random forest of 70 trees, each capped at depth 2, for age_class.
rand_forest_age = RandomForestClassifier(max_depth=2, n_estimators=70, random_state=RANDOM_STATE).fit(X_train_age, y_train_age)

In [160]:
# Multi-layer perceptron limited to 30 training iterations, for age_class.
mlp_age = MLPClassifier(random_state=RANDOM_STATE, max_iter=30).fit(X_train_age, y_train_age)

In [161]:
# Metrics of all age-class models on the test split.
metrics_age(X_test_age, y_test_age)

Logistic Regression - balanced accuracy: 0.25030109713219245
Logistic Regression -  accuracy: 0.38374871775928254
Logistic Regression -  f1: 0.22275827227584943
Logistic Regression -  recall: 0.38374871775928254
Logistic Regression -  precision: 0.2965607810457074


Decision Tree - balanced accuracy: 0.3276493911054477
Decision Tree -  accuracy: 0.4275906945553431
Decision Tree -  f1: 0.34401512167093795
Decision Tree -  recall: 0.4275906945553431
Decision Tree -  precision: 0.42840884995000117


Random Forest - balanced accuracy: 0.27231845434845275
Random Forest -  accuracy: 0.4137893800632525
Random Forest -  f1: 0.34096310874910174
Random Forest -  recall: 0.4137893800632525
Random Forest -  precision: 0.32170537536166405


CatBoost - balanced accuracy: 0.3433115298092019
CatBoost -  accuracy: 0.46413277904541617
CatBoost -  f1: 0.4504533817777557
CatBoost -  recall: 0.46413277904541617
CatBoost -  precision: 0.4673721526118814


MLP - balanced accuracy: 0.25045804857439014
MLP -  

In [None]:
# Same metrics on the training split, to compare against the test split.
metrics_age(X_train_age, y_train_age)

## Saving models

In [163]:
# Pickle both CatBoost models, the age model first and the sex model second.
for pkl_path, fitted_model in (
    ('catboost_model_age_new.pkl', cat_boost_age),
    ('catboost_model_new.pkl', cat_boost),
):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(fitted_model, handle)

Если будет нужно - можно подгрузить модель:

In [94]:
# with open('catboost_model_age.pkl', 'rb') as file:
#     loaded_model = pickle.load(file)

## Final score

Получаем предсказание и для пола, и для возраста

In [164]:
# Age-class predictions of the CatBoost model on the test split.
y_pred_age = cat_boost_age.predict(X_test_age)

In [165]:
# Sex predictions of the CatBoost model on the test split.
y_pred = cat_boost.predict(X_test)

In [166]:
# Weighted F1 for the age-class task and plain accuracy for the sex task.
f1_weighted = f1_score(y_test_age, y_pred_age, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Combined score: 70% weight on age-class F1, 30% on sex accuracy.
final_score = 0.7 * f1_weighted + 0.3 * accuracy
print(f'Weighted F1 = {f1_weighted:.4f} \nAccuracy = {accuracy:.4f} \nFinal Score = {final_score:.4f} \n')

Weighted F1 = 0.4505 
Accuracy = 0.7343 
Final Score = 0.5356 



Дальнейшая агрегация происходит при формировании итогового файла submission_csv в файле main.py