In [302]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import re 
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [303]:
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and dtype warnings
# raised by the libraries used below; consider narrowing it by category or module.
import warnings
warnings.filterwarnings("ignore")

In [479]:
# Load the raw train/test tables.
# The paths are built with pathlib so the notebook runs on any OS: the original
# hard-coded Windows '\\' separators, which POSIX systems treat as part of the
# file name rather than as a directory separator.
from pathlib import Path

DATA_DIR = Path('Data')
train = pd.read_csv(DATA_DIR / 'train_dataset_train.csv')
test = pd.read_csv(DATA_DIR / 'test_dataset_test.csv')

## Preprocessing

In [480]:
# Project-local preprocessing step applied to both frames.
# The return value is discarded, so this presumably modifies `train` and `test`
# in place — TODO confirm against preprocessing.py.
from preprocessing import preprocess
preprocess(train, test)

## Feature Engineering

In [481]:
# Import the generators by name instead of `import *`: a wildcard import hides
# where each function comes from and can silently overwrite notebook globals
# that happen to share a name with something defined in feature_generation.
from feature_generation import (
    generate_age,
    generate_dev_from_group_average_birth,
    generate_gap_year_dur,
    generate_group_freq,
    generate_in_year_diff,
    generate_is_school_certificate,
    generate_place_of_study_nb,
    generate_relative_rating,
)

# Every generator receives both frames and returns the updated (train, test)
# pair; they are applied in the order listed here.
train, test = generate_place_of_study_nb(train, test)
train, test = generate_dev_from_group_average_birth(train, test)
train, test = generate_age(train, test)
train, test = generate_gap_year_dur(train, test)
train, test = generate_is_school_certificate(train, test)
# The second and third arguments are the grouping column and the name of the
# generated column; the trailing flag and number are defined by
# generate_relative_rating itself (see feature_generation for their meaning).
train, test = generate_relative_rating(train, test, 'КодФакультета', 'СрБаллАттестата_отн_факультета', True, 0)
train, test = generate_relative_rating(train, test, 'Код_группы', 'СрБаллАттестата_отн_группы', False, 1e-4)
train, test = generate_group_freq(train, test)
train, test = generate_in_year_diff(train, test)

# Identical post-processing for both frames.
for frame in (train, test):
    # Remove the raw 'СрБаллАттестата' column once all generators have run.
    frame.drop(columns='СрБаллАттестата', inplace=True)
    # String copies of the group and faculty codes; the numeric originals are
    # kept alongside them.
    frame['Код_группы_кат'] = frame['Код_группы'].apply(str)
    frame['Код_факультета_кат'] = frame['КодФакультета'].apply(str)

## Train and evaluate

In [482]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, precision_score
from sklearn.ensemble import RandomForestClassifier

In [491]:
# Extra feature columns to exclude from the model input (none at the moment).
drop_columns = []

In [484]:
# Feature matrix: everything except the row identifier, the target and any
# columns listed in `drop_columns`.
non_feature_columns = ['ID', 'Статус'] + drop_columns
X = train.drop(columns=non_feature_columns)
# Target vector.
Y = train['Статус']

# Object-dtype columns are the ones handed to CatBoost as categorical features.
cat_cols = []
for column_name in X.columns:
    if X[column_name].dtype == 'object':
        cat_cols.append(column_name)

In [486]:
# Hyper-parameters passed to CatBoost for every cross-validation fold.
catboost_config = dict(
    iterations=1500,
    eval_metric='TotalF1:average=Macro',
    use_best_model=True,
    learning_rate=0.08,
    rsm=0.5,
    task_type='CPU',
    max_ctr_complexity=0,
    depth=6,
    random_seed=14121995,  # fixed seed so runs are repeatable
)

In [None]:
from training_utils import cross_val_catboost

In [487]:
%%capture
# Cross-validation settings; `n_splits` and `average` are read again by the
# reporting cell that follows.
n_splits = 10
average = 'macro'

# Run the project-local cross-validation helper with the CatBoost config.
res = cross_val_catboost(
    catboost_config, X, Y, cat_cols, None, n_splits,
    average=average, verbose=False,
)

# Pull the per-fold scores and the fitted models out of the result mapping.
train_scores, test_scores, models = res['train'], res['test'], res['models']

In [488]:
# Report the cross-validation F1 scores.
if average == 'macro':
    # Half-width printed after "+-": 3 * std / sqrt(n_splits), i.e. three
    # standard errors of the fold mean (np.std uses ddof=0 by default).
    root_n = np.sqrt(n_splits)
    for label, scores in (
        ('Macro F1 на обучающей выборке', train_scores),
        ('Macro F1 на тестовой выборке', test_scores),
    ):
        print(f'{label}: {np.mean(scores):.3f} +- {3 * np.std(scores) / root_n:.3f}')
elif average is None:  # identity check is the correct test for None
    for label, scores in (
        ('F1 train score per class', train_scores),
        ('F1 test score per class', test_scores),
    ):
        fold_scores = np.asarray(scores)
        # Join however many per-class means there are instead of assuming
        # exactly three classes in the format string.
        per_class = '/'.join('%.3f' % value for value in fold_scores.mean(axis=0))
        print('%s: %s' % (label, per_class), 'total: %.3f' % fold_scores.mean())


Macro F1 на обучающей выборке: 0.909 +- 0.023
Macro F1 на тестовой выборке: 0.801 +- 0.020


### Inference on test data and submission

In [235]:
# Majority-vote ensemble over the models fitted during cross-validation.

# Remove the same optional columns that were excluded from the training
# features, so inference stays consistent when `drop_columns` is non-empty.
test_features = test.drop(columns=['ID'] + drop_columns)
# The pool does not depend on the model, so build it once rather than per fold.
test_pool = Pool(test_features, cat_features=cat_cols)

# One column of predicted labels per model. Flattening each prediction first
# gives a (n_rows, n_models) matrix whether predict() returns (n, 1) or (n,);
# a plain hstack of 1-D outputs would instead concatenate them end to end.
preds = np.column_stack([np.ravel(model.predict(test_pool)) for model in models])

# Keep the dtype of the predicted labels instead of coercing them to float
# (a zeros() buffer would turn integer class labels into e.g. 4.0).
test_status = np.empty(preds.shape[0], dtype=preds.dtype)
for i, votes in enumerate(preds):
    labels, counts = np.unique(votes, return_counts=True)
    # np.unique returns sorted labels and argmax picks the first maximum,
    # so ties are resolved in favour of the smallest label.
    test_status[i] = labels[np.argmax(counts)]

    

In [236]:
# Assemble the submission table (test row ID plus the voted status) and write
# it to disk without the DataFrame index.
submission_columns = {'ID': test['ID'], 'Статус': test_status}
subm = pd.DataFrame(submission_columns)
subm.to_csv('submition.csv', index=False)