In [493]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import sys
import re 
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [508]:
# Make modules that live in the local ``source`` directory importable
# (presumably preprocessing, feature_generation and training_utils — confirm).
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if 'source' not in sys.path:
    sys.path.append('source')

In [495]:
import warnings
warnings.filterwarnings("ignore")

In [647]:
# Load the raw train / test tables from the local Data directory.
# Forward slashes are used instead of escaped backslashes: they resolve
# correctly on Windows as well as on Linux/macOS, whereas 'Data\\...' is
# treated as a single literal file name outside Windows.
train = pd.read_csv('Data/train_dataset_train.csv')
test = pd.read_csv('Data/test_dataset_test.csv')

## Preprocessing

In [648]:
# Run the project's preprocessing routine on both frames.
# NOTE(review): the return value is discarded, so preprocess presumably
# mutates `train` and `test` in place — confirm against the preprocessing module.
from preprocessing import preprocess
preprocess(train, test)

## Feature Engineering

In [649]:
from feature_generation import *

# Feature pipeline: every step is called as step(train, test, *extra_args)
# and returns the updated (train, test) pair. Steps run in the listed order.
feature_steps = [
    (generate_place_of_study_nb, ()),
    (generate_dev_from_group_average_birth, ()),
    (generate_age, ()),
    (generate_gap_year_dur, ()),
    (generate_is_school_certificate, ()),
    # Two relative-rating features, keyed by faculty code and by group code.
    # NOTE(review): meaning of the trailing bool / numeric arguments is defined
    # in feature_generation — confirm there.
    (generate_relative_rating, ('КодФакультета', 'СрБаллАттестата_отн_факультета', True, 0)),
    (generate_relative_rating, ('Код_группы', 'СрБаллАттестата_отн_группы', False, 1e-4)),
    (generate_group_freq, ()),
    (generate_in_year_diff, ()),
]
for step, extra_args in feature_steps:
    train, test = step(train, test, *extra_args)

for frame in (train, test):
    # The raw 'СрБаллАттестата' column is removed once the derived features exist.
    frame.drop(columns='СрБаллАттестата', inplace=True)
    # String copy of the group code, so it is picked up as an object-dtype column.
    frame['Код_группы_кат'] = frame['Код_группы'].map(str)

## Train and evaluate

In [650]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, precision_score
from sklearn.ensemble import RandomForestClassifier

In [651]:
# Extra feature columns to exclude from the training matrix (none at the moment).
drop_columns = []

In [652]:
# Target vector and feature matrix: the identifier, the target and any
# explicitly excluded columns are removed from the features.
Y = train['Статус']
X = train.drop(columns=['ID', 'Статус', *drop_columns])

# Every object-dtype column is treated as a categorical feature.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

In [668]:
# Keyword arguments for the CatBoost model; passed to cross_val_catboost below.
catboost_config = {
    # Boosting schedule.
    'iterations': 1500,
    'eval_metric': 'TotalF1:average=Macro',
    'use_best_model': True,
    'learning_rate': 0.08,
    # Tree / feature-sampling settings.
    'rsm': 0.7,
    'task_type': 'CPU',
    'max_ctr_complexity': 0,
    'depth': 6,
    # Fixed seed for repeatable runs.
    'random_seed': 1,
}

In [669]:
from training_utils import cross_val_catboost

In [678]:
%%capture
# Cross-validate the CatBoost configuration and keep the per-split results.
n_splits = 10
average = 'macro'

# NOTE(review): the fifth positional argument is passed as None; its meaning
# is defined by cross_val_catboost in training_utils — confirm there.
res = cross_val_catboost(
    catboost_config,
    X,
    Y,
    cat_cols,
    None,
    n_splits,
    average=average,
    verbose=False,
)

# Unpack the result mapping: train scores, test scores and the fitted models.
train_scores, test_scores, models = (res[key] for key in ('train', 'test', 'models'))

### Inference on test data and submission

In [681]:
# Ensemble inference: every cross-validation model predicts the test set and
# the final label of each row is chosen by majority vote.

# Select exactly the columns the models were trained on, in training order.
# This also honours `drop_columns`, which a plain `test.drop(columns='ID')`
# would ignore. The Pool is built once and reused for every model.
test_pool = Pool(test[X.columns], cat_features=cat_cols)

# One column of votes per model. Each prediction is flattened first so the
# matrix is (n_rows, n_models) whether predict() returns shape (n,) or (n, 1).
preds = np.column_stack(
    [np.asarray(model.predict(test_pool)).reshape(-1) for model in models]
)


def _majority_vote(votes):
    """Return the most frequent label in ``votes``.

    Ties are resolved in favour of the smallest label, because ``np.unique``
    returns labels sorted and ``np.argmax`` picks the first maximum.
    """
    labels, counts = np.unique(votes, return_counts=True)
    return labels[np.argmax(counts)]


# Built from the voted labels directly so the predicted label dtype is kept
# (a float-initialised buffer would silently cast integer labels to floats).
test_status = np.array([_majority_vote(row) for row in preds])

    

In [682]:
# Assemble the submission table (row ID + predicted status) and write it to
# disk without the index column.
subm = test[['ID']].copy()
subm['Статус'] = test_status
subm.to_csv('submition.csv', index=False)