In [None]:
import json
# import ast

from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from catboost import CatBoostClassifier, Pool, metrics
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
# from transformers import AutoTokenizer
# import torch
import shap

from pandarallel import pandarallel
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker  # to manipulate x-tickers
import plotly.express as px
from Levenshtein import distance as lev_distance

import os
from tqdm import tqdm
import random
from collections import Counter
import pickle
import warnings

from IPython.core.interactiveshell import InteractiveShell

# Notebook-wide configuration: warning filter, plotting defaults, pandas display limits,
# tokenizer parallelism flag, pandarallel workers and IPython output mode.
warnings.filterwarnings('ignore')
sns.set(rc={'figure.figsize': (20, 10), 'figure.facecolor': 'white'})
sns.set_palette("viridis")
# `None` means "do not truncate cell contents"; the legacy value -1 was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
os.environ["TOKENIZERS_PARALLELISM"] = "true"  # activate parallelism
pandarallel.initialize(progress_bar=True)
InteractiveShell.ast_node_interactivity = "all"  # show all outputs, not only the last

In [None]:
# Load the raw training dataset from its CSV file (path is relative to the notebook).
data = pd.read_csv("../data/raw/train_dataset_Самолет.csv")

In [None]:
def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Exports ``PYTHONHASHSEED`` and seeds both the standard-library ``random``
    module and NumPy's global generator with the same value.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # The torch seeding calls are kept disabled, matching the disabled torch import:
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True


seed_everything()

In [None]:
def get_column_indices(df: pd.DataFrame, column_names: list) -> list:
    """Return the positional indices of the requested columns.

    Args:
        df: Frame whose column positions are looked up.
        column_names: Column labels, in the order the positions should be returned.

    Returns:
        One position per label that exists in ``df``; labels that are not
        columns of ``df`` are skipped silently.
    """
    positions = []
    for name in column_names:
        if name in df.columns:
            positions.append(df.columns.get_loc(name))
    return positions

In [None]:
def plot_roc_curve(model, X_test, y_test):
    """Draw the ROC curve of a fitted classifier on the given data.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Features to score.
        y_test: True binary labels for ``X_test``.
    """
    # Scores of the positive class (second probability column).
    positive_scores = model.predict_proba(X_test)[:, 1]

    # ROC points and the area under them; the thresholds are not needed here.
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    roc_auc = auc(fpr, tpr)

    chance_line = [0, 1]

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal as a reference line.
    plt.plot(chance_line, chance_line, color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

# Feature Engineering 

In [None]:
def create_features(data):
    """Add calendar features derived from the ``report_date`` column.

    The columns ``day``, ``week`` (ISO week number), ``weekday``, ``month``
    and ``year`` are written into ``data`` itself and the same frame is
    returned, so callers may either use the return value or rely on the
    in-place change.

    Args:
        data: DataFrame with a ``report_date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The input DataFrame with the five calendar columns added.
    """
    # Parse the column once instead of once per derived feature.
    report_dates = pd.to_datetime(data['report_date'])

    def _as_numeric(values):
        # Newer pandas returns int32 / UInt32 for datetime fields. The rest of the
        # notebook tells numerical from categorical columns by float64/int64, so pin
        # the dtype: int64 when every date is present, float64 (NaN) otherwise.
        target_dtype = 'float64' if values.isna().any() else 'int64'
        return values.astype(target_dtype)

    data['day'] = _as_numeric(report_dates.dt.day)
    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is the supported replacement.
    data['week'] = _as_numeric(report_dates.dt.isocalendar().week)
    data['weekday'] = _as_numeric(report_dates.dt.weekday)
    data['month'] = _as_numeric(report_dates.dt.month)
    data['year'] = _as_numeric(report_dates.dt.year)
    return data

In [None]:
# Add the calendar columns derived from `report_date` to the loaded frame.
data = create_features(data)

In [None]:
# data.head()

In [None]:
# Distinct dtypes present in the frame (displayed as the cell output).
set(data.dtypes)

In [None]:
# Columns stored as float64/int64 are treated as numerical; every other column is categorical.
numeric_dtypes = ['float64', 'int64']
numerical_columns = data.select_dtypes(include=numeric_dtypes).columns
categorical_columns = data.select_dtypes(exclude=numeric_dtypes).columns

In [None]:
# Separate the label column from the feature matrix.
Y = data['target']
X = data.drop('target', axis=1)

In [None]:
# Positions of the feature columns whose dtype is neither float64 nor int64.
np.flatnonzero(~X.dtypes.isin(['float64', 'int64']))

In [None]:
# Store every categorical feature as a string column.
for column_name in categorical_columns:
    X[column_name] = X[column_name].astype(str)

In [None]:
# Two-stage split: 67% train, then the remaining 33% is divided again
# (67% of it -> test, 33% of it -> valid). The other cells use the test part as
# CatBoost's eval_set and the valid part for scoring.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_holdout, y_holdout,
    test_size=0.33,
    random_state=42,
    shuffle=True,
)

# Model

In [None]:
def catboost_model_classifier(x_train, x_test, y_train, y_test):
    """Fit a baseline CatBoost classifier with early stopping.

    Args:
        x_train: Training features (DataFrame).
        x_test: Features of the evaluation set monitored for early stopping.
        y_train: Training labels.
        y_test: Labels of the evaluation set.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    # Read the categorical columns from the frame that is actually fitted instead of
    # the notebook-global `data`, so the function also works on column subsets and on
    # frames that were not derived from `data`.
    categorical_columns = x_train.select_dtypes(exclude=['float64', 'int64']).columns
    categorical_features_indices = get_column_indices(x_train, categorical_columns)

    cb_model = CatBoostClassifier(
        loss_function='Logloss',
        random_seed=42,
        logging_level='Silent',
        # custom_metric=['MAE', 'MAPE'],
        max_depth=8,
        iterations=200,
        # scale_pos_weight=26,
        auto_class_weights='Balanced',
        early_stopping_rounds=20,
        # eval_metric=[metrics.Precision(), metrics.Recall(), metrics.F1(), metrics.TotalF1(), metrics.Accuracy()]
    )

    cb_model.fit(
        x_train, y_train,
        eval_set=(x_test, y_test),
        cat_features=categorical_features_indices,
        plot=True
    )

    return cb_model


In [None]:
# Fit the baseline model on the train split; the test split is passed as the evaluation set.
model = catboost_model_classifier(X_train, X_test, y_train, y_test)

In [None]:
# Baseline model predictions for the validation split.
y_pred = model.predict(X_valid)

In [None]:
# ROC AUC is a ranking metric: score it on the positive-class probabilities.
# Hard class labels collapse the curve to a single threshold and understate the AUC.
roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])

In [None]:
# CatBoost's own feature importances as a table with feature names.
feat_importances = model.get_feature_importance(prettified=True)

# Bar chart of the rows labelled 0 through 30 of that table.
plt.figure(figsize=(12, 10))
sns.barplot(data=feat_importances.loc[:30], y="Feature Name", x="Importance")
plt.title('CatBoost features importance:')

In [None]:
# ROC curve of the baseline model on the validation split.
plot_roc_curve(model, X_valid, y_valid)

# Feature Selection

In [None]:
# Permutation importance of every feature on the test split: 10 shuffles per feature, 10 parallel jobs.
perm_raw = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, n_jobs=10)

In [None]:
# Mean and standard deviation of the permutation importances per feature,
# most important feature first.
perm = pd.DataFrame(
    {
        'AVG_Importance': perm_raw.importances_mean,
        'STD_Importance': np.std(perm_raw.importances, axis=1),
    },
    index=list(X_train.columns),
).sort_values(by='AVG_Importance', ascending=False)

In [None]:
# Average permutation importance per feature; x tick labels are hidden.
sns.barplot(y=perm['AVG_Importance'], x=perm.index)
plt.xticks([])
plt.show()

In [None]:
# Display the permutation-importance table.
perm

In [None]:
# Features whose average permutation importance is strictly positive.
perm.query("AVG_Importance > 0")["AVG_Importance"]

Keep only the features with a positive average permutation importance (22 out of 358, about 6.1%).

In [None]:
# Names of the features with a strictly positive average permutation importance.
perm.query("AVG_Importance > 0")["AVG_Importance"].index.to_list()

In [None]:
# Column positions (within X_train) of the features with a positive average permutation importance.
useful_column_indices = get_column_indices(X_train, perm.query("AVG_Importance > 0")["AVG_Importance"].index.to_list())

In [None]:
# Refit the baseline configuration on the selected feature columns only.
model_without_noise_feat = catboost_model_classifier(X_train.iloc[:, useful_column_indices],
                                                     X_test.iloc[:, useful_column_indices], y_train, y_test)

In [None]:
y_pred = model_without_noise_feat.predict(X_valid.iloc[:, useful_column_indices])
# ROC AUC is a ranking metric: score it on the positive-class probabilities rather
# than on the hard labels held in `y_pred`.
roc_auc_score(y_valid, model_without_noise_feat.predict_proba(X_valid.iloc[:, useful_column_indices])[:, 1])

In [None]:
# ROC curve of the reduced-feature model on the validation split.
plot_roc_curve(model_without_noise_feat, X_valid.iloc[:, useful_column_indices], y_valid)

# Hyperparameters Tuning

In [None]:
def objective(search_space):
    """Hyperopt objective: fit CatBoost with the sampled parameters and report its loss.

    The model is trained on the notebook-level ``X_train``/``y_train`` with
    ``X_test``/``y_test`` as the evaluation set.

    Args:
        search_space: Sampled hyperparameters, forwarded to ``CatBoostClassifier``.

    Returns:
        Dict with the best evaluation-set Logloss under ``'loss'`` and
        ``STATUS_OK`` under ``'status'``.
    """
    cat_column_names = X_train.select_dtypes(exclude=['float64', 'int64']).columns
    cat_positions = get_column_indices(X_train, cat_column_names)

    classifier = CatBoostClassifier(
        random_seed=42,
        early_stopping_rounds=20,
        auto_class_weights='Balanced',
        loss_function='Logloss',
        **search_space,
    )
    classifier.fit(
        X_train, y_train,
        cat_features=cat_positions,
        eval_set=(X_test, y_test),
        verbose=False,
        plot=True,
    )

    best_logloss = classifier.get_best_score()['validation']['Logloss']
    return {'loss': best_logloss, 'status': STATUS_OK}

In [None]:
# Hyperparameter ranges sampled by hyperopt for the CatBoost model.
search_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.1, 0.5),
    iterations=hp.randint('iterations', 100, 1000),
    l2_leaf_reg=hp.randint('l2_leaf_reg', 1, 10),
    depth=hp.randint('depth', 4, 10),
    # border_count=hp.uniform('border_count', 32, 255),
)

In [None]:
# Minimise `objective` over `search_space` with the TPE algorithm, 100 evaluations.
# NOTE(review): no `rstate` is passed, so the search is presumably not reproducible
# from run to run — confirm against the installed hyperopt version.
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=100,
    # verbose=False,
)

In [None]:
# Translate the raw fmin result into concrete parameter values of the search space.
hyperparams = space_eval(search_space, best_params)

In [None]:
# Persist the tuned hyperparameters next to the notebook.
with open('hyperparams.pkl', 'wb') as fp:
    fp.write(pickle.dumps(hyperparams))

In [None]:
# Tuned values taken from the search result, plus the fixed training settings.
# NOTE(review): unlike the tuning objective, no `auto_class_weights` is set here — confirm this is intended.
tuned_names = ('learning_rate', 'iterations', 'depth', 'l2_leaf_reg')
params = {name: hyperparams[name] for name in tuned_names}
params.update(
    loss_function='Logloss',
    eval_metric='Logloss',
    early_stopping_rounds=20,
    random_seed=42,
)

In [None]:
# Columns that are neither float64 nor int64 are handed to CatBoost as categorical features.
categorical_columns = X_train.select_dtypes(exclude=['float64', 'int64']).columns
categorical_features_indices = get_column_indices(X_train, categorical_columns)
model = CatBoostClassifier(**params)
# Early stopping monitors the test split, as in the baseline and the tuning objective.
# The validation split scored below therefore plays no part in training.
model.fit(X=X_train, y=y_train, eval_set=(X_test, y_test), verbose=250, cat_features=categorical_features_indices)
y_pred = model.predict(X_valid)
# ROC AUC is a ranking metric: score it on the positive-class probabilities rather
# than on the hard labels held in `y_pred`.
roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])

# Cross-Validation

In [None]:
from catboost import cv

# Training parameters used inside cv.
# Note: this rebinds `params`, which earlier held the settings of the tuned model.
params = dict(
    loss_function='Logloss',
    iterations=100,
    custom_loss='AUC',
    learning_rate=0.5,
)

In [None]:
def print_cv_summary(cv_data):
    """Print the best mean validation Logloss of a cross-validation run.

    Args:
        cv_data: DataFrame with ``test-Logloss-mean`` and ``test-Logloss-std``
            columns, one row per boosting step.
    """
    # The former `cv_data.head(10)` statement was dropped: inside a function its
    # result was discarded, so it never displayed anything.
    mean_scores = cv_data['test-Logloss-mean']
    best_value = mean_scores.min()
    best_iter = mean_scores.values.argmin()

    # `best_iter` is a position, so look the std up positionally; label-based
    # indexing only worked while the frame had a default 0..n-1 index.
    best_std = cv_data['test-Logloss-std'].iloc[best_iter]

    print('Best validation Logloss score : {:.4f}±{:.4f} on step {}'.format(
        best_value,
        best_std,
        best_iter)
    )


In [None]:
# CatBoost pools for the train and validation splits, with the categorical columns marked by position.
# NOTE(review): `val_pool` is not referenced by the cells visible here — confirm it is used further down.
train_pool = Pool(data=X_train, label=y_train, cat_features=get_column_indices(X_train, categorical_columns))
val_pool = Pool(data=X_valid, label=y_valid, cat_features=get_column_indices(X_valid, categorical_columns))

In [None]:
# 5-fold, shuffled, non-stratified cross-validation on the training pool with the cv `params`.
cv_data = cv(
    params=params,
    pool=train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=False,
    verbose=False
)

# Report the step with the lowest mean validation Logloss.
print_cv_summary(cv_data)

In [None]:
# Display the per-step cross-validation results.
cv_data

# Validation