In [8]:
import json
# import ast

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, metrics
# from transformers import AutoTokenizer
# import torch
import shap

from pandarallel import pandarallel
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker  # to manipulate x-tickers
import plotly.express as px
from Levenshtein import distance as lev_distance

import os
from tqdm import tqdm
from collections import Counter
import pickle
import warnings

from IPython.core.interactiveshell import InteractiveShell

warnings.filterwarnings('ignore')
sns.set(rc={'figure.figsize': (20, 10), 'figure.facecolor': 'white'})
sns.set_palette("viridis")
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
os.environ["TOKENIZERS_PARALLELISM"] = "true"  # activate parallelism
pandarallel.initialize(progress_bar=True)
InteractiveShell.ast_node_interactivity = "all"  # show all outputs, not only the last

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
data = pd.read_csv("../data/raw/train_dataset_Самолет.csv")

# Feature Engineering 

In [3]:
def create_features(data):
    data['day'] = pd.to_datetime(data['report_date']).dt.day
    data['week'] = pd.to_datetime(data['report_date']).dt.week
    data['weekday'] = pd.to_datetime(data['report_date']).dt.weekday
    data['month'] = pd.to_datetime(data['report_date']).dt.month
    data['year'] = pd.to_datetime(data['report_date']).dt.year
    return data

In [4]:
data = create_features(data)

In [5]:
categorical_columns = data.select_dtypes(exclude=['float64', 'int64']).columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns

In [13]:
X = data.drop(columns=['target'])
Y = data['target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42, shuffle=True)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.33, random_state=42, shuffle=True)

# Model

In [15]:
def catboost_model_classifier(x_train, x_test, y_train, y_test):

    # categorical_features_indices = np.where(x_train.dtypes != np.float)[0]
    categorical_features_indices = np.where(X_train.dtypes not in ['float64', 'int64'])[0]  # X_train.select_dtypes('category').columns.to_list()

    cb_model = CatBoostClassifier(
        # custom_loss=metrics.Accuracy(),
        loss_function='Logloss',
        random_seed=42,
        logging_level='Silent',
        # custom_metric=['MAE', 'MAPE'],
        max_depth=8,
        iterations=200,
        # scale_pos_weight=26,
        auto_class_weights='Balanced',
        # eval_metric=[metrics.Precision(), metrics.Recall(), metrics.F1(), metrics.TotalF1(), metrics.Accuracy()]
    )

    cb_model.fit(
        x_train, y_train,
        eval_set=(x_test, y_test),
        cat_features=categorical_features_indices,
        plot=True
    )

    return cb_model



In [16]:
catboost_model_classifier(X_train, X_test, y_train, y_test)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

# Feature Selection

# Hyperparameters Tuning

# Validation