In [4]:
import json
# import ast

from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from catboost import CatBoostClassifier, Pool, metrics, EFeaturesSelectionAlgorithm
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
# from transformers import AutoTokenizer
# import torch
import shap

from pandarallel import pandarallel
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker  # to manipulate x-tickers
import plotly.express as px
from Levenshtein import distance as lev_distance

import os
from tqdm import tqdm
import random
from collections import Counter
import pickle
import warnings

from IPython.core.interactiveshell import InteractiveShell

from samolet_parking_lot.modules.cv import *
from samolet_parking_lot.modules.feature_engineering import *
# from samolet_parking_lot.modules.features_selection import *
from samolet_parking_lot.modules.hyperparam_tuning import *
from samolet_parking_lot.modules.model import *
from samolet_parking_lot.modules.utils import *

# --- Notebook-wide configuration -------------------------------------------
# Silence every warning category for the whole session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
warnings.filterwarnings('ignore')

# Default plot look: large white-background figures with the viridis palette.
sns.set(rc={'figure.figsize': (20, 10), 'figure.facecolor': 'white'})
sns.set_palette("viridis")

# Display options: never truncate cell contents, show up to 500 rows/columns.
# `None` is the supported "no limit" value for max_colwidth; the old `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

os.environ["TOKENIZERS_PARALLELISM"] = "true"  # activate parallelism
pandarallel.initialize(progress_bar=True)
InteractiveShell.ast_node_interactivity = "all"  # show all outputs, not only the last

# Presumably fixes the random seeds for reproducibility; the helper is not
# defined in this notebook (it arrives via one of the star imports) — TODO confirm.
seed_everything()

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# 1st Stage. Get Data

In [None]:
# Raw training table, read from a path relative to the notebook's directory.
TRAIN_CSV_PATH = "../data/raw/train_dataset_Самолет.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Run the project's date feature-engineering helper over the raw table.
# NOTE(review): `data` is rebound to the helper's output, so re-running this
# cell feeds already-transformed data back in — restart the kernel to redo it.
data = data.pipe(create_date_features)

In [None]:
# Columns that must not be used as model inputs: the label itself plus two
# further columns (presumably a client identifier and a report date — confirm).
NON_FEATURE_COLUMNS = ['target', 'client_id', 'report_date']

Y = data['target']                          # label vector
X = data.drop(columns=NON_FEATURE_COLUMNS)  # feature matrix

In [None]:
# Partition the feature columns by dtype: float64/int64 are treated as
# numerical, every other dtype as categorical.
# NOTE(review): columns with other numeric widths (e.g. int32, float32) or
# bool dtype end up in the categorical group — confirm this is intended.
NUMERIC_DTYPES = ['float64', 'int64']

numerical_columns = X.select_dtypes(include=NUMERIC_DTYPES).columns
categorical_columns = X.select_dtypes(exclude=NUMERIC_DTYPES).columns

In [None]:
# Build the replacement values first, then write them back into X.
numeric_filled = X.loc[:, numerical_columns].fillna(0)           # missing numbers -> 0
categorical_as_str = X.loc[:, categorical_columns].astype(str)   # everything else -> str

X[numerical_columns] = numeric_filled
X[categorical_columns] = categorical_as_str

In [None]:
# Two-step shuffled split with the same settings at both steps:
#   1) hold out 33% of all rows,
#   2) cut 33% of that hold-out off as the test set; the rest is validation.
# That is roughly 67% train / 22% valid / 11% test.
SPLIT_KWARGS = dict(test_size=0.33, random_state=42, shuffle=True)

X_train, X_holdout, y_train, y_holdout = train_test_split(X, Y, **SPLIT_KWARGS)
X_valid, X_test, y_valid, y_test = train_test_split(X_holdout, y_holdout, **SPLIT_KWARGS)