In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit

from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("../datasets/raw_data/dataset.csv")
target = df['target']
# df = pd.DataFrame(df.iloc[:,(df.count(0)/df.shape[0] > 0).values])
df['report_date'] = pd.to_datetime(df['report_date'])
df = df.sort_values('report_date')
df = df.drop(columns=['report_date','col1454'])

cols = []
for i in tqdm(range(3, df.shape[1])):
    ds = df.iloc[:,i]
    dtype = df.dtypes[i]

    if dtype!=np.object0:
        cols.append(ds)
    else:
        ds = ds.astype(str)
        cols.append(pd.Series)()
        dsohe = OneHotEncoder(sparse_output=False).fit_transform(np.array(ds.values.reshape((-1,1)), np.object0))
        for j in range(dsohe.shape[1]):
            cols.append(pd.Series(dsohe[:,j], name=ds.name+'_'+str(j)))
dfc = pd.DataFrame(cols).T.fillna(0)
dfc.shape

100%|██████████| 2661/2661 [00:02<00:00, 1207.05it/s]


(14456, 12203)

In [3]:
X = MinMaxScaler().fit_transform(dfc)

In [92]:
LAMBDA = 25
# Month cross validation
rocs = []
t = tqdm(KFold(5).split(X), total=5)
for tr, vl in t:
    X_tr, X_vl = X[tr], X[vl]
    y_tr, y_vl = target[tr], target[vl]

    X_tr = np.column_stack((y_tr, X_tr))
    X_vl = np.column_stack((np.zeros_like(y_vl), X_vl))
    
    gram = X_tr.T @ X_tr
    diag_indices = np.diag_indices(gram.shape[0])
    presicion = np.linalg.inv(gram + LAMBDA*np.eye(gram.shape[0]))
    B = presicion / (-np.diag(presicion))
    B[diag_indices] = 0

    pred_tr, pred_vl = X_tr@B[:,0], X_vl@B[:,0]
    # # заполнить таргет предсказанием, чтобы он в фичи не утекал
    # X_tr[:,0], X_vl[:,0] = pred_tr, pred_vl
    # # инпут значений для дальнейших моделей
    # X_tr, X_vl = X_tr@B, X_vl@B

    roc = roc_auc_score(y_vl, pred_vl)
    rocs.append(roc)
    t.set_postfix(roc_auc=roc)
print(f"${np.mean(rocs):.4f} \pm {np.std(rocs):.4f}$")

100%|██████████| 10/10 [00:06<00:00,  1.44it/s, roc_auc=0.496]

$0.4777 \pm 0.0199$





In [4]:
# client validation
LAMBDA = 25
m = df['client_id']<4000
X_tr, X_vl = X[m], X[~m]
y_tr, y_vl = target[m], target[~m]

X_tr = np.column_stack((y_tr, X_tr))
X_vl = np.column_stack((np.zeros_like(y_vl), X_vl))
gram = X_tr.T @ X_tr
diag_indices = np.diag_indices(gram.shape[0])
presicion = np.linalg.inv(gram + LAMBDA*np.eye(gram.shape[0]))
B = presicion / (-np.diag(presicion))
B[diag_indices] = 0
pred_tr, pred_vl = X_tr@B[:,0], X_vl@B[:,0]

print(f"{roc_auc_score(y_tr, pred_tr):.4f}")
print(f"{roc_auc_score(y_vl, pred_vl):.4f}")

0.7421
0.5239


# Scores

client split

min_not_nan = 0%
| model | 10kfold |
|---|---|
| ease25   | 0.5239 |
| knn      | 0.4999 |
| catboost | 0.5000 |
| linear   | 0.4996 |

min_not_nan = 90%
| model | 10kfold |
|---|---|
| ease25   | 0.5049 |
| knn      | 0.5000 |
| catboost | 0.5000 |
| linear   | 0.5000 |

min_not_nan = 10%
| $\lambda$ | 20random | 20kfold |
|---|---|---|
| 1e-3  | $0.9669 \pm 0.0312$ | $ $ |
| 1e-1  | $0.9731 \pm 0.0258$ | $0.7097 \pm 0.0723$ |
| 5e-1  | $0.9765 \pm 0.0229$ | $0.7193 \pm 0.0686$ |
| 1     | $0.9757 \pm 0.0224$ | $0.7269 \pm 0.0673$ |
| 2     | $0.9733 \pm 0.0216$ | $0.7364 \pm 0.0671$ |
| 10    | $0.9622 \pm 0.0204$ | $0.7628 \pm 0.0681$ |
| 25    | $ $ | $0.7721 \pm 0.0694$ |
| 50    | $ $ | $0.7713 \pm 0.0728$ |

Models

min_not_nan = 10%
| model | 10kfold |
|---|---|
| ease25   | $0.7664 \pm 0.0434$ |
| knn      | $0.5079 \pm 0.0140$ |
| catboost | $0.5164 \pm 0.0098$ |
| linear   | $0.5077 \pm 0.0169$ |

min_not_nan = 90%
| model | 10kfold |
|---|---|
| ease25   | $0.7429 \pm 0.0458$ |
| knn      | $0.5126 \pm 0.0090$ |
| catboost | $0.5060 \pm 0.0096$ |
| linear   | $0.5008 \pm 0.0026$ |