In [38]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb

In [39]:
seed = 322

## Обработка датасета


In [40]:
df_path = 'datasets/FIIT/train.csv'
df = pd.read_csv(df_path).drop(columns=['ID'])

In [41]:
test_path = 'datasets/FIIT/data_predict.csv'
test_df = pd.read_csv(test_path)
test_df_id = test_df['ID']
test_df = test_df.drop(columns=['ID'])

In [42]:
df = df.dropna(axis=1, how='all')
test_df = test_df.dropna(axis=1, how='all')

In [43]:
X = df.drop(columns=['Target'])
y = df['Target']
y = (y + 1) / 2

In [44]:
is_cat = (X.dtypes != float)
cat_features_index = np.where(is_cat)[0]

In [45]:
n_fold = 5

In [46]:
predictions_df = pd.DataFrame(index=X.index, columns=['cat', 'xgb', 'raf', 'logreg'])

## Catboost


In [47]:
params_cat = {'max_depth': 8,
              'learning_rate': 0.024986069141434246,
              'l2_leaf_reg': 2.964950034617931,
              "loss_function": "Logloss",
              'eval_metric': 'AUC',
              'border_count': 131,
              'random_seed': seed,
              'verbose': False}

In [48]:
skf_meta = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
for meta_train_index, meta_val_index in skf_meta.split(X, y):

    X_train_cat, X_val_cat, y_train, y_val = train_test_split(X.iloc[meta_train_index], y.iloc[meta_train_index], 
                                                          test_size=0.2, 
                                                          random_state=seed, 
                                                          stratify=y.iloc[meta_train_index])
    
    train_pool = Pool(data=X_train_cat, label=y_train, cat_features=cat_features_index)
    val_pool = Pool(data=X_val_cat, label=y_val, cat_features=cat_features_index)
    
    cat_clf = CatBoostClassifier(**params_cat)
    cat_clf.fit(X=train_pool, use_best_model=True, eval_set=val_pool)

    predictions_df.loc[meta_val_index, f"cat"] = cat_clf.predict_proba(X.iloc[meta_val_index])[:, 1]

## XGB

In [49]:
def one_hot(df: pd.DataFrame) -> pd.DataFrame:
    categorical_cols = [col for col in df.columns if df[col].dtypes == "object"]
    df_oh = df[categorical_cols]
    df_yam = pd.get_dummies(df_oh)
    df = df.drop(columns=categorical_cols)
    crg = pd.concat((df, df_yam), axis=1)
    return crg

In [50]:
X_oh = one_hot(pd.concat((X, test_df)).drop(columns=['Var200', 'Var214', 'Var217', 'Var202', 'Var199']))

In [51]:
X_xgb_train = X_oh[0: len(X)]
test_df_xgb = X_oh[len(X): len(X_oh)]

In [52]:
params_xgb = {'lambda': 1.141828782138995e-08,
              'alpha': 0.002754275892545665,
              'max_depth': 4,
              'eta': 0.12083699704693514,
              'gamma': 0.0012569895580936102,
              'grow_policy': 'lossguide',
              'subsample': 0.5564856736534843,
              'colsample_bytree': 0.8746681692717658,
              'min_child_weight': 3.0298238923658305,
              'eval_metric':'auc',
              'n_estimators': 367}

In [53]:
skf_meta = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
for meta_train_index, meta_val_index in skf_meta.split(X_xgb_train, y):

    xgb_clf = xgb.XGBClassifier(**params_xgb)
    xgb_clf.fit(X_xgb_train.iloc[meta_train_index], y.iloc[meta_train_index])

    predictions_df.loc[meta_val_index, "xgb"] = xgb_clf.predict_proba(X_xgb_train.iloc[meta_val_index])[:, 1]

## Logreg


In [54]:
params_lor = {'C': 0.2520029924372651,
              'penalty': 'l1',
              'solver': 'liblinear'}

In [55]:
X_train_lor = X_xgb_train.fillna(0)

In [56]:
skf_meta = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
for meta_train_index, meta_val_index in skf_meta.split(X_train_lor, y):
    
    logreg = LogisticRegression(**params_lor)
    logreg.fit(X_train_lor.iloc[meta_train_index], y.iloc[meta_train_index])

    predictions_df.loc[meta_val_index, "logreg"] = logreg.predict_proba(X_train_lor.iloc[meta_val_index])[:, 1]

## Random forest


In [57]:
params_raf = {'n_estimators': 455,
              'max_depth': 9,
              'min_samples_split': 22,
              'min_samples_leaf': 12,
              'n_jobs': -1,
              'max_features': None}

In [58]:
skf_meta = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
for meta_train_index, meta_val_index in skf_meta.split(X_xgb_train, y):
    
    raf_clf = RandomForestClassifier(**params_raf)
    raf_clf.fit(X_xgb_train.iloc[meta_train_index], y.iloc[meta_train_index])

    predictions_df.loc[meta_val_index, "raf"] = raf_clf.predict_proba(X_xgb_train.iloc[meta_val_index])[:, 1]

## Meta

In [59]:
predictions_df

Unnamed: 0,cat,xgb,raf,logreg
0,0.045642,0.089037,0.061077,0.051845
1,0.009586,0.00426,0.052357,0.023024
2,0.095133,0.113758,0.132444,0.135575
3,0.048412,0.041068,0.045368,0.072969
4,0.024331,0.02455,0.038033,0.046978
...,...,...,...,...
37495,0.037087,0.009292,0.03505,0.020433
37496,0.11378,0.099824,0.093723,0.07281
37497,0.310546,0.388174,0.408968,0.564814
37498,0.035005,0.01331,0.048879,0.013081


In [60]:
predictions_df.to_csv('pred_df.csv', index=False)

In [61]:
predictions_df = pd.read_csv('pred_df.csv')

In [62]:
X_meta_pred = predictions_df.copy()

In [63]:
X_meta = pd.concat((X, X_meta_pred), axis=1)

In [64]:
X_meta.to_csv('X_meta.csv', index=False)

In [75]:
X_train_meta, X_val_meta, y_train_meta, y_val_meta = train_test_split(predictions_df, y, 
                                                                      test_size=0.2, 
                                                                      random_state=seed,
                                                                      stratify=y)

In [77]:
train_pool_meta = Pool(data=X_train_meta, label=y_train_meta)
val_pool_meta = Pool(data=X_val_meta, label=y_val_meta)

In [76]:
train_pool_meta = Pool(data=X_train_meta, label=y_train_meta, cat_features=cat_features_index)
val_pool_meta = Pool(data=X_val_meta, label=y_val_meta, cat_features=cat_features_index)

CatBoostError: Invalid cat_features[0] = 174 value: index must be < 4.

In [78]:
params_meta={'learning_rate': 0.098731893751284,
        'max_depth': 5,
        'eval_metric': 'AUC',
        'l2_leaf_reg': 1.6133519649344217,
        'random_seed': seed,
        'border_count': 116,
        'verbose':100}  

In [79]:
meta = CatBoostClassifier(**params_meta)

In [81]:
meta.fit(X=train_pool_meta, use_best_model=True, eval_set=val_pool_meta)

0:	test: 0.7352282	best: 0.7352282 (0)	total: 10.2ms	remaining: 10.1s
100:	test: 0.7544453	best: 0.7568859 (51)	total: 257ms	remaining: 2.28s
200:	test: 0.7506409	best: 0.7568859 (51)	total: 453ms	remaining: 1.8s
300:	test: 0.7454365	best: 0.7568859 (51)	total: 652ms	remaining: 1.51s
400:	test: 0.7412729	best: 0.7568859 (51)	total: 850ms	remaining: 1.27s
500:	test: 0.7351244	best: 0.7568859 (51)	total: 1.05s	remaining: 1.04s
600:	test: 0.7305575	best: 0.7568859 (51)	total: 1.25s	remaining: 828ms
700:	test: 0.7288456	best: 0.7568859 (51)	total: 1.45s	remaining: 618ms
800:	test: 0.7230778	best: 0.7568859 (51)	total: 1.65s	remaining: 409ms
900:	test: 0.7200885	best: 0.7568859 (51)	total: 1.85s	remaining: 203ms
999:	test: 0.7176974	best: 0.7568859 (51)	total: 2.04s	remaining: 0us

bestTest = 0.7568858567
bestIteration = 51

Shrink model to first 52 iterations.


<catboost.core.CatBoostClassifier at 0x7f29949b54c0>

In [82]:
y_meta_df = pd.read_csv('datasets/FIIT/meta_test.csv')

In [83]:
y_meta_df

Unnamed: 0,cat,xgb,raf,logreg
0,0.208511,0.378842,0.336292,0.158717
1,0.018226,0.002930,0.037318,0.017034
2,0.081012,0.088498,0.071502,0.123384
3,0.032852,0.017137,0.017257,0.047786
4,0.073203,0.040300,0.116466,0.069988
...,...,...,...,...
12495,0.033904,0.026300,0.055548,0.058058
12496,0.012203,0.039502,0.051092,0.040975
12497,0.056898,0.032016,0.026200,0.061954
12498,0.073912,0.069518,0.063414,0.076370


In [84]:
y_pred_meta = meta.predict_proba(y_meta_df)[:, 1]
y_pred_meta

array([0.27046025, 0.02138721, 0.0974634 , ..., 0.04400746, 0.07838631,
       0.01997238])

In [85]:
submission = pd.DataFrame({'ID': test_df_id.to_list(), 'Target': y_pred_meta})
submission.to_csv('submission.csv', index=False)