In [1]:
import multiprocessing 
import numpy as np
from multiprocessing import Pool
from functools import partial
import random
import time
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn import metrics
from scipy import optimize
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier, Pool

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [63]:
predicts_conv_ensemble_train = pd.read_csv("data/gb_net_predict_train_v3.csv")
predicts_conv_ensemble_test = pd.read_csv("data/gb_net_predict_test_v3.csv")

predicts_y_cut_unet_train = pd.read_csv("data/unet_preds_train.csv")
predicts_y_cut_unet_test = pd.read_csv("data/unet_preds_test.csv")
predicts_y_cut_unet_train = predicts_y_cut_unet_train[predicts_y_cut_unet_train["id"]!=132]

In [58]:
metrics_names  = ["metric_"+str(i) for i in range(16)]+["dummy_net_predict_continious"]+["gb_net_predict_continious"]

In [64]:
X_train = predicts_y_cut_unet_train["cut_prob_new_unet"].values
X_train = X_train.reshape((X_train.shape[0],1))
X_train = np.concatenate([X_train, predicts_conv_ensemble_train[metrics_names].values],axis=1)
Y_train = predicts_y_cut_unet_train["y"].values

X_test = predicts_y_cut_unet_test["cut_prob_new_unet"].values
X_test = X_test.reshape((X_test.shape[0],1))
X_test = np.concatenate([X_test, predicts_conv_ensemble_test[metrics_names].values],axis=1)
Y_test = predicts_y_cut_unet_test["y"].values

array([[-6.48270416e+00,  7.30153846e+02,  7.92000000e+02, ...,
         3.74991587e-01, -1.33071356e-02,  2.45406196e-01],
       [-6.45224929e+00,  1.40215385e+03,  1.58400000e+03, ...,
         8.76248389e-01, -2.41703466e-02,  9.96645615e-02],
       [-8.17804257e+00,  2.13846154e+03,  2.38800000e+03, ...,
         1.30859081e+00, -3.03453915e-02,  4.67551462e-02],
       ...,
       [-5.16201615e-01,  1.62461538e+03,  1.66800000e+03, ...,
         9.97543790e-01,  5.85495412e-01,  9.59705055e-01],
       [-4.90561090e-02,  1.11138462e+03,  1.11200000e+03, ...,
         5.63186347e-01,  3.09158653e-01,  6.40187085e-01],
       [ 0.00000000e+00,  5.55384615e+02,  5.56000000e+02, ...,
         2.81377141e-01,  0.00000000e+00,  1.87725589e-01]])

In [36]:
clf = CatBoostClassifier()

params = {'iterations': [500, 1000],
          'depth': [4, 5, 6, 10],
          'loss_function': ['Logloss', 'CrossEntropy', 'RMSE'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'logging_level':['Silent'],
          'random_seed': [42],
         }

scorer = make_scorer(f1_score)
clf_grid = GridSearchCV(estimator=clf, param_grid=params, scoring=scorer, cv=5, n_jobs=40, verbose=1)


In [37]:
clf_grid.fit(X_train, Y_train)
best_param = clf_grid.best_params_
best_param

Fitting 5 folds for each of 72 candidates, totalling 360 fits


{'depth': 6,
 'iterations': 500,
 'l2_leaf_reg': 1e-19,
 'leaf_estimation_iterations': 10,
 'logging_level': 'Silent',
 'loss_function': 'Logloss',
 'random_seed': 42}

In [38]:
model = CatBoostClassifier(iterations=1000,
                           loss_function=best_param['loss_function'],
                           depth=best_param['depth'],
                           l2_leaf_reg=best_param['l2_leaf_reg'],
                           eval_metric='F1',
                           leaf_estimation_iterations=10,
                           use_best_model=True,
                           logging_level='Silent',
                           random_seed=42
                          )

In [39]:
xtrain, xval, ytrain, yval = train_test_split(X_train, Y_train,train_size=0.8,random_state=42)
train_pool = Pool(xtrain, ytrain)
model.fit(train_pool, eval_set=(xval,yval))

In [61]:
y_train_pred = model.predict(X_train)
f1_score(y_train_pred, Y_train)

0.9437125748502994

In [65]:
y_test_pred = model.predict(X_test)
f1_score(y_test_pred, Y_test)

0.8942875902823374

In [44]:
real_test = pd.read_csv("data/test_final/gb_net_predict_test_final_v3.csv")
real_test_unet = pd.read_csv("data/test_final/test_FINAL_CLEAR_filtered_unet_new.csv")

In [45]:
X_test = real_test_unet["cut_prob_new_unet"].values
X_test = X_test.reshape((X_test.shape[0],1))
X_test = np.concatenate([X_test, real_test[metrics_names].values],axis=1)
y_test_pred = model.predict(X_test)
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [48]:
answer_submision = {}
answer_submision["id"] = real_test["original_id"]
answer_submision["time"] = real_test["time"]
answer_submision["y"] = y_test_pred
pd.DataFrame(answer_submision).to_csv("data/test_final/our_sub_andrey_getPredFilter_catBoost_v10.csv")
pd.DataFrame(answer_submision)

Unnamed: 0,id,time,y
0,81,0,0
1,81,568,0
2,81,1140,0
3,81,1716,0
4,81,2284,0
5,81,2860,0
6,81,3420,0
7,81,3992,0
8,81,4564,0
9,81,5132,0


In [68]:
model.save_model("data/test_final/catboost_weights")