<img src='otus.png'>

# Применение ансамблей моделей 

https://statweb.stanford.edu/~jhf/ftp/trebst.pdf  
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3885826/  
Интерактив - http://arogozhnikov.github.io/2016/07/05/gradient_boosting_playground.html

http://xgboost.readthedocs.io/en/latest/  
http://xgboost.readthedocs.io/en/latest/model.html  
https://lightgbm.readthedocs.io/  
https://lightgbm.readthedocs.io/en/latest/    
https://tech.yandex.com/catboost/doc/dg/concepts/about-docpage/   
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gbm.html#defining-a-gbm-model  

In [24]:
import time
import re
# from __future__ import print_function
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer, LabelBinarizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Ансамбли


import xgboost as xgb
import lightgbm as lgb

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [25]:
df_train = pd.read_csv("train.csv")
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.46,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.86,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.07,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.13,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.07,,C


In [26]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [27]:
# move target to the right
survived = df_train['Survived']
df_train.drop(labels=['Survived'], axis=1, inplace=True)
df_train['Survived'] = survived

In [28]:
class LabelEncoderPipelineFriendly(LabelEncoder):
    
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelEncoderPipelineFriendly, self).fit(X)
        
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)
    

class FeaturesSum(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return np.sum(X, axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    

class AgeFeature(BaseEstimator, TransformerMixin):
    # works with df only
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        # sex, name
        X['Initial'] = 0
        for i in X:
            X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
    
        X['Initial'].replace(
            ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
            ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],
            inplace=True
        )
        X.groupby('Initial')['Age'].mean() # lets check the average age by Initials

        ## Assigning the NaN Values with the Ceil values of the mean ages
        X.loc[(X.Age.isnull()) & (X.Initial=='Mr'), 'Age'] = 33
        X.loc[(X.Age.isnull()) & (X.Initial=='Mrs'), 'Age'] = 36
        X.loc[(X.Age.isnull()) & (X.Initial=='Master'), 'Age'] = 5
        X.loc[(X.Age.isnull()) & (X.Initial=='Miss'), 'Age'] = 22
        X.loc[(X.Age.isnull()) & (X.Initial=='Other'), 'Age'] = 46
        return X['Age'].as_matrix().reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [29]:
import warnings
warnings.warn("deprecated", category=DeprecationWarning)
def get_sex_col(df):
    return df[['Sex']]

def get_age_name_cols(df):
    return df[['Age', 'Name']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_sum_cols(df):
    return df[['Age', 'Fare']]

def get_num_cols(df):
    return df[['Fare', 'SibSp', 'Parch']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False),  OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False),  LabelEncoderPipelineFriendly()),
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_age_name_cols, validate=False),  AgeFeature()),
])

  


In [30]:
x_train = vec.fit_transform(df_train)
x_train.shape

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

(891, 8)

In [31]:
x_train

array([[ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.        , 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.125     ,
         0.        , 38.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 26.        ],
       ...,
       [ 0.        ,  0.        ,  1.        , ...,  0.125     ,
         0.33333333, 22.        ],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 26.        ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
         0.        , 32.        ]])

In [32]:
y_train = df_train['Survived']
y_train.shape

(891,)

In [33]:
lr = LogisticRegressionCV(cv=10)
lr.fit(x_train, y_train)
lr



LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [34]:
accuracy_score(y_train, lr.predict(x_train))

0.8035914702581369

# Применение модели

In [35]:
def apply_model(model, submission_name):
    x_test = vec.fit_transform(df_test) 
    print('shape of x_test is {}'.format(x_test.shape))
    y_test = model.predict(x_test)
    print('shape of y_test is {}'.format(y_test.shape))
    df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
    df_predicted.to_csv(submission_name + '.csv', sep=',', index=False)

In [36]:
apply_model(lr, 'linear_regression_cv')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

shape of x_test is (418, 8)
shape of y_test is (418,)


# Обучение ансамблей

In [59]:
def randomized_cv(model, param_grid, x_train=x_train, y_train=y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    print(grid_search.best_score_)
    return grid_search.best_estimator_

# XGBoost

In [38]:
import xgboost as xgb

param_grid = {
    'max_depth': [2, 3, 4],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.025]
}
xgb = randomized_cv(xgb.XGBClassifier(), param_grid)

model XGBClassifier best accuracy score is 0.8237934904601572
time for training is 2.901031255722046 seconds
0.8237934904601572


https://www.kaggle.com/cbrogan/xgboost-example-python

In [39]:
apply_model(xgb, 'xgb_cv')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

shape of x_test is (418, 8)
shape of y_test is (418,)


Особенности XGBoost
* есть регуляризация
* распараллеливание
* возможность кастомизации
* обработка отсуствующих значений
* встроенная кросс-валидация
* возможность архивировать и восстанавливать модель

# LightGBM 

In [40]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05]
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid)

model LGBMClassifier best accuracy score is 0.8294051627384961
time for training is 1.9023396968841553 seconds
0.8294051627384961


In [41]:
apply_model(model, 'lgb_cv')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try usi

shape of x_test is (418, 8)
shape of y_test is (418,)


Особенности

* использование гистограмм для всех признаков (уже тоже есть в xgboost)
* то же самое, но быстрее (см выше)

# H2O GBM 

In [42]:
import h2o
import numpy as np
import math
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init(nthreads=-1, strict_version_check=True)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_151"; Java(TM) SE Runtime Environment (build 1.8.0_151-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.151-b12, mixed mode)
  Starting server from /Users/nazhmeddinbabakhanov/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/j0/tdm6bt912n17kg911htf1_wh0000gn/T/tmpc58psycq
  JVM stdout: /var/folders/j0/tdm6bt912n17kg911htf1_wh0000gn/T/tmpc58psycq/h2o_nazhmeddinbabakhanov_started_from_python.out
  JVM stderr: /var/folders/j0/tdm6bt912n17kg911htf1_wh0000gn/T/tmpc58psycq/h2o_nazhmeddinbabakhanov_started_from_python.err
  Server is running at http://127.0.0.1:54331
Connecting to H2O server at http://127.0.0.1:54331...................... failed.


H2OConnectionError: Could not establish link to the H2O cloud http://127.0.0.1:54331 after 20 retries
[55:23.04] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665550>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:26.25] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665588>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:29.46] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2264f710>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:32.68] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2264ff60>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:35.89] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2260ac50>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:39.10] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665320>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:42.32] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665898>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:45.53] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2264fb38>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:48.75] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2264fe10>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:51.96] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1061deb70>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:55.18] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2260a2e8>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[55:58.40] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a21ce8128>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:01.62] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a147c60f0>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:04.84] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2264f390>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:08.05] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22608f28>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:11.27] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a2260aac8>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:14.48] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665278>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:17.69] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a21ce8198>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:20.91] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x1a22665518>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))
[56:24.13] H2OConnectionError: Unexpected HTTP error: HTTPConnectionPool(host='192.168.49.1', port=8081): Max retries exceeded with url: http://127.0.0.1:54331/3/Cloud (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x106fdc908>, 'Connection to 192.168.49.1 timed out. (connect timeout=3.0)'))

In [None]:
data = np.c_[x_train, y_train]
data = pd.DataFrame(data, columns=['C{}'.format(idx + 1) for idx in range(data.shape[-1])])

train_df_h2o = h2o.H2OFrame(python_obj=data)
train_df_h2o['C9'] = train_df_h2o['C9'].asfactor()

train_df_h2o.show()

In [None]:
print(train_df_h2o['C9'].shape)

In [None]:
x_test = vec.fit_transform(df_test)
data_test = pd.DataFrame(x_test, columns=['C{}'.format(idx + 1) for idx in range(x_test.shape[-1])])

test_df_h2o = h2o.H2OFrame(python_obj=data_test)
test_df_h2o.show()

In [None]:
data.info()

In [None]:
gbm = H2OGradientBoostingEstimator()
gbm.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)
print(gbm)

In [None]:
['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)]

In [None]:
## Depth 10 is usually plenty of depth for most datasets, but you never know
hyper_params = {'max_depth': [3, 5, 7, 9, 12]}
# hyper_params = {'max_depth' : [4,6,8,12,16,20]} ##faster for larger datasets

#Build initial GBM Model
gbm_grid = H2OGradientBoostingEstimator(
        ## more trees is better if the learning rate is small enough 
        ## here, use "more than enough" trees - we have early stopping
        ntrees=10000,
        ## smaller learning rate is better
        ## since we have learning_rate_annealing, we can afford to start with a 
        #bigger learning rate
        learn_rate=0.05,
        ## learning rate annealing: learning_rate shrinks by 1% after every tree 
        ## (use 1.00 to disable, but then lower the learning_rate)
        learn_rate_annealing = 0.99,
        ## sample 80% of rows per tree
        sample_rate = 0.8,
        ## sample 80% of columns per split
        col_sample_rate = 0.8,
        ## fix a random number generator seed for reproducibility
        seed = 1234,
        score_each_iteration=True,
        ## score every 10 trees to make early stopping reproducible 
        #(it depends on the scoring interval)
        score_tree_interval = 10, 
        ## early stopping once the validation AUC doesn't improve by at least 0.01% for 
        #5 consecutive scoring events
         stopping_rounds = 5,
#          stopping_metric = "auc",
         stopping_tolerance = 1e-4
)

#Build grid search with previously made GBM and hyper parameters
grid = H2OGridSearch(gbm_grid, hyper_params,
#                          grid_id = 'depth_grid',
                         search_criteria = {'strategy': "RandomDiscrete"})

In [None]:
#Train grid search
grid.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)

In [None]:
print(grid)

In [None]:
best_model = h2o.get_model(grid.sorted_metric_table()['model_ids'][0])
best_model

In [None]:
# 

In [None]:
preds = best_model.predict(test_df_h2o)
preds.head()

In [None]:
pred_df = preds.as_data_frame()

submit = pd.DataFrame()
submit['PassengerId'] = df_test['PassengerId']
submit['Survived'] = pred_df['predict']
submit.to_csv('h2o.csv', sep=',', index=False)

# CatBoost

https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/  
https://arxiv.org/pdf/1706.09516.pdf  


In [None]:
from catboost import CatBoostClassifier
param_grid = {
    'iterations': [2, 3, 4, 5],
    'depth': [2, 3, 4, 5],
    'learning_rate': [1, 0.1, 0.01, 0.001]
}
cbm = randomized_cv(CatBoostClassifier(), param_grid)

Параметры модели

https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/

Настройка параметров

https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/

Особенности

* уменьшено (?) переобучение
* умеет обрабатывать категориальные признаки
* большое количество визуализаций
* работает лучше по бенчмаркам (но дольше)

# Подбор гиперпараметров. Общий подход.

* выбрать относительно высокий learning_rate (например, 0.05 - 0.2)
* определить необходимое количество деревьев для исключения проблема недообучения и переобучения - поставить побольше и выбрать такое, где ошибка на валидации начинает расти
* зафиксировать параметры из предыдущих пунктов и настроить параметры, связанные с деревьями.
* зафиксировать параметры деревьев и дополнительно настроить learning_rate и количество деревьев

Основные параметры, связанные с бустингом

* learning_rate
* n_estimators
* subsample
* loss

Основные параметры, связанные с деревьями

* max_depth
* max_features
* min_samples_split
* min_samples_leaf
* max_leaf_nodes
* ...

# https://www.kaggle.com/c/home-credit-default-risk/data

In [43]:

train = "application_train.csv"
test = "application_test.csv"

In [44]:
df = pd.read_csv(train)

In [46]:
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
y = df['TARGET']
y

0         1
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        1
27        0
28        0
29        0
         ..
307481    1
307482    0
307483    0
307484    0
307485    0
307486    0
307487    0
307488    0
307489    1
307490    0
307491    0
307492    0
307493    0
307494    0
307495    0
307496    0
307497    0
307498    0
307499    0
307500    0
307501    0
307502    0
307503    0
307504    0
307505    0
307506    0
307507    0
307508    0
307509    1
307510    0
Name: TARGET, Length: 307511, dtype: int64

In [48]:
columns = list(df.columns.values)
columns.remove('SK_ID_CURR')
columns.remove('TARGET')

In [49]:
columns

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'OCCUPATION_TYPE',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'WEEKDAY_APPR_PROCESS_START',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'ORGANIZATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 

In [62]:
x = df[columns]

In [63]:
y.unique()

array([1, 0])

In [64]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.02,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.08,0.26,0.14,0.02,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.63,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,0.03,0.04,0.97,0.62,0.01,0.0,0.07,0.08,0.12,0.04,0.02,0.02,0.0,0.0,reg oper account,block of flats,0.01,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.0,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.31,0.62,,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.05,0.0,0.01,0.09,0.05,0.99,0.8,0.05,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.0,0.1,0.05,0.99,0.8,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.0,0.01,reg oper account,block of flats,0.07,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.01,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.56,0.73,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.01,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.03,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.32,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
categorical = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']

In [67]:
x

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,Cash loans,M,N,Y,0,202500.00,406597.50,24700.50,351000.00,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.02,-9461,-637,-3648.00,-2120,,1,1,0,1,1,0,Laborers,1.00,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.08,0.26,0.14,0.02,0.04,0.97,0.62,0.01,0.00,0.07,0.08,0.12,0.04,0.02,0.02,0.00,0.00,0.03,0.04,0.97,0.63,0.01,0.00,0.07,0.08,0.12,0.04,0.02,0.02,0.00,0.00,0.03,0.04,0.97,0.62,0.01,0.00,0.07,0.08,0.12,0.04,0.02,0.02,0.00,0.00,reg oper account,block of flats,0.01,"Stone, brick",No,2.00,2.00,2.00,2.00,-1134.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,1.00
1,Cash loans,F,N,N,0,270000.00,1293502.50,35698.50,1129500.00,Family,State servant,Higher education,Married,House / apartment,0.00,-16765,-1188,-1186.00,-291,,1,1,0,1,1,0,Core staff,2.00,1,1,MONDAY,11,0,0,0,0,0,0,School,0.31,0.62,,0.10,0.05,0.99,0.80,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.05,0.00,0.01,0.09,0.05,0.99,0.80,0.05,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.00,0.00,0.10,0.05,0.99,0.80,0.06,0.08,0.03,0.29,0.33,0.01,0.08,0.06,0.00,0.01,reg oper account,block of flats,0.07,Block,No,1.00,0.00,1.00,0.00,-828.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,0.00
2,Revolving loans,M,Y,Y,0,67500.00,135000.00,6750.00,135000.00,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.01,-19046,-225,-4260.00,-2531,26.00,1,1,1,1,1,0,Laborers,1.00,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.56,0.73,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.00,-815.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,0.00
3,Cash loans,F,N,Y,0,135000.00,312682.50,29686.50,297000.00,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.01,-19005,-3039,-9833.00,-2437,,1,1,0,1,0,0,Laborers,2.00,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.65,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.00,0.00,2.00,0.00,-617.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,Cash loans,M,N,Y,0,121500.00,513000.00,21865.50,513000.00,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.03,-19932,-3038,-4311.00,-3458,,1,1,0,1,0,0,Core staff,1.00,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.32,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.00,-1106.00,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,0.00
5,Cash loans,M,N,Y,0,99000.00,490495.50,27517.50,454500.00,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.04,-16941,-1588,-4970.00,-477,,1,1,1,1,1,0,Laborers,2.00,2,2,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.35,0.62,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.00,0.00,0.00,0.00,-2536.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,1.00,1.00
6,Cash loans,F,Y,Y,1,171000.00,1560726.00,41301.00,1395000.00,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.04,-13778,-3130,-1213.00,-619,17.00,1,1,0,1,1,0,Accountants,3.00,2,2,SUNDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.77,0.72,0.49,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.00,0.00,1.00,0.00,-1562.00,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.00,0.00,0.00,1.00,1.00,2.00
7,Cash loans,M,Y,Y,0,360000.00,1530000.00,42075.00,1530000.00,Unaccompanied,State servant,Higher education,Married,House / apartment,0.00,-18850,-449,-4597.00,-2379,8.00,1,1,1,1,0,0,Managers,2.00,3,3,MONDAY,16,0,0,0,0,1,1,Other,,0.71,0.54,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.00,0.00,2.00,0.00,-1070.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,0.00
8,Cash loans,F,N,Y,0,112500.00,1019610.00,33826.50,913500.00,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.02,-20099,365243,-7427.00,-3514,,1,0,0,1,0,0,,2.00,2,2,WEDNESDAY,14,0,0,0,0,0,0,XNA,0.59,0.21,0.75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.00,0.00,1.00,0.00,0.00,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00,1.00
9,Revolving loans,M,N,Y,0,135000.00,405000.00,20250.00,405000.00,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.02,-14469,-2019,-14437.00,-3992,,1,1,0,1,0,0,Laborers,1.00,2,2,THURSDAY,8,0,0,0,0,0,0,Electricity,,0.75,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.00,0.00,2.00,0.00,-1673.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


### baseline

In [68]:
from sklearn.linear_model import SGDClassifier
param_grid = {
    "penalty": ["none", "l2", "l1", "elasticnet"],
    'l1_ratio': [0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9],
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001]
}
sgd = randomized_cv(SGDClassifier(), param_grid, x, y)



ValueError: could not convert string to float: 'No'

### CatBoost

In [69]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(eval_metric='AUC')
#clf.fit(x, y, cat_features=[0,1,2,3, 9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

In [70]:
dfcb = pd.DataFrame(df)


In [71]:
for c in categorical:
    dfcb[c] = dfcb[c].astype(str)

In [93]:
x = dfcb[columns].value

AttributeError: 'DataFrame' object has no attribute 'value'

In [73]:
clf.fit(x, y, cat_features=[0,1,2,3, 9, 10, 11, 12, 13, 26, 30, 38, 84, 85, 87, 88])

Learning rate set to 0.08476
0:	total: 820ms	remaining: 13m 39s
1:	total: 1.53s	remaining: 12m 42s
2:	total: 2.34s	remaining: 12m 58s
3:	total: 3.2s	remaining: 13m 17s
4:	total: 3.91s	remaining: 12m 57s
5:	total: 4.68s	remaining: 12m 54s
6:	total: 5.36s	remaining: 12m 40s
7:	total: 5.83s	remaining: 12m 3s
8:	total: 6.6s	remaining: 12m 6s
9:	total: 7.44s	remaining: 12m 16s
10:	total: 8.21s	remaining: 12m 17s
11:	total: 9.02s	remaining: 12m 22s
12:	total: 9.78s	remaining: 12m 22s
13:	total: 10.5s	remaining: 12m 21s
14:	total: 11.3s	remaining: 12m 24s
15:	total: 12.2s	remaining: 12m 31s
16:	total: 13s	remaining: 12m 31s
17:	total: 13.8s	remaining: 12m 31s
18:	total: 14.5s	remaining: 12m 29s
19:	total: 15.3s	remaining: 12m 29s
20:	total: 16.1s	remaining: 12m 28s
21:	total: 16.9s	remaining: 12m 29s
22:	total: 17.6s	remaining: 12m 28s
23:	total: 18.4s	remaining: 12m 27s
24:	total: 19.1s	remaining: 12m 26s
25:	total: 19.9s	remaining: 12m 24s
26:	total: 20.6s	remaining: 12m 23s
27:	total: 21.3

222:	total: 4m 56s	remaining: 17m 12s
223:	total: 4m 57s	remaining: 17m 9s
224:	total: 4m 57s	remaining: 17m 6s
225:	total: 4m 58s	remaining: 17m 3s
226:	total: 4m 59s	remaining: 17m
227:	total: 5m	remaining: 16m 57s
228:	total: 5m 1s	remaining: 16m 54s
229:	total: 5m 2s	remaining: 16m 51s
230:	total: 5m 3s	remaining: 16m 48s
231:	total: 5m 3s	remaining: 16m 46s
232:	total: 5m 4s	remaining: 16m 43s
233:	total: 5m 5s	remaining: 16m 40s
234:	total: 5m 6s	remaining: 16m 37s
235:	total: 5m 7s	remaining: 16m 34s
236:	total: 5m 8s	remaining: 16m 32s
237:	total: 5m 8s	remaining: 16m 29s
238:	total: 5m 9s	remaining: 16m 26s
239:	total: 5m 10s	remaining: 16m 23s
240:	total: 5m 11s	remaining: 16m 20s
241:	total: 5m 12s	remaining: 16m 17s
242:	total: 5m 12s	remaining: 16m 14s
243:	total: 5m 13s	remaining: 16m 12s
244:	total: 5m 14s	remaining: 16m 10s
245:	total: 5m 15s	remaining: 16m 7s
246:	total: 5m 16s	remaining: 16m 5s
247:	total: 5m 17s	remaining: 16m 3s
248:	total: 5m 18s	remaining: 16m
249

441:	total: 9m	remaining: 11m 21s
442:	total: 9m 1s	remaining: 11m 20s
443:	total: 9m 2s	remaining: 11m 19s
444:	total: 9m 3s	remaining: 11m 18s
445:	total: 9m 5s	remaining: 11m 17s
446:	total: 9m 6s	remaining: 11m 15s
447:	total: 9m 7s	remaining: 11m 14s
448:	total: 9m 8s	remaining: 11m 13s
449:	total: 9m 9s	remaining: 11m 12s
450:	total: 9m 11s	remaining: 11m 10s
451:	total: 9m 12s	remaining: 11m 10s
452:	total: 9m 13s	remaining: 11m 8s
453:	total: 9m 14s	remaining: 11m 7s
454:	total: 9m 15s	remaining: 11m 5s
455:	total: 9m 16s	remaining: 11m 3s
456:	total: 9m 17s	remaining: 11m 2s
457:	total: 9m 21s	remaining: 11m 3s
458:	total: 9m 22s	remaining: 11m 2s
459:	total: 9m 23s	remaining: 11m
460:	total: 9m 24s	remaining: 10m 59s
461:	total: 9m 25s	remaining: 10m 58s
462:	total: 9m 26s	remaining: 10m 57s
463:	total: 9m 28s	remaining: 10m 56s
464:	total: 9m 30s	remaining: 10m 56s
465:	total: 9m 31s	remaining: 10m 55s
466:	total: 9m 33s	remaining: 10m 54s
467:	total: 9m 34s	remaining: 10m 5

658:	total: 15m 13s	remaining: 7m 52s
659:	total: 15m 15s	remaining: 7m 51s
660:	total: 15m 16s	remaining: 7m 49s
661:	total: 15m 17s	remaining: 7m 48s
662:	total: 15m 18s	remaining: 7m 47s
663:	total: 15m 20s	remaining: 7m 45s
664:	total: 15m 21s	remaining: 7m 44s
665:	total: 15m 22s	remaining: 7m 42s
666:	total: 15m 24s	remaining: 7m 41s
667:	total: 15m 25s	remaining: 7m 39s
668:	total: 15m 26s	remaining: 7m 38s
669:	total: 15m 28s	remaining: 7m 37s
670:	total: 15m 29s	remaining: 7m 35s
671:	total: 15m 30s	remaining: 7m 34s
672:	total: 15m 31s	remaining: 7m 32s
673:	total: 15m 32s	remaining: 7m 31s
674:	total: 15m 34s	remaining: 7m 29s
675:	total: 15m 35s	remaining: 7m 28s
676:	total: 15m 36s	remaining: 7m 26s
677:	total: 15m 38s	remaining: 7m 25s
678:	total: 15m 39s	remaining: 7m 24s
679:	total: 15m 40s	remaining: 7m 22s
680:	total: 15m 41s	remaining: 7m 21s
681:	total: 15m 43s	remaining: 7m 19s
682:	total: 15m 44s	remaining: 7m 18s
683:	total: 15m 45s	remaining: 7m 16s
684:	total: 

876:	total: 20m 50s	remaining: 2m 55s
877:	total: 20m 52s	remaining: 2m 54s
878:	total: 20m 54s	remaining: 2m 52s
879:	total: 20m 55s	remaining: 2m 51s
880:	total: 20m 57s	remaining: 2m 49s
881:	total: 20m 59s	remaining: 2m 48s
882:	total: 21m	remaining: 2m 47s
883:	total: 21m 2s	remaining: 2m 45s
884:	total: 21m 3s	remaining: 2m 44s
885:	total: 21m 5s	remaining: 2m 42s
886:	total: 21m 7s	remaining: 2m 41s
887:	total: 21m 9s	remaining: 2m 40s
888:	total: 21m 10s	remaining: 2m 38s
889:	total: 21m 11s	remaining: 2m 37s
890:	total: 21m 13s	remaining: 2m 35s
891:	total: 21m 15s	remaining: 2m 34s
892:	total: 21m 17s	remaining: 2m 33s
893:	total: 21m 18s	remaining: 2m 31s
894:	total: 21m 19s	remaining: 2m 30s
895:	total: 21m 23s	remaining: 2m 28s
896:	total: 21m 25s	remaining: 2m 27s
897:	total: 21m 26s	remaining: 2m 26s
898:	total: 21m 28s	remaining: 2m 24s
899:	total: 21m 29s	remaining: 2m 23s
900:	total: 21m 30s	remaining: 2m 21s
901:	total: 21m 32s	remaining: 2m 20s
902:	total: 21m 33s	r

<catboost.core.CatBoostClassifier at 0x1a31e8f6d8>

In [82]:
df1 = pd.read_csv(test)

In [83]:
dfcb = pd.DataFrame(df1)

In [84]:
for c in categorical:
    dfcb[c] = dfcb[c].astype(str)

In [85]:
x_test = dfcb[columns].values

In [87]:
df1['TARGET'] = clf.predict(x_test)

In [91]:
submit = df1[['SK_ID_CURR','TARGET']]
submit["TARGET"]

0       0.00
1       0.00
2       0.00
3       0.00
4       0.00
5       0.00
6       0.00
7       0.00
8       0.00
9       0.00
10      0.00
11      0.00
12      0.00
13      0.00
14      0.00
15      0.00
16      0.00
17      0.00
18      0.00
19      0.00
20      0.00
21      0.00
22      0.00
23      0.00
24      0.00
25      0.00
26      0.00
27      0.00
28      0.00
29      0.00
        ... 
48714   0.00
48715   0.00
48716   0.00
48717   0.00
48718   0.00
48719   0.00
48720   0.00
48721   0.00
48722   0.00
48723   0.00
48724   0.00
48725   0.00
48726   0.00
48727   0.00
48728   0.00
48729   0.00
48730   0.00
48731   0.00
48732   0.00
48733   0.00
48734   0.00
48735   0.00
48736   0.00
48737   0.00
48738   0.00
48739   0.00
48740   0.00
48741   0.00
48742   0.00
48743   0.00
Name: TARGET, Length: 48744, dtype: float64

In [89]:
submit.to_csv("submit123.csv", index=False)