In [1]:
import numpy as np
import pandas as pd
import os

from itertools import product
from sklearn.mixture import GaussianMixture

import sys

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

from mlxtend.classifier import StackingCVClassifier
    

In [2]:
# Read the feature table and the label table from the working directory;
# both are kept as DataFrames for the cells below.
X, Y = (pd.read_csv(csv_path) for csv_path in ('./X.csv', './Y.csv'))

In [3]:
# Sanity check: (rows, columns) of the loaded feature table.
X.shape

(581012, 42)

In [4]:
# Sanity check: (rows, columns) of the loaded label table; the row count
# should match X.
Y.shape

(581012, 1)

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on Y -- consider `stratify=Y` if
# the class distribution is imbalanced (this would change the reported score).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# First Level Models

In [6]:
# First-level (base) learners for the stacking ensemble. Every model is given
# the same fixed seed (42) so repeated runs build the same estimators.

# Random forest. 'sqrt' is exactly what the former 'auto' alias meant for
# classifiers; the alias was deprecated in scikit-learn 1.1 and removed in 1.3,
# so spelling it out keeps the cell runnable on current releases.
rf = RandomForestClassifier(n_estimators=600,
                            criterion='gini',
                            max_depth=133,
                            max_features='sqrt',
                            random_state=42)

# Gradient-boosted trees (XGBoost).
# * 'multiclass:softmax' is not a valid XGBoost objective name; the
#   multi-class objectives are 'multi:softmax' and 'multi:softprob'.
#   'multi:softprob' is used because the stacker is configured with
#   use_probas=True and therefore consumes class probabilities.
# * n_jobs / random_state are the scikit-learn-API names for the legacy
#   nthread / seed arguments.
# * scale_pos_weight was dropped: it was set to its default (1) and only
#   applies to binary classification.
xgb = XGBClassifier(learning_rate=0.1, n_estimators=450, max_depth=25,
                    min_child_weight=3, gamma=0.05, subsample=0.6,
                    colsample_bytree=1.0,
                    objective='multi:softprob', n_jobs=4, random_state=42)

# Extremely randomized trees. warm_start=True was removed: the stacking
# classifier fits clones of its base models by default, so incremental
# refitting never came into play and the flag was only misleading.
xtc = ExtraTreesClassifier(max_depth=350,
                           n_estimators=450,
                           n_jobs=-1,
                           oob_score=False,
                           random_state=42)

# AdaBoost over depth-20 decision trees. The weak learner is passed
# positionally because its keyword was renamed from `base_estimator` to
# `estimator` (scikit-learn 1.2, old name removed in 1.4); the positional form
# works with either name.
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20,
                                                min_samples_split=2,
                                                min_samples_leaf=1,
                                                random_state=42),
                         n_estimators=100,
                         random_state=42)

# LightGBM gradient boosting.
lgb_model = lgb.LGBMClassifier(n_estimators=375,
                               num_leaves=100,
                               verbosity=0,
                               random_state=42,
                               n_jobs=-1)

# CatBoost gradient boosting. loss_function is intentionally left unset
# ('Logloss' is a binary loss and does not fit a multi-class target), so
# CatBoost chooses the loss itself. Accuracy is logged every 1000 iterations.
cat = CatBoostClassifier(n_estimators=6000,
                         eval_metric='Accuracy',
                         metric_period=1000,
                         max_depth=None,
                         random_state=42)


# Second Level Model

In [7]:
# (label, estimator) pairs for the base learners, in the order in which they
# are handed to the stacking classifier.
ensemble = [
    ('rf', rf),
    ('xgb', xgb),
    ('ada', ada),
    ('lgbm', lgb_model),
    ('xtc', xtc),
    ('cat', cat),
]

In [9]:
# Stacking classifier that uses the random forest `rf` as the second-level
# (meta) learner. Base-model outputs are produced with 3-fold cross-validation
# and passed on as class probabilities (use_probas=True); the original features
# are not forwarded to the meta learner (use_features_in_secondary=False).
base_models = [model for _, model in ensemble]
stack = StackingCVClassifier(
    classifiers=base_models,
    meta_classifier=rf,
    cv=3,
    use_probas=True,
    use_features_in_secondary=False,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

In [11]:
# Fit the stacked ensemble on plain NumPy arrays.
# * Y_train is a single-column DataFrame, so `.values` alone is shaped (n, 1)
#   and triggers scikit-learn's column_or_1d DataConversionWarning; ravel()
#   hands over the 1-D label vector the estimators expect.
# * X_train is converted with `.values` so that training and the later
#   `stack.predict(X_test.values)` call both use the same input type.
stack = stack.fit(X_train.values, Y_train.values.ravel())

Fitting 6 classifiers...
Fitting classifier1: randomforestclassifier (1/6)


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 23.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier2: xgbclassifier (2/6)


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 149.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier3: adaboostclassifier (3/6)


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 34.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier4: lgbmclassifier (4/6)


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  4.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier5: extratreesclassifier (5/6)


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  5.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier6: catboostclassifier (6/6)


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 60.5min finished


0:	learn: 0.6812390	total: 563ms	remaining: 56m 17s
1000:	learn: 0.8460938	total: 5m	remaining: 25m 1s
2000:	learn: 0.8836425	total: 9m 57s	remaining: 19m 54s
3000:	learn: 0.9056752	total: 14m 59s	remaining: 14m 58s
4000:	learn: 0.9201091	total: 20m 1s	remaining: 10m
5000:	learn: 0.9301885	total: 25m 5s	remaining: 5m
5999:	learn: 0.9378885	total: 30m 12s	remaining: 0us


In [12]:
# Predict class labels for the held-out rows, passed as a NumPy array.
X_test_array = X_test.values
prediction_test = stack.predict(X_test_array)

In [13]:
# Fraction of held-out rows whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(Y_test, prediction_test)
print("Test accuracy:", test_accuracy)

Test accuracy: 0.9789936576508351
