In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from imblearn.combine import SMOTEENN
import sys
from efficient.ml_model import *
from efficient.utils import *
from efficient.eval_utils import *

data_df = pd.read_excel('data/train.xlsx')
test_df = pd.read_excel('data/test.xlsx')
submission_df = pd.read_csv('data/submission.csv')

In [2]:
data = data_df.values[:,:-1]
label=data_df.values[:,-1]
qt = QuantileTransformer(n_quantiles=100)
data = qt.fit_transform(data)

In [3]:
# split 9:1 
train_data, train_label, test_data, test_label = KFold_Sampler(data,label,n_splits=100).get_multi_fold_data(n_fold=10)
print(len(train_data),len(test_data),len(test_data)/(len(train_data)+len(test_data)))

# define model list
model_lgb = lgb.LGBMClassifier(is_unbalance=True)
model_dict = {'RF_3':RandomForestClassifier(n_estimators=310,max_depth=3),
              'RF_depth_None':RandomForestClassifier(n_estimators=310),
              'XGB_31_3':XGBClassifier(n_estimators=31,max_depth=3),
              'XGB_310_3':XGBClassifier(n_estimators=31,max_depth=3),
              'XGB_31':XGBClassifier(n_estimators=31),
              'XGB_310':XGBClassifier(n_estimators=310),
              'SVM':SVC(probability=True),
              'SVM_lin':SVC(kernel='linear',probability=True),
              'SVM_rbf':SVC(kernel='rbf',probability=True),
              'SVM_0.2':SVC(C=0.2,probability=True),
              'SVM_0.2_lin':SVC(C=0.2,kernel='linear',probability=True),
              'SVM_0.2_poly':SVC(C=0.2,kernel='poly',probability=True),
              'SVM_5':SVC(C=5,probability=True),
              'SVM_5_lin':SVC(C=5,kernel='linear',probability=True),
              'SVM_5_poly':SVC(C=5,kernel='poly',probability=True),
              'KNN_Cls':KNeighborsClassifier(),
              'LGB_Cls':model_lgb,
              'Ridge_Cls':RidgeClassifier(),
              'MLP_Cls':MLPClassifier(activation = "relu", alpha = 0.1, hidden_layer_sizes = (5,5),
                            learning_rate = "constant", max_iter = 3000, random_state = 1000),
              'QDA':QuadraticDiscriminantAnalysis(),
              }

model = Stack_Ensemble_Proba_Model(model_dict=model_dict, stack_model=SVC(C=0.1,kernel='rbf'), stack_training_split=0.15)
cv_model, cv_df = model.cross_validation_evaluate(train_data, train_label, accuracy_score,n_splits=5)

3510 393 0.10069177555726365
Ridge_Cls don't have [predict_proba]


   eval_metric          model
0     0.576923            QDA
0     0.660969           RF_3
0     0.662393    SVM_0.2_lin
0     0.668091        SVM_lin
0     0.669516      SVM_5_lin
0     0.675214        MLP_Cls
0     0.686610        XGB_310
0     0.688034        KNN_Cls
0     0.688034        SVM_0.2
0     0.692308   SVM_0.2_poly
0     0.693732     SVM_5_poly
0     0.700855         XGB_31
0     0.709402            SVM
0     0.709402      XGB_310_3
0     0.709402       XGB_31_3
0     0.709402        SVM_rbf
0     0.713675        LGB_Cls
0     0.722222          SVM_5
0     0.730769       Ensemble
0     0.733618  RF_depth_None


   eval_metric          model
0     0.517094            QDA
0     0.660969    SVM_0.2_lin
0     0.663818        SVM_lin
0     0.668091        SVM_0.2
0     0.669516      SVM_5_lin
0     0.679487        KNN_Cls
0     0.680912           RF_3
0     0.692308        MLP_Cls
0     0.693732   SVM_0.2_poly


In [6]:
cv_ensemble_model = Vote_Ensemble_Model(cv_model)
model.fit(train_data,train_label)
cv_ensemble_model.fit(train_data,train_label)
print('========== Ensemble Model ==========')
_=model.evaluate(test_data,test_label,evaluation_fn=accuracy_score,verbose=True)
print('========== CV Ensemble Model ==========')
_=cv_ensemble_model.evaluate(test_data,test_label,evaluation_fn=accuracy_score,verbose=True)

RF_3: 0.6590330788804071
RF_depth_None: 0.7404580152671756
XGB_31_3: 0.7353689567430025
XGB_310_3: 0.7353689567430025
XGB_31: 0.7353689567430025
XGB_310: 0.7480916030534351
SVM: 0.7506361323155216
SVM_lin: 0.6972010178117048
SVM_rbf: 0.7506361323155216
SVM_0.2: 0.712468193384224
SVM_0.2_lin: 0.6844783715012722
SVM_0.2_poly: 0.7302798982188295
SVM_5: 0.7608142493638677
SVM_5_lin: 0.6997455470737913
SVM_5_poly: 0.712468193384224
KNN_Cls: 0.7022900763358778
LGB_Cls: 0.7379134860050891
MLP_Cls: 0.7175572519083969
QDA: 0.5801526717557252
Ensemble: 0.7557251908396947
model_1: 0.7531806615776081
model_2: 0.7582697201017812
model_3: 0.7684478371501272
model_4: 0.7608142493638677
model_5: 0.7786259541984732
Ensemble: 0.7659033078880407


In [7]:
test_data = test_df.values[:,1:]
test_data = qt.transform(test_data)
test_pred = model.predict(test_data)
test_pred

array([0., 0., 0., ..., 0., 0., 0.])