In [25]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from imblearn.combine import SMOTEENN
import sys
from efficient.ml_model import *
from efficient.utils import *
from efficient.eval_utils import *

data_df = pd.read_excel('data/train.xlsx')
test_df = pd.read_excel('data/test.xlsx')
submission_df = pd.read_csv('data/submission.csv')

In [26]:
data = data_df.values[:,:-1]
label=data_df.values[:,-1]
qt = QuantileTransformer(n_quantiles=100)
data = qt.fit_transform(data)

In [29]:
# split 9:1 
train_data, train_label, test_data, test_label = KFold_Sampler(data,label,n_splits=100).get_multi_fold_data(n_fold=10)
print(len(train_data),len(test_data),len(test_data)/(len(train_data)+len(test_data)))

# define model list
model_lgb = lgb.LGBMClassifier(is_unbalance=True)

model_dict = {'RF_3':RandomForestClassifier(n_estimators=310,max_depth=3),
              'RF_depth_None':RandomForestClassifier(n_estimators=310),
              'XGB_31_3':XGBClassifier(n_estimators=31,max_depth=3),
              'XGB_310_3':XGBClassifier(n_estimators=31,max_depth=3),
              'XGB_31':XGBClassifier(n_estimators=31),
              'XGB_310':XGBClassifier(n_estimators=310),
              'SVM':SVC(probability=True),
              'SVM_lin':SVC(kernel='linear',probability=True),
              'SVM_rbf':SVC(kernel='rbf',probability=True),
              'SVM_0.2':SVC(C=0.2,probability=True),
              'SVM_0.2_lin':SVC(C=0.2,kernel='linear',probability=True),
              'SVM_0.2_poly':SVC(C=0.2,kernel='poly',probability=True),
              'SVM_5':SVC(C=5,probability=True),
              'SVM_5_lin':SVC(C=5,kernel='linear',probability=True),
              'SVM_5_poly':SVC(C=5,kernel='poly',probability=True),
              'KNN_Cls':KNeighborsClassifier(),
              'LGB_Cls':model_lgb,
              'Ridge_Cls':RidgeClassifier(),
              'MLP_Cls':MLPClassifier(activation = "relu", alpha = 0.1, hidden_layer_sizes = (5,5),
                            learning_rate = "constant", max_iter = 3000, random_state = 1000),
              'QDA':QuadraticDiscriminantAnalysis(),
              }

model = Stack_Ensemble_Proba_Model(model_dict=model_dict, stack_model=SVC(C=0.1,kernel='rbf', probability=True), stack_training_split=0.15)
# model = Mean_Ensemble_Model(copy.deepcopy(model_lgb))
cv_model, cv_df = model.cross_validation_evaluate(train_data, train_label, accuracy_score,n_splits=5)

3510 393 0.10069177555726365


                                                                            



            model  eval_metric
0            RF_3     0.645299
1  Ensemble Model     0.693732


                                                                            



            model  eval_metric
0  Ensemble Model     0.668091
1            RF_3     0.679487


                                                                            



            model  eval_metric
0            RF_3     0.668091
1  Ensemble Model     0.695157


                                                                            



            model  eval_metric
0            RF_3     0.648148
1  Ensemble Model     0.672365


                                                                            



            model  eval_metric
0            RF_3     0.690883
1  Ensemble Model     0.720798

            model  eval_metric
0            RF_3     0.666382
1  Ensemble Model     0.690028




In [30]:
cv_ensemble_model = Mean_Ensemble_Proba_Model(cv_model)
model.fit(train_data,train_label)
cv_ensemble_model.fit(train_data,train_label)
print('========== Ensemble Model ==========')
_=model.evaluate(test_data,test_label,evaluation_fn=accuracy_score,verbose=True)
print('========== CV Ensemble Model ==========')
_=cv_ensemble_model.evaluate(test_data,test_label,evaluation_fn=accuracy_score,verbose=True)

                                                                            

            model  eval_metric
0            RF_3     0.676845
1  Ensemble Model     0.689567
            model  eval_metric
0         model_2     0.669211
1         model_4     0.676845
2         model_5     0.676845
3  Ensemble Model     0.676845
4         model_3     0.679389
5         model_1     0.684478




In [23]:
test_data = test_df.values[:,1:]
test_data = qt.transform(test_data)
test_pred = model.predict(test_data)
test_pred

array([0., 0., 0., ..., 0., 0., 0.])