In [21]:
import os
import pickle
import time

import pandas as pd
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report

In [2]:
def get_models():
    """Build the list of base classifiers used by the stacked ensemble.

    Returns:
        list: newly constructed, unfitted estimators in this order:
        DecisionTreeClassifier, KNeighborsClassifier, AdaBoostClassifier,
        BaggingClassifier (10 estimators), ExtraTreesClassifier (10 estimators).
    """
    return [
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(n_estimators=10),
        ExtraTreesClassifier(n_estimators=10),
    ]

In [3]:
def get_out_of_fold_predictions(X, y, models, n_splits=2, random_state=None):
    """Collect out-of-fold class probabilities from k-fold cross-validation.

    For every fold, each model is refitted on the training part and its
    ``predict_proba`` output on the held-out part is recorded. The stacked
    result is the feature matrix on which the meta-model is trained.

    Args:
        X: feature table; a pandas DataFrame/Series or any object that can be
            indexed with an integer position array (e.g. a numpy array).
        y: targets aligned row-for-row with ``X`` (same accepted kinds).
        models: estimators exposing ``fit`` and ``predict_proba``. They are
            fitted in place, so on return they hold the fit from the last fold.
        n_splits: number of folds. Defaults to 2, the value that used to be
            hard-coded.
        random_state: seed forwarded to ``KFold`` so the shuffled split can be
            reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns:
        tuple: ``(meta_X, meta_y)``. ``meta_X`` has one row per sample, with
        the probability columns of all models placed side by side;
        ``meta_y`` holds the matching targets. Rows follow fold order, not
        the original row order of ``X``.
    """

    def _take(data, positions):
        # pandas objects need .iloc for positional selection; arrays index directly.
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    meta_X, meta_y = [], []
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_ix, test_ix in kfold.split(X):
        train_X, test_X = _take(X, train_ix), _take(X, test_ix)
        train_y, test_y = _take(y, train_ix), _take(y, test_ix)
        meta_y.extend(test_y)
        # One probability block per model, all predicted on the same held-out rows.
        fold_yhats = []
        for model in models:
            model.fit(train_X, train_y)
            fold_yhats.append(model.predict_proba(test_X))
        # Place the models' probability columns next to each other for this fold.
        meta_X.append(hstack(fold_yhats))
    return vstack(meta_X), asarray(meta_y)

In [4]:
def fit_base_models(X, y, models):
    """Train every base model on the complete training set.

    Each estimator in ``models`` is fitted in place through ``fit(X, y)``.
    Nothing is returned.
    """
    for estimator in models:
        estimator.fit(X, y)

In [5]:
def fit_meta_model(X, y):
    """Train the combiner (meta) model on the out-of-fold predictions.

    Args:
        X: meta-features, i.e. the base models' stacked probability columns.
        y: targets aligned with the rows of ``X``.

    Returns:
        The fitted ``LogisticRegression`` (liblinear solver).
    """
    # scikit-learn estimators return themselves from fit(), so this yields the fitted model.
    return LogisticRegression(solver='liblinear').fit(X, y)

In [6]:
def evaluate_models(X, ytest, models):
    """Print classification metrics for each fitted model on a held-out set.

    For every model this prints the class name, the scikit-learn
    classification report, weighted precision/recall/F1, accuracy, ROC AUC
    and the confusion-matrix counts.

    Args:
        X: feature matrix to predict on.
        ytest: true labels aligned with the rows of ``X``.
        models: fitted estimators exposing ``predict`` and, ideally,
            ``predict_proba``.

    Returns:
        None. Results are written to stdout only.

    Note:
        The confusion-matrix unpacking expects exactly four cells, i.e. a
        binary problem in which both classes occur.
    """
    for model in models:
        y_pred = model.predict(X)
        # ROC AUC ranks samples by score; hard labels reduce the curve to a
        # single operating point. Use the probability of the greater-label
        # class when a two-column predict_proba is available, otherwise fall
        # back to the hard labels (the previous behaviour). This costs one
        # extra prediction pass per model.
        y_score = y_pred
        if hasattr(model, "predict_proba"):
            proba = asarray(model.predict_proba(X))
            if proba.ndim == 2 and proba.shape[1] == 2:
                y_score = proba[:, 1]
        print(model.__class__.__name__)
        print(classification_report(ytest,y_pred))
        print('precision_score : '+str(precision_score(ytest, y_pred, average='weighted')))
        print('accuracy_score : '+str(accuracy_score(ytest, y_pred)))
        print('recall_score : '+str(recall_score(ytest, y_pred, average='weighted')))
        print('f1_score : '+str(f1_score(ytest, y_pred, average='weighted')))
        print('roc_auc_score : '+str(roc_auc_score(ytest,y_score)))
        # ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp.
        tn, fp, fn, tp = confusion_matrix(ytest, y_pred).ravel()
        print('True_positive : '+str(tp)+', False_positive : '+str(fp)+', True_negative : '+str(tn)+', False_negative : '+str(fn))

In [7]:
def save_models(models, combiner_model, output_dir="./models/"):
    """Pickle the base models, their names and the combiner model to disk.

    Files written inside ``output_dir``:
      * ``<ClassName>.pickle`` for each base model,
      * ``models_names.pickle`` with the list of class names in input order,
      * ``combiner_model.pickle`` with the meta-model.

    Args:
        models: fitted base models. The file name is derived from the class
            name only, so two models of the same class overwrite each other.
        combiner_model: fitted meta-model.
        output_dir: target directory. Defaults to ``"./models/"`` (the
            previously hard-coded location) and is created when missing.

    Returns:
        None.
    """
    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    models_names = []
    for model in models:
        name = model.__class__.__name__
        with open(os.path.join(output_dir, name + ".pickle"), "wb") as f:
            pickle.dump(model, f)
        models_names.append(name)
    with open(os.path.join(output_dir, "models_names.pickle"), "wb") as f:
        pickle.dump(models_names, f)
    with open(os.path.join(output_dir, "combiner_model.pickle"), "wb") as f:
        pickle.dump(combiner_model, f)

In [8]:
def super_learner_predictions(X, models, meta_model):
    """Predict with the stacked ensemble.

    Every base model produces class probabilities for ``X``. The probability
    blocks are joined column-wise, in the order of ``models``, and passed to
    the meta-model.

    Args:
        X: feature matrix to predict on.
        models: fitted base models exposing ``predict_proba``.
        meta_model: fitted combiner exposing ``predict``.

    Returns:
        Whatever ``meta_model.predict`` returns for the joined probabilities.
    """
    base_probabilities = [base.predict_proba(X) for base in models]
    return meta_model.predict(hstack(base_probabilities))

In [9]:
def transformer_df(df, output_dir="./models/"):
    """Label-encode every object-dtype column and persist the fitted encoders.

    Each column whose dtype is ``object`` is replaced by its integer codes
    from a dedicated ``LabelEncoder``. The encoders are pickled as a
    ``{column name: encoder}`` dict to ``LabelEncoders_dic.pickle`` inside
    ``output_dir``.

    Args:
        df: DataFrame to encode. It is modified in place.
        output_dir: directory for the pickle file. Defaults to ``"./models/"``
            (the previously hard-coded location) and is created when missing.

    Returns:
        The same DataFrame object, with object columns encoded.
    """
    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    encoders = {}
    for column in df.columns:
        if df[column].dtype == "object":
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            encoders[column] = encoder
    with open(os.path.join(output_dir, "LabelEncoders_dic.pickle"), "wb") as f:
        pickle.dump(encoders, f)
    return df

In [10]:
from sklearn.utils import shuffle
def shuffle_dataframe(df):
    """Return ``df`` with its rows in random order.

    Delegates to ``sklearn.utils.shuffle``. No seed is passed, so the order
    is not pinned by this function.
    """
    return shuffle(df)

In [11]:
def normaliser_all_columns(df, output_dir="./models/"):
    """Min-max scale every column to [0, 1] and persist the fitted scalers.

    One ``MinMaxScaler`` is fitted per column. The scalers are pickled as a
    ``{column name: scaler}`` dict to ``MinMaxScalers_dic.pickle`` inside
    ``output_dir``.

    Args:
        df: DataFrame to scale. It is modified in place. Every column is
            scaled, so a target column present in ``df`` is rescaled too.
        output_dir: directory for the pickle file. Defaults to ``"./models/"``
            (the previously hard-coded location) and is created when missing.

    Returns:
        The same DataFrame object, with all columns scaled.
    """
    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    scalers = {}
    for column in df.columns:
        # MinMaxScaler expects a 2-D (n_samples, 1) array for a single feature.
        values = df[column].values.reshape(-1, 1)
        scaler = MinMaxScaler(feature_range=(0, 1)).fit(values)
        scalers[column] = scaler
        # Flatten back to 1-D so the column assignment gets a plain vector.
        df[column] = scaler.transform(values).ravel()
    with open(os.path.join(output_dir, "MinMaxScalers_dic.pickle"), "wb") as f:
        pickle.dump(scalers, f)
    return df

In [22]:
# Load the raw dataset from a path relative to this notebook, then display its row count.
df =pd.read_csv("../../../data/full_data_small_datased.csv")
len(df)

278060

In [23]:
# Preprocess: label-encode object columns, min-max scale every column, then shuffle the rows.
# NOTE(review): encoders and scalers are fitted on the full table before the
# train/validation split in the next cell, so validation rows influence the
# fitted preprocessing. normaliser_all_columns iterates over all columns, so
# the "Class" target is rescaled as well. Consider fitting on the training
# part only and excluding the target.
df=transformer_df(df)
df=normaliser_all_columns(df)
df=shuffle_dataframe(df)

In [24]:
# Split features and the "Class" target into training (80%) and validation (20%) parts.
# NOTE(review): no random_state or stratify argument is passed, so the split
# is not pinned between runs and class proportions are not explicitly preserved.
X, X_val, y, y_val=train_test_split(df.drop(labels=["Class"],axis=1),df["Class"],test_size=0.2)

In [25]:
# Report the shapes of the training and validation splits.
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)
# Build the unfitted base models.
models = get_models()
# Start the training timer. The elapsed time is printed two cells below,
# after the meta-model is fitted, so it also includes any idle time between
# running those cells.
start_time = time.time() 
# Out-of-fold probabilities of the base models become the meta-model's training set.
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)

Train (222448, 115) (222448,) Test (55612, 115) (55612,)
Meta  (222448, 10) (222448,)


In [26]:
# Refit every base model on the full training split. The fits left over from
# the out-of-fold step only saw part of the training rows.
fit_base_models(X, y, models)

In [27]:
# Fit the meta-model on the out-of-fold probabilities.
meta_model = fit_meta_model(meta_X, meta_y)
# Elapsed time since start_time was set in the out-of-fold cell: covers the
# out-of-fold predictions, the base-model refit and the meta-model fit.
print("--- %s seconds ---" % (time.time() - start_time))

--- 328.96815371513367 seconds ---


In [28]:
# Evaluate each base model on the validation split.
# NOTE(review): the recorded output shows every metric at exactly 1.0 for the
# models listed. This may indicate a feature that leaks the label; worth
# verifying before trusting these scores.
evaluate_models(X_val, y_val, models)

DecisionTreeClassifier
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8073
         1.0       1.00      1.00      1.00     47539

    accuracy                           1.00     55612
   macro avg       1.00      1.00      1.00     55612
weighted avg       1.00      1.00      1.00     55612

precision_score : 1.0
accuracy_score : 1.0
recall_score : 1.0
f1_score : 1.0
roc_auc_score : 1.0
True_positive : 47539, False_positive : 0, True_negative : 8073, False_negative : 0
KNeighborsClassifier
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8073
         1.0       1.00      1.00      1.00     47539

    accuracy                           1.00     55612
   macro avg       1.00      1.00      1.00     55612
weighted avg       1.00      1.00      1.00     55612

precision_score : 1.0
accuracy_score : 1.0
recall_score : 1.0
f1_score : 1.0
roc_auc_score : 1.0
True_positive : 47539, F

In [31]:
# Evaluate the stacked (meta) model on the validation split and time the prediction step.
start_time = time.time() 
y_pred = super_learner_predictions(X_val, models, meta_model)
print("--- %s seconds ---" % (time.time() - start_time))
print(classification_report(y_val,y_pred))
# Precision, recall and F1 are averaged over classes, weighted by class support.
print('precision_score : '+str(precision_score(y_val, y_pred, average='weighted')))
print('accuracy_score : '+str(accuracy_score(y_val, y_pred)))
print('recall_score : '+str(recall_score(y_val, y_pred, average='weighted')))
print('f1_score : '+str(f1_score(y_val, y_pred, average='weighted')))
# NOTE(review): y_pred holds hard class labels from meta_model.predict, so
# this ROC AUC is computed from labels rather than probability scores.
print('roc_auc_score : '+str(roc_auc_score(y_val,y_pred)))


--- 24.252958297729492 seconds ---
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      8073
         1.0       1.00      1.00      1.00     47539

    accuracy                           1.00     55612
   macro avg       1.00      1.00      1.00     55612
weighted avg       1.00      1.00      1.00     55612

precision_score : 1.0
accuracy_score : 1.0
recall_score : 1.0
f1_score : 1.0
roc_auc_score : 1.0


In [33]:
# Confusion-matrix counts for the stacked model. ravel() flattens the matrix
# row by row, which the unpacking reads as tn, fp, fn, tp. This requires
# exactly four cells (a 2x2 matrix).
tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()
print('True_positive : '+str(tp)+', False_positive : '+str(fp)+', True_negative : '+str(tn)+', False_negative : '+str(fn))

True_positive : 47539, False_positive : 0, True_negative : 8073, False_negative : 0


In [30]:
# Persist the fitted base models, their class names and the meta-model as pickles under ./models/.
save_models(models, meta_model)