In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, cross_validate

# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#multiclass imports
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier #Will identify the maority calss base line, model needs to do better then the baseline

from statistics import mean
# to reduce randomness then you put the seed
np.random.seed(42)

from ArtificialImmuneSystem import *
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import KFold

from tabulate import tabulate
import time
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score


In [33]:
df = pd.read_csv('Data\GeneratedSyntheticData-testing.csv')
#df = df.drop('Unnamed: 0', axis=1)

In [34]:
print(f"Data shape: \n{df.shape}\n")
print(f"Data size: \n{df.size}\n")
print(f"Data ndim: \n{df.ndim}\n")
print("_____________________________________________\n")
print(f"Old Class Distribution: {Counter(df['5'])}")

Data shape: 
(300, 6)

Data size: 
1800

Data ndim: 
2

_____________________________________________

Old Class Distribution: Counter({0.0: 247, 1.0: 53})


In [51]:
kf = KFold(n_splits=5)

for result in kf.split(df):
    
    #Print the shape of the train and test set
    data_train = df.iloc[result[0]]
    data_test =  df.iloc[result[1]]
    #print(f"Train Data shape: \n{data_train}\n")
    #print(f"Test Data shape: \n{data_test}\n")


    data_train_AIS = data_train.copy()
    data_train_SMOTE = data_train.copy()

    #Create an oversampling object
    
    oversample = SMOTE()
    


    oversample_AIS = ArtificialImmuneSystem()
    #Oversample and add to the dataframe to fix the class imbalance
    randomForest = RandomForestClassifier()

    st = time.time()
    x_over, y_over = oversample.fit_resample(data_train_SMOTE.drop(["5"], axis=1), data_train_SMOTE.drop(data_train_SMOTE.columns[0:-1],axis=1))
    elapsed_time_SMOTE = time.time() - st

    st = time.time()
    input_x_over_AIS, y_over_AIS = oversample_AIS.AIS_Resample(data_train_AIS.drop(["5"], axis=1), data_train_AIS.drop(data_train_AIS.columns[0:-1],axis=1), 20, 5, randomForest,5,'balanced_accuracy',min_change = 0.005, use_lof = True)
    elapsed_time_AIS = time.time() - st

    smote_df = pd.concat([x_over, y_over], axis=1)
    ais_df = pd.concat([input_x_over_AIS, y_over_AIS], axis=1)

    # print the dimensionality of the oversampled dataset
    #print(f"SMOTE Oversampled Data shape: \n{smote_df.shape}\n")
    #print(f"SMOTE Oversampled Data size: \n{smote_df.size}\n")
    #print(f"SMOTE Oversampled Data ndim: \n{smote_df.ndim}\n")
    #print("_____________________________________________\n")

    # print the dimensionality of the oversampled dataset
    #print(f"AIS Oversampled Data shape: \n{ais_df.shape}\n")
    #print(f"AIS Oversampled Data size: \n{ais_df.size}\n")
    #print(f"AIS Oversampled Data ndim: \n{ais_df.ndim}\n")
    #print("_____________________________________________\n")


    # print the new class distribution using a Counter
    #print(f"New SMOTE Class Distribution: {Counter(smote_df['5'])}")
    #print(f"New AIS Class Distribution: {Counter(ais_df['5'])}")
    ## print the new class distribution using a Counter
    #print(f"Old Class Distribution: {Counter(data_train['5'])}")

    #print("_____________________________________________\n")

    #labelTrainFlat = labels_train.values.ravel()

    #Fit one vs rest Gradient Boosting classification
    gradientBoosting = GradientBoostingClassifier()
    gradientBoosting = gradientBoosting.fit(x_over, y_over.values.ravel())

    gradientBoosting_AIS = GradientBoostingClassifier()
    gradientBoosting_AIS = gradientBoosting.fit(input_x_over_AIS, y_over_AIS.values.ravel())

    gradientBoosting_Base = gradientBoosting.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))

    #Fit RandomForestClassifier classification
    randomForest = RandomForestClassifier()
    randomForest = randomForest.fit(x_over,y_over.values.ravel())

    randomForest = RandomForestClassifier()
    randomForest_AIS  = randomForest.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
    #randomForest_Base  = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Create a KNeighbors classification object
    kNeighbors = KNeighborsClassifier()
    kNeighbors = kNeighbors.fit(x_over,y_over.values.ravel())

    kNeighbors = KNeighborsClassifier()
    kNeighbors_AIS  = kNeighbors.fit(input_x_over_AIS,y_over_AIS.values.ravel())
    kNeighbors_Base = kNeighbors.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
    #kNeighbors_base  = kNeighbors.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Create an LogisticRegression object
    logisticRegression = LogisticRegression(max_iter=5000)
    logisticRegression = logisticRegression.fit(x_over,y_over.values.ravel())

    logisticRegression = LogisticRegression(max_iter=5000)
    logisticRegression_AIS  = logisticRegression.fit(input_x_over_AIS,y_over_AIS.values.ravel())
    logisticRegression_Base = logisticRegression.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
    #logisticRegression_Base  = logisticRegression.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Set the parameters of GradientBoosting for GridSearchCV
    parametersGradientBoosting = [
        {'learning_rate': [0.44,0.45,0.46],'min_samples_leaf': [5,6,7],'min_samples_split': [7,8,9,10], 'n_estimators': [57,58,59,60]}
    ]

    #Set the scoring parameters
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform Gridsearch to find best parameters
    grid_searchGradientBoosting = GridSearchCV(gradientBoosting, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchGradientBoosting_AIS = GridSearchCV(gradientBoosting_AIS, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    #grid_searchGradientBoosting_Base = GridSearchCV(gradientBoosting_Base, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    #Fit the GradientBoosting 
    grid_searchGradientBoosting.fit(x_over, y_over.values.ravel())
    grid_searchGradientBoosting_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())
  
    

    #Print GridSearchCV Results
    
    print(f"Best score GradientBoosting: \n{grid_searchGradientBoosting.best_score_}\n")
    print("_____________________________________________\n")
    
    print(f"Best score GradientBoosting AIS: \n{grid_searchGradientBoosting_AIS.best_score_}\n")

    #Set the parameters of KNeighbors for GridSearchCV
    parametersKNeighbors = [
        {'n_neighbors': [1,2,3],'weights':['uniform', 'distance'],'algorithm':['auto'], 'p': [1,2,3]}
    ]

    #Set the scoring parameters
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform KNeighbors to find best parameters
    grid_searchKNeighbors = GridSearchCV(kNeighbors, parametersKNeighbors, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchKNeighbors_AIS = GridSearchCV(kNeighbors_AIS, parametersKNeighbors, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the KNeighbors 
    grid_searchKNeighbors.fit(x_over, y_over.values.ravel())
    grid_searchKNeighbors_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print GridSearchCV Results
   
    print(f"Best score KNeighbors: \n{grid_searchKNeighbors.best_score_}\n")
    print("_____________________________________________\n")
   
    print(f"Best score KNeighbors AIS: \n{grid_searchKNeighbors_AIS.best_score_}\n")
    
    #Set the parameters of LogisticRegression for GridSearchCV
    parametersLogisticRegression = [
        {'multi_class': ['ovr'],'penalty':['none','l2'], 'C': [1,2,3]}
    ]
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform LogisticRegression to find best parameters
    grid_searchLogisticRegression = GridSearchCV(logisticRegression, parametersLogisticRegression, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchLogisticRegression_AIS = GridSearchCV(logisticRegression_AIS, parametersLogisticRegression, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the LogisticRegression 
    grid_searchLogisticRegression.fit(x_over, y_over.values.ravel())
    grid_searchLogisticRegression_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print LogisticRegression Results
    
    print(f"Best score Logistic Regression: \n{grid_searchLogisticRegression.best_score_}\n")
    print("_____________________________________________\n")
   
    print(f"Best score Logistic Regression AIS: \n{grid_searchLogisticRegression_AIS.best_score_}\n")

    #Set the parameters of RandomForest for GridSearchCV
    parametersRandomForest = [
        {'n_estimators': [145,150,155,190],'max_depth': [10,12], 'bootstrap': [True, False],
        'min_samples_split': [0.05,2], 'max_features': ['auto']}
    ]

    #Preform Gridsearch to find best parameters
    grid_searchRandomForest = GridSearchCV(randomForest, parametersRandomForest, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchRandomForest_AIS = GridSearchCV(randomForest_AIS, parametersRandomForest, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the RandomForest 
    grid_searchRandomForest.fit(x_over, y_over.values.ravel())
    grid_searchRandomForest_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print GridSearchCV Results

    print(f"Best score RandomForest: \n{grid_searchRandomForest.best_score_}\n")

    print(f"Best score RandomForest AIS: \n{grid_searchRandomForest_AIS.best_score_}\n")

    #Get the results for all classifiers 
    cross_val_resultsGB = grid_searchGradientBoosting.cv_results_
    cross_val_resultsRF = grid_searchRandomForest.cv_results_
    cross_val_resultsLR = grid_searchLogisticRegression.cv_results_
    cross_val_resultsKN = grid_searchKNeighbors.cv_results_

    cross_val_resultsGB_AIS = grid_searchGradientBoosting_AIS.cv_results_
    cross_val_resultsRF_AIS = grid_searchRandomForest_AIS.cv_results_
    cross_val_resultsLR_AIS = grid_searchLogisticRegression_AIS.cv_results_
    cross_val_resultsKN_AIS = grid_searchKNeighbors_AIS.cv_results_


    #Print the results of all classiifiers
    #GBC
    mean_test_roc_aucGB = mean(cross_val_resultsGB['mean_test_roc_auc'])
    mean_test_bal_accuracyGB = mean(cross_val_resultsGB['mean_test_bal_accuracy'])
    
    mean_test_roc_aucGB_AIS = mean(cross_val_resultsGB_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyGB_AIS = mean(cross_val_resultsGB_AIS['mean_test_bal_accuracy'])
   
    #RFC
    mean_test_roc_aucRF = mean(cross_val_resultsRF['mean_test_roc_auc'])
    mean_test_bal_accuracyRF = mean(cross_val_resultsRF['mean_test_bal_accuracy'])
    
    mean_test_roc_aucRF_AIS = mean(cross_val_resultsRF_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyRF_AIS = mean(cross_val_resultsRF_AIS['mean_test_bal_accuracy'])
    #LRC
    mean_test_roc_aucLR = mean(cross_val_resultsLR['mean_test_roc_auc'])
    mean_test_bal_accuracyLR = mean(cross_val_resultsLR['mean_test_bal_accuracy'])
    
    mean_test_roc_aucLR_AIS = mean(cross_val_resultsLR_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyLR_AIS = mean(cross_val_resultsLR_AIS['mean_test_bal_accuracy'])

    #KNC
    mean_test_roc_aucKN = mean(cross_val_resultsKN['mean_test_roc_auc'])
    mean_test_bal_accuracyKN = mean(cross_val_resultsKN['mean_test_bal_accuracy'])
    
    mean_test_roc_aucKN_AIS = mean(cross_val_resultsKN_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyKN_AIS = mean(cross_val_resultsKN_AIS['mean_test_bal_accuracy'])

    predictions_test_over_GB = grid_searchGradientBoosting.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_RF = grid_searchRandomForest.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_LR = grid_searchLogisticRegression.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_KN = grid_searchKNeighbors.best_estimator_.predict(data_test.drop(["5"],axis=1))

    predictions_test_over_GB_AIS = grid_searchGradientBoosting_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_RF_AIS = grid_searchRandomForest_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_LR_AIS = grid_searchLogisticRegression_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_KN_AIS = grid_searchKNeighbors_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))

    predictions_GB = gradientBoosting_Base.predict(data_test.drop(["5"],axis=1))
    predictions_RF = randomForest_Base.predict(data_test.drop(["5"],axis=1))
    predictions_LR = logisticRegression_Base.predict(data_test.drop(["5"],axis=1))
    predictions_KN = kNeighbors_Base.predict(data_test.drop(["5"],axis=1))

    geometric_mean_score_GB = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_RF = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_LR = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_KN = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN, labels=None, pos_label=1, average='binary',)

    geometric_mean_score_GB_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_RF_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_LR_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_KN_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN_AIS, labels=None, pos_label=1, average='binary',)

    geometric_mean_score_GB_Base = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_GB, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_RF_Base = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_RF, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_LR_Base = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_LR, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_KN_Base = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_KN, labels=None, pos_label=1, average='binary',)

    
    roc_auc_GB_AIS = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB_AIS)
    roc_auc_RF_AIS = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF_AIS)
    roc_auc_LR_AIS = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR_AIS)
    roc_auc_KN_AIS = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN_AIS)
 
    balanced_acc_GB_AIS = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB_AIS)
    balanced_acc_RF_AIS = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF_AIS)
    balanced_acc_LR_AIS = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR_AIS)
    balanced_acc_KN_AIS = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN_AIS)

    roc_auc_GB = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB)
    roc_auc_RF = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF)
    roc_auc_LR = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR)
    roc_auc_KN = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN)
 
    balanced_acc_GB = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB)
    balanced_acc_RF = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF)
    balanced_acc_LR = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR)
    balanced_acc_KN = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN)

    roc_auc_GB_Base = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_GB)
    roc_auc_RF_Base = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_RF)
    roc_auc_LR_Base = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_LR)
    roc_auc_KN_Base = roc_auc_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_KN)
 
    balanced_acc_GB_Base = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_GB)
    balanced_acc_RF_Base = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_RF)
    balanced_acc_LR_Base = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_LR)
    balanced_acc_KN_Base = balanced_accuracy_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_KN)

    col_names = ["Oversample","Oversample Run Time", "Oversample Paramaters", "ROC Accuracy for Gradient Boosting", "Balanced Test Accuracy for Gradient Boosting", "ROC Accuracy for Random Forests", "Balanced Test Accuracy for Random Forests" , "ROC Accuracy for Logistic Regression", "Balanced Test Accuracy for Logistic Regression", "ROC Accuracy for K Nearest Neighbours", "Balanced Test Accuracy for K Nearest Neighbours", "Geometric Mean Score for Gradient Boosting", "Geometric Mean Score for Random Forest", "Geometric Mean Score for Logestic Regression", "Geometric Mean Score for K Neighbors"]
    dataAIS = ["AIS", elapsed_time_AIS,"MinChange = 0.005, LOF=True", roc_auc_GB_AIS, balanced_acc_GB_AIS, roc_auc_RF_AIS, balanced_acc_RF_AIS, roc_auc_LR_AIS, balanced_acc_LR_AIS, roc_auc_KN_AIS, balanced_acc_KN_AIS, geometric_mean_score_GB_AIS, geometric_mean_score_RF_AIS, geometric_mean_score_LR_AIS, geometric_mean_score_KN_AIS]
    dataSMOTE = ["SMOTE", elapsed_time_SMOTE,"NA", roc_auc_GB, balanced_acc_GB, roc_auc_RF, balanced_acc_RF, roc_auc_LR, balanced_acc_LR, roc_auc_KN, balanced_acc_KN, geometric_mean_score_GB, geometric_mean_score_RF, geometric_mean_score_LR, geometric_mean_score_KN]
    dataBase = ["BASE", "NA","NA", roc_auc_GB_Base, balanced_acc_GB_Base, roc_auc_RF_Base, balanced_acc_RF_Base, roc_auc_LR_Base, balanced_acc_LR_Base, roc_auc_KN_Base, balanced_acc_KN_Base, geometric_mean_score_GB_Base, geometric_mean_score_RF_Base, geometric_mean_score_LR_Base, geometric_mean_score_KN_Base]
    data = [dataAIS,dataSMOTE,dataBase]

    dfoutput=pd.DataFrame(data,columns=col_names)
    print(dfoutput)
    title = "ExperimentalResults/ExperimentalComparisons-Dec05DEMO1.csv"
    dfoutput.to_csv(title, mode='a') 

col_names = ["","", "", "", "", "", "" , "", "", "", "", "", "", "", ""]
dfoutput=pd.DataFrame(col_names)
dfoutput.to_csv(title, mode='a',header=False,index=False) 
    

origin_feat_train before:  (192, 5)
origin_labels_train before:  Counter({0.0: 154, 1.0: 38})
origin_feat_train after:  (240, 5)
population_features:  (156, 5)
origin_labels_train after:  Counter({0.0: 195, 1.0: 45})
origin_feat_train before:  (192, 5)
origin_labels_train before:  Counter({0.0: 160, 1.0: 32})
origin_feat_train after:  (240, 5)
population_features:  (156, 5)
origin_labels_train after:  Counter({0.0: 202, 1.0: 38})
origin_feat_train before:  (192, 5)
origin_labels_train before:  Counter({0.0: 159, 1.0: 33})
origin_feat_train after:  (240, 5)
population_features:  (156, 5)
origin_labels_train after:  Counter({0.0: 198, 1.0: 42})
origin_feat_train before:  (192, 5)
origin_labels_train before:  Counter({0.0: 161, 1.0: 31})
origin_feat_train after:  (240, 5)
population_features:  (156, 5)
origin_labels_train after:  Counter({0.0: 200, 1.0: 40})
origin_feat_train before:  (192, 5)
origin_labels_train before:  Counter({0.0: 155, 1.0: 37})
origin_feat_train after:  (240, 5)
pop

  y = column_or_1d(y, warn=True)
  randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


Best score GradientBoosting: 
0.7626020408163265

_____________________________________________

Best score GradientBoosting AIS: 
0.8881122448979593

Best score KNeighbors: 
0.782704081632653

_____________________________________________

Best score KNeighbors AIS: 
0.8321428571428571

Best score Logistic Regression: 
0.5578571428571428

_____________________________________________

Best score Logistic Regression AIS: 
0.7661734693877551



  warn(
  warn(


Best score RandomForest: 
0.7171428571428571

Best score RandomForest AIS: 
0.9108163265306122

  Oversample Oversample Run Time        Oversample Paramaters  \
0        AIS           49.966202  MinChange = 0.005, LOF=True   
1      SMOTE            0.005995                           NA   
2       BASE                  NA                           NA   

   ROC Accuracy for Gradient Boosting  \
0                            0.787570   
1                            0.670686   
2                            0.671614   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.787570   
1                                      0.670686   
2                                      0.671614   

   ROC Accuracy for Random Forests  Balanced Test Accuracy for Random Forests  \
0                         0.727273                                   0.727273   
1                         0.726345                                   0.726345   
2                         0.68

  y = column_or_1d(y, warn=True)
  randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


Best score GradientBoosting: 
0.8066007653061225

_____________________________________________

Best score GradientBoosting AIS: 
0.8780293367346939

Best score KNeighbors: 
0.7990539965986394

_____________________________________________

Best score KNeighbors AIS: 
0.8106398809523809

Best score Logistic Regression: 
0.4901147959183674

_____________________________________________

Best score Logistic Regression AIS: 
0.7926232993197279



  warn(
  warn(


Best score RandomForest: 
0.7627551020408163

Best score RandomForest AIS: 
0.8882865646258504

  Oversample Oversample Run Time        Oversample Paramaters  \
0        AIS           45.913421  MinChange = 0.005, LOF=True   
1      SMOTE            0.004039                           NA   
2       BASE                  NA                           NA   

   ROC Accuracy for Gradient Boosting  \
0                            0.648248   
1                            0.582210   
2                            0.676550   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.648248   
1                                      0.582210   
2                                      0.676550   

   ROC Accuracy for Random Forests  Balanced Test Accuracy for Random Forests  \
0                         0.614555                                   0.614555   
1                         0.601078                                   0.601078   
2                         0.66

  y = column_or_1d(y, warn=True)
  randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


Best score GradientBoosting: 
0.7849999999999999

_____________________________________________

Best score GradientBoosting AIS: 
0.905

Best score KNeighbors: 
0.8025

_____________________________________________

Best score KNeighbors AIS: 
0.855

Best score Logistic Regression: 
0.6275

_____________________________________________

Best score Logistic Regression AIS: 
0.7300000000000001



  warn(
  warn(


Best score RandomForest: 
0.76

Best score RandomForest AIS: 
0.92

  Oversample Oversample Run Time        Oversample Paramaters  \
0        AIS           64.268367  MinChange = 0.005, LOF=True   
1      SMOTE             0.00509                           NA   
2       BASE                  NA                           NA   

   ROC Accuracy for Gradient Boosting  \
0                            0.545008   
1                            0.624386   
2                            0.545008   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.545008   
1                                      0.624386   
2                                      0.545008   

   ROC Accuracy for Random Forests  Balanced Test Accuracy for Random Forests  \
0                         0.566285                                   0.566285   
1                         0.652209                                   0.652209   
2                         0.594108                        

  y = column_or_1d(y, warn=True)
  randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


Best score GradientBoosting: 
0.75

_____________________________________________

Best score GradientBoosting AIS: 
0.8725490196078431

Best score KNeighbors: 
0.7401960784313726

_____________________________________________

Best score KNeighbors AIS: 
0.8186274509803921

Best score Logistic Regression: 
0.5759803921568627

_____________________________________________

Best score Logistic Regression AIS: 
0.7647058823529411



  warn(
  warn(


Best score RandomForest: 
0.75

Best score RandomForest AIS: 
0.8995098039215685

  Oversample Oversample Run Time        Oversample Paramaters  \
0        AIS           51.172886  MinChange = 0.005, LOF=True   
1      SMOTE            0.004996                           NA   
2       BASE                  NA                           NA   

   ROC Accuracy for Gradient Boosting  \
0                            0.677155   
1                            0.590287   
2                            0.664843   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.677155   
1                                      0.590287   
2                                      0.664843   

   ROC Accuracy for Random Forests  Balanced Test Accuracy for Random Forests  \
0                         0.712038                                   0.712038   
1                         0.689466                                   0.689466   
2                         0.712038          

  y = column_or_1d(y, warn=True)
  randomForest_Base = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1],axis=1))
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)


Best score GradientBoosting: 
0.7057291666666666

_____________________________________________

Best score GradientBoosting AIS: 
0.8567708333333334

Best score KNeighbors: 
0.7109375

_____________________________________________

Best score KNeighbors AIS: 
0.8203125

Best score Logistic Regression: 
0.6171875

_____________________________________________

Best score Logistic Regression AIS: 
0.8098958333333333



  warn(
  warn(


Best score RandomForest: 
0.6901041666666667

Best score RandomForest AIS: 
0.890625

  Oversample Oversample Run Time        Oversample Paramaters  \
0        AIS           55.636309  MinChange = 0.005, LOF=True   
1      SMOTE             0.00905                           NA   
2       BASE                  NA                           NA   

   ROC Accuracy for Gradient Boosting  \
0                            0.563636   
1                            0.627273   
2                            0.681818   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.563636   
1                                      0.627273   
2                                      0.681818   

   ROC Accuracy for Random Forests  Balanced Test Accuracy for Random Forests  \
0                         0.681818                                   0.681818   
1                         0.500000                                   0.500000   
2                         0.554545      

In [36]:
#aisOversample = ArtificialImmuneSystem()
#minority_class = df[df['5'] == 1]
#majority_class = df[df['5'] == 0]

#requiredPopulation = len(majority_class)-len(minority_class)
#population = aisOversample.AIS(minority_class, max_rounds=100, totalPopulation=requiredPopulation)


In [37]:
#Extracting Labels
#Get a list of all columns
#columns = data_train.columns.to_list()
#Remove the label and save it
#columns_drop = columns.pop(-1)

#Remove all labels except for the label in the train and test dataframe
#labels_train = data_train.drop(columns, axis=1)
#labels_test = data_test.drop(columns, axis=1)

#Print the labesl of the test and train
#print(f"labels_train: \n{labels_train}\n")
#print(f"labels_test: \n{labels_test}\n")

#Remove the label from the train and test dataframe
#features_train = data_train.drop(['5'], axis=1)
#features_test = data_test.drop(['5'], axis=1)

#Print the features of the train and test dataset
#print(f"features_train: \n{features_train }\n")
#print(f"lfeatures_test: \n{features_test }\n")