In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, cross_validate

# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#multiclass imports
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier #Will identify the maority calss base line, model needs to do better then the baseline

from statistics import mean
# to reduce randomness then you put the seed
np.random.seed(42)

from ArtificialImmuneSystem import *
from imblearn.metrics import geometric_mean_score
from sklearn.model_selection import KFold

from tabulate import tabulate
import time


In [24]:
df = pd.read_csv('Data\GeneratedSyntheticData-testing.csv')
#df = df.drop('Unnamed: 0', axis=1)

In [25]:
print(f"Data shape: \n{df.shape}\n")
print(f"Data size: \n{df.size}\n")
print(f"Data ndim: \n{df.ndim}\n")
print("_____________________________________________\n")
print(f"Old Class Distribution: {Counter(df['5'])}")

Data shape: 
(300, 6)

Data size: 
1800

Data ndim: 
2

_____________________________________________

Old Class Distribution: Counter({0.0: 247, 1.0: 53})


In [31]:
kf = KFold(n_splits=5)

for result in kf.split(df):
    
    #Print the shape of the train and test set
    data_train = df.iloc[result[0]]
    data_test =  df.iloc[result[1]]
    print(f"Train Data shape: \n{data_train}\n")
    print(f"Test Data shape: \n{data_test}\n")


    data_train_AIS = data_train.copy()
    data_train_SMOTE = data_train.copy()

    #Create an oversampling object
    
    oversample = SMOTE()
    


    oversample_AIS = ArtificialImmuneSystem()
    #Oversample and add to the dataframe to fix the class imbalance
    randomForest = RandomForestClassifier()

    st = time.time()
    x_over, y_over = oversample.fit_resample(data_train_SMOTE.drop(["5"], axis=1), data_train_SMOTE.drop(data_train_SMOTE.columns[0:-1],axis=1))
    elapsed_time_SMOTE = time.time() - st

    st = time.time()
    input_x_over_AIS, y_over_AIS = oversample_AIS.AIS_Resample(data_train_AIS.drop(["5"], axis=1), data_train_AIS.drop(data_train_AIS.columns[0:-1],axis=1), 20, 5, randomForest,5,'balanced_accuracy',min_change = 0.005, use_lof = True)
    elapsed_time_AIS = time.time() - st

    smote_df = pd.concat([x_over, y_over], axis=1)
    ais_df = pd.concat([input_x_over_AIS, y_over_AIS], axis=1)

    # print the dimensionality of the oversampled dataset
    print(f"SMOTE Oversampled Data shape: \n{smote_df.shape}\n")
    print(f"SMOTE Oversampled Data size: \n{smote_df.size}\n")
    print(f"SMOTE Oversampled Data ndim: \n{smote_df.ndim}\n")
    print("_____________________________________________\n")

    # print the dimensionality of the oversampled dataset
    print(f"AIS Oversampled Data shape: \n{ais_df.shape}\n")
    print(f"AIS Oversampled Data size: \n{ais_df.size}\n")
    print(f"AIS Oversampled Data ndim: \n{ais_df.ndim}\n")
    print("_____________________________________________\n")


    # print the new class distribution using a Counter
    print(f"New SMOTE Class Distribution: {Counter(smote_df['5'])}")
    print(f"New AIS Class Distribution: {Counter(ais_df['5'])}")
    # print the new class distribution using a Counter
    print(f"Old Class Distribution: {Counter(data_train['5'])}")

    print("_____________________________________________\n")

    #labelTrainFlat = labels_train.values.ravel()

    #Fit one vs rest Gradient Boosting classification
    gradientBoosting = GradientBoostingClassifier()
    gradientBoosting = gradientBoosting.fit(x_over, y_over.values.ravel())

    gradientBoosting_AIS = GradientBoostingClassifier()
    gradientBoosting_AIS = gradientBoosting.fit(input_x_over_AIS, y_over_AIS.values.ravel())

    #gradientBoosting_Base = gradientBoosting.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Fit RandomForestClassifier classification
    randomForest = RandomForestClassifier()
    randomForest = randomForest.fit(x_over,y_over.values.ravel())

    randomForest = RandomForestClassifier()
    randomForest_AIS  = randomForest.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #randomForest_Base  = randomForest.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Create a KNeighbors classification object
    kNeighbors = KNeighborsClassifier()
    kNeighbors = kNeighbors.fit(x_over,y_over.values.ravel())

    kNeighbors = KNeighborsClassifier()
    kNeighbors_AIS  = kNeighbors.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #kNeighbors_base  = kNeighbors.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Create an LogisticRegression object
    logisticRegression = LogisticRegression(max_iter=5000)
    logisticRegression = logisticRegression.fit(x_over,y_over.values.ravel())

    logisticRegression = LogisticRegression(max_iter=5000)
    logisticRegression_AIS  = logisticRegression.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #logisticRegression_Base  = logisticRegression.fit(data_train.drop(["5"], axis=1), data_train.drop(data_train_AIS.columns[0:-1]).values.ravel())

    #Set the parameters of GradientBoosting for GridSearchCV
    parametersGradientBoosting = [
        {'learning_rate': [0.44,0.45,0.46],'min_samples_leaf': [5,6,7],'min_samples_split': [7,8,9,10], 'n_estimators': [57,58,59,60]}
    ]

    #Set the scoring parameters
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform Gridsearch to find best parameters
    grid_searchGradientBoosting = GridSearchCV(gradientBoosting, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchGradientBoosting_AIS = GridSearchCV(gradientBoosting_AIS, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    #grid_searchGradientBoosting_Base = GridSearchCV(gradientBoosting_Base, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    #Fit the GradientBoosting 
    grid_searchGradientBoosting.fit(x_over, y_over.values.ravel())
    grid_searchGradientBoosting_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())
    

    #Print GridSearchCV Results
    
    print(f"Best score GradientBoosting: \n{grid_searchGradientBoosting.best_score_}\n")
    print("_____________________________________________\n")
    
    print(f"Best score GradientBoosting AIS: \n{grid_searchGradientBoosting_AIS.best_score_}\n")

    #Set the parameters of KNeighbors for GridSearchCV
    parametersKNeighbors = [
        {'n_neighbors': [1,2,3],'weights':['uniform', 'distance'],'algorithm':['auto'], 'p': [1,2,3]}
    ]

    #Set the scoring parameters
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform KNeighbors to find best parameters
    grid_searchKNeighbors = GridSearchCV(kNeighbors, parametersKNeighbors, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchKNeighbors_AIS = GridSearchCV(kNeighbors_AIS, parametersKNeighbors, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the KNeighbors 
    grid_searchKNeighbors.fit(x_over, y_over.values.ravel())
    grid_searchKNeighbors_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print GridSearchCV Results
   
    print(f"Best score KNeighbors: \n{grid_searchKNeighbors.best_score_}\n")
    print("_____________________________________________\n")
   
    print(f"Best score KNeighbors AIS: \n{grid_searchKNeighbors_AIS.best_score_}\n")
    
    #Set the parameters of LogisticRegression for GridSearchCV
    parametersLogisticRegression = [
        {'multi_class': ['ovr'],'penalty':['none','l2'], 'C': [1,2,3]}
    ]
    scoringX = {"roc_auc": "roc_auc", "bal_accuracy": "balanced_accuracy"}

    #Preform LogisticRegression to find best parameters
    grid_searchLogisticRegression = GridSearchCV(logisticRegression, parametersLogisticRegression, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchLogisticRegression_AIS = GridSearchCV(logisticRegression_AIS, parametersLogisticRegression, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the LogisticRegression 
    grid_searchLogisticRegression.fit(x_over, y_over.values.ravel())
    grid_searchLogisticRegression_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print LogisticRegression Results
    
    print(f"Best score Logistic Regression: \n{grid_searchLogisticRegression.best_score_}\n")
    print("_____________________________________________\n")
   
    print(f"Best score Logistic Regression AIS: \n{grid_searchLogisticRegression_AIS.best_score_}\n")

    #Set the parameters of RandomForest for GridSearchCV
    parametersRandomForest = [
        {'n_estimators': [145,150,155,190],'max_depth': [10,12], 'bootstrap': [True, False],
        'min_samples_split': [0.05,2], 'max_features': ['auto']}
    ]

    #Preform Gridsearch to find best parameters
    grid_searchRandomForest = GridSearchCV(randomForest, parametersRandomForest, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')
    grid_searchRandomForest_AIS = GridSearchCV(randomForest_AIS, parametersRandomForest, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

    #Fit the RandomForest 
    grid_searchRandomForest.fit(x_over, y_over.values.ravel())
    grid_searchRandomForest_AIS.fit(input_x_over_AIS,y_over_AIS.values.ravel())

    #Print GridSearchCV Results

    print(f"Best score RandomForest: \n{grid_searchRandomForest.best_score_}\n")

    print(f"Best score RandomForest AIS: \n{grid_searchRandomForest_AIS.best_score_}\n")

    #Get the results for all classifiers 
    cross_val_resultsGB = grid_searchGradientBoosting.cv_results_
    cross_val_resultsRF = grid_searchRandomForest.cv_results_
    cross_val_resultsLR = grid_searchLogisticRegression.cv_results_
    cross_val_resultsKN = grid_searchKNeighbors.cv_results_

    cross_val_resultsGB_AIS = grid_searchGradientBoosting_AIS.cv_results_
    cross_val_resultsRF_AIS = grid_searchRandomForest_AIS.cv_results_
    cross_val_resultsLR_AIS = grid_searchLogisticRegression_AIS.cv_results_
    cross_val_resultsKN_AIS = grid_searchKNeighbors_AIS.cv_results_


    #Print the results of all classiifiers
    #GBC
    mean_test_roc_aucGB = mean(cross_val_resultsGB['mean_test_roc_auc'])
    mean_test_bal_accuracyGB = mean(cross_val_resultsGB['mean_test_bal_accuracy'])
    
    mean_test_roc_aucGB_AIS = mean(cross_val_resultsGB_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyGB_AIS = mean(cross_val_resultsGB_AIS['mean_test_bal_accuracy'])
   
    #RFC
    mean_test_roc_aucRF = mean(cross_val_resultsRF['mean_test_roc_auc'])
    mean_test_bal_accuracyRF = mean(cross_val_resultsRF['mean_test_bal_accuracy'])
    
    mean_test_roc_aucRF_AIS = mean(cross_val_resultsRF_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyRF_AIS = mean(cross_val_resultsRF_AIS['mean_test_bal_accuracy'])
    #LRC
    mean_test_roc_aucLR = mean(cross_val_resultsLR['mean_test_roc_auc'])
    mean_test_bal_accuracyLR = mean(cross_val_resultsLR['mean_test_bal_accuracy'])
    
    mean_test_roc_aucLR_AIS = mean(cross_val_resultsLR_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyLR_AIS = mean(cross_val_resultsLR_AIS['mean_test_bal_accuracy'])

    #KNC
    mean_test_roc_aucKN = mean(cross_val_resultsKN['mean_test_roc_auc'])
    mean_test_bal_accuracyKN = mean(cross_val_resultsKN['mean_test_bal_accuracy'])
    
    mean_test_roc_aucKN_AIS = mean(cross_val_resultsKN_AIS['mean_test_roc_auc'])
    mean_test_bal_accuracyKN_AIS = mean(cross_val_resultsKN_AIS['mean_test_bal_accuracy'])

    predictions_test_over_GB = grid_searchGradientBoosting.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_RF = grid_searchRandomForest.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_LR = grid_searchLogisticRegression.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_KN = grid_searchKNeighbors.best_estimator_.predict(data_test.drop(["5"],axis=1))

    predictions_test_over_GB_AIS = grid_searchGradientBoosting_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_RF_AIS = grid_searchRandomForest_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_LR_AIS = grid_searchLogisticRegression_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))
    predictions_test_over_KN_AIS = grid_searchKNeighbors_AIS.best_estimator_.predict(data_test.drop(["5"],axis=1))

    geometric_mean_score_GB = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_RF = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_LR = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_KN = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN, labels=None, pos_label=1, average='binary',)

    geometric_mean_score_GB_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_GB_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_RF_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_RF_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_LR_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_LR_AIS, labels=None, pos_label=1, average='binary',)
    geometric_mean_score_KN_AIS = geometric_mean_score(data_test.drop(data_test.columns[0:-1],axis=1), predictions_test_over_KN_AIS, labels=None, pos_label=1, average='binary',)

    col_names = ["Base Score", "Oversample","Oversample Run Time", "Oversample Paramaters", "Mean Test ROC Accuracy for Gradient Boosting", "Balanced Test Accuracy for Gradient Boosting", "Mean Test ROC Accuracy for Random Forests", "Balanced Test Accuracy for Random Forests" , "Mean Test ROC Accuracy for Logistic Regression", "Balanced Test Accuracy for Logistic Regression", "Mean Test ROC Accuracy for K Nearest Neighbours", "Balanced Test Accuracy for K Nearest Neighbours", "Geometric Mean Score for Gradient Boosting", "Geometric Mean Score for Random Forest", "Geometric Mean Score for Logestic Regression", "Geometric Mean Score for K Neighbors"]
    dataAIS = ["0","AIS", elapsed_time_AIS,"LOF, MinChange = 0.005", mean_test_roc_aucGB_AIS, mean_test_bal_accuracyGB_AIS, mean_test_roc_aucRF_AIS, mean_test_roc_aucRF_AIS, mean_test_roc_aucLR_AIS, mean_test_bal_accuracyLR_AIS, mean_test_roc_aucKN_AIS, mean_test_bal_accuracyKN_AIS, geometric_mean_score_GB_AIS, geometric_mean_score_RF_AIS, geometric_mean_score_LR_AIS, geometric_mean_score_KN_AIS]
    dataSMOTE = ["0","SMOTE", elapsed_time_SMOTE,"NA", mean_test_roc_aucGB, mean_test_bal_accuracyGB, mean_test_roc_aucRF, mean_test_bal_accuracyRF, mean_test_roc_aucLR, mean_test_bal_accuracyLR, mean_test_roc_aucKN, mean_test_bal_accuracyKN, geometric_mean_score_GB, geometric_mean_score_RF, geometric_mean_score_LR, geometric_mean_score_KN]
    data = [dataAIS,dataSMOTE]

    dfoutput=pd.DataFrame(data,columns=col_names)
    print(dfoutput)
    title = "Data/ExperamentalComparisons - Dec 04.csv"
    dfoutput.to_csv(title, mode='a') 

Train Data shape: 
            0         1         2         3         4    5
60   0.062228 -0.182247  0.996821 -0.073843 -0.170173  0.0
61   0.079536 -0.232938 -0.574557 -0.094382 -0.217506  0.0
62  -0.037663  0.110303 -1.411624  0.044693  0.102996  0.0
63  -0.048642  0.142458  0.976490  0.057721  0.133021  0.0
64   0.148881 -0.436029  1.447637 -0.176671 -0.407143  0.0
..        ...       ...       ...       ...       ...  ...
295  0.083178 -0.243602  0.357564 -0.098703 -0.227464  0.0
296  0.053356 -0.156265 -2.764434 -0.063316 -0.145913  0.0
297  0.146055 -0.427751 -1.035479 -0.173317 -0.399413  0.0
298 -0.006978  0.020435  1.891368  0.008280  0.019081  0.0
299  0.075849 -0.222139 -0.515618 -0.090007 -0.207423  0.0

[240 rows x 6 columns]

Test Data shape: 
           0         1         2         3         4    5
0   0.102429 -0.299983 -1.282116 -0.121547 -0.280109  0.0
1   0.058071 -0.170074  1.942940 -0.068911 -0.158807  0.0
2  -0.029416  0.086150  0.477215  0.034906  0.080443  0.

  warn(
  warn(


Best score RandomForest: 
0.7141326530612244

Best score RandomForest AIS: 
0.9082142857142856

  Base Score Oversample  Oversample Run Time   Oversample Paramaters  \
0          0        AIS            64.681926  LOF, MinChange = 0.005   
1          0      SMOTE             0.005000                      NA   

   Mean Test ROC Accuracy for Gradient Boosting  \
0                                      0.919044   
1                                      0.810897   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.871384   
1                                      0.740406   

   Mean Test ROC Accuracy for Random Forests  \
0                                   0.928984   
1                                   0.791862   

   Balanced Test Accuracy for Random Forests  \
0                                   0.928984   
1                                   0.791862   

   Mean Test ROC Accuracy for Logistic Regression  \
0                                   

  warn(
  warn(


Best score RandomForest: 
0.7217261904761905

Best score RandomForest AIS: 
0.8960990646258504

  Base Score Oversample  Oversample Run Time   Oversample Paramaters  \
0          0        AIS            79.855498  LOF, MinChange = 0.005   
1          0      SMOTE             0.006066                      NA   

   Mean Test ROC Accuracy for Gradient Boosting  \
0                                      0.922845   
1                                      0.821707   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.860536   
1                                      0.738486   

   Mean Test ROC Accuracy for Random Forests  \
0                                   0.904745   
1                                   0.800554   

   Balanced Test Accuracy for Random Forests  \
0                                   0.904745   
1                                   0.800554   

   Mean Test ROC Accuracy for Logistic Regression  \
0                                   

  warn(
  warn(


Best score RandomForest: 
0.7525

Best score RandomForest AIS: 
0.9124999999999999

  Base Score Oversample  Oversample Run Time   Oversample Paramaters  \
0          0        AIS            68.079233  LOF, MinChange = 0.005   
1          0      SMOTE             0.003999                      NA   

   Mean Test ROC Accuracy for Gradient Boosting  \
0                                      0.941410   
1                                      0.872089   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.872396   
1                                      0.770365   

   Mean Test ROC Accuracy for Random Forests  \
0                                   0.940927   
1                                   0.845889   

   Balanced Test Accuracy for Random Forests  \
0                                   0.940927   
1                                   0.845889   

   Mean Test ROC Accuracy for Logistic Regression  \
0                                        0.76146

  warn(
  warn(


Best score RandomForest: 
0.7401960784313726

Best score RandomForest AIS: 
0.8995098039215685

  Base Score Oversample  Oversample Run Time   Oversample Paramaters  \
0          0        AIS            64.090479  LOF, MinChange = 0.005   
1          0      SMOTE             0.005976                      NA   

   Mean Test ROC Accuracy for Gradient Boosting  \
0                                      0.935442   
1                                      0.808435   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.871630   
1                                      0.723465   

   Mean Test ROC Accuracy for Random Forests  \
0                                   0.938936   
1                                   0.810289   

   Balanced Test Accuracy for Random Forests  \
0                                   0.938936   
1                                   0.810289   

   Mean Test ROC Accuracy for Logistic Regression  \
0                                   

  warn(
  warn(


Best score RandomForest: 
0.7395833333333333

Best score RandomForest AIS: 
0.8984375

  Base Score Oversample  Oversample Run Time   Oversample Paramaters  \
0          0        AIS           141.449303  LOF, MinChange = 0.005   
1          0      SMOTE             0.005745                      NA   

   Mean Test ROC Accuracy for Gradient Boosting  \
0                                      0.908013   
1                                      0.797010   

   Balanced Test Accuracy for Gradient Boosting  \
0                                      0.844184   
1                                      0.714500   

   Mean Test ROC Accuracy for Random Forests  \
0                                   0.899972   
1                                   0.796046   

   Balanced Test Accuracy for Random Forests  \
0                                   0.899972   
1                                   0.796046   

   Mean Test ROC Accuracy for Logistic Regression  \
0                                        0.46

In [27]:
#aisOversample = ArtificialImmuneSystem()
#minority_class = df[df['5'] == 1]
#majority_class = df[df['5'] == 0]

#requiredPopulation = len(majority_class)-len(minority_class)
#population = aisOversample.AIS(minority_class, max_rounds=100, totalPopulation=requiredPopulation)


In [28]:
#Extracting Labels
#Get a list of all columns
#columns = data_train.columns.to_list()
#Remove the label and save it
#columns_drop = columns.pop(-1)

#Remove all labels except for the label in the train and test dataframe
#labels_train = data_train.drop(columns, axis=1)
#labels_test = data_test.drop(columns, axis=1)

#Print the labesl of the test and train
#print(f"labels_train: \n{labels_train}\n")
#print(f"labels_test: \n{labels_test}\n")

#Remove the label from the train and test dataframe
#features_train = data_train.drop(['5'], axis=1)
#features_test = data_test.drop(['5'], axis=1)

#Print the features of the train and test dataset
#print(f"features_train: \n{features_train }\n")
#print(f"lfeatures_test: \n{features_test }\n")