In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from imblearn.over_sampling import SMOTE, RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV, cross_validate

# importing two different imputation methods that take into consideration all the features when predicting the missing values
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#multiclass imports
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.dummy import DummyClassifier #Will identify the maority calss base line, model needs to do better then the baseline

from statistics import mean
# to reduce randomness then you put the seed
np.random.seed(42)

from ArtificialImmuneSystem import *

In [28]:
df = pd.read_csv('Data\GeneratedSyntheticData-NoiselessInformativeEasy.csv')
df = df.drop('Unnamed: 0', axis=1)

In [29]:
print(f"Data shape: \n{df.shape}\n")
print(f"Data size: \n{df.size}\n")
print(f"Data ndim: \n{df.ndim}\n")
print("_____________________________________________\n")
print(f"Old Class Distribution: {Counter(df['5'])}")

Data shape: 
(300, 6)

Data size: 
1800

Data ndim: 
2

_____________________________________________

Old Class Distribution: Counter({0.0: 210, 1.0: 90})


In [30]:
#Create an oversampling object
oversample = SMOTE()
#Oversample and add to the dataframe to fix the class imbalance
x_over, y_over = oversample.fit_resample(df.drop(["5"], axis=1), df.drop(df.columns[0:-1],axis=1))
smote_df = pd.concat([x_over, y_over], axis=1)

# print the dimensionality of the oversampled dataset
print(f"Oversampled Data shape: \n{df.shape}\n")
print(f"Oversampled Data size: \n{df.size}\n")
print(f"Oversampled Data ndim: \n{df.ndim}\n")
print("_____________________________________________\n")


# print the new class distribution using a Counter
print(f"New Class Distribution: {Counter(smote_df['5'])}")

print("_____________________________________________\n")

Oversampled Data shape: 
(300, 6)

Oversampled Data size: 
1800

Oversampled Data ndim: 
2

_____________________________________________

New Class Distribution: Counter({0.0: 210, 1.0: 210})
_____________________________________________



In [31]:
#aisOversample = ArtificialImmuneSystem()
#minority_class = df[df['5'] == 1]
#majority_class = df[df['5'] == 0]

#requiredPopulation = len(majority_class)-len(minority_class)
#population = aisOversample.AIS(minority_class, max_rounds=100, totalPopulation=requiredPopulation)


In [32]:
#Split the dataset into a train set = 80% and test = 20%
data_train, data_test = train_test_split(df, test_size=0.2, random_state=42)

#Print the shape of the train and test set
print(f"Train Data shape: \n{data_train}\n")
print(f"Test Data shape: \n{data_test}\n")

Train Data shape: 
            0         1         2         3         4    5
232 -4.691545 -4.897106 -7.574577 -6.690183 -4.943022  0.0
59  -5.540908 -3.826580 -5.718433 -5.464299 -7.812865  0.0
6   -5.558902 -4.469969 -4.521977 -1.187704 -3.489468  0.0
185  3.530166  6.453537  5.518635 -5.984399 -3.861131  1.0
173 -3.874927 -4.519557 -6.019036 -5.568689 -4.379450  0.0
..        ...       ...       ...       ...       ...  ...
188 -4.382753 -4.156182 -4.945608 -4.251797 -4.573749  0.0
71  -4.917879 -5.336254 -5.289513 -8.989536 -6.432483  0.0
106  4.376757  7.063312  6.773084 -5.074527 -4.982809  1.0
270 -5.132905 -5.448679 -3.954188 -3.423394 -2.365563  0.0
102 -5.199458 -4.936755 -6.149472 -5.588318 -4.449639  0.0

[240 rows x 6 columns]

Test Data shape: 
            0         1         2         3         4    5
203 -5.452907 -4.913225 -3.880859 -4.158012 -4.186854  0.0
266 -5.466881 -6.877064 -6.090096 -4.461881 -4.315136  0.0
152  4.699258  5.361881  6.027668 -4.969582 -4.990885

In [33]:
#Extracting Labels
#Get a list of all columns
columns = data_train.columns.to_list()
#Remove the label and save it
columns_drop = columns.pop(-1)

#Remove all labels except for the label in the train and test dataframe
labels_train = data_train.drop(columns, axis=1)
labels_test = data_test.drop(columns, axis=1)

#Print the labesl of the test and train
print(f"labels_train: \n{labels_train}\n")
print(f"labels_test: \n{labels_test}\n")

#Remove the label from the train and test dataframe
features_train = data_train.drop(['5'], axis=1)
features_test = data_test.drop(['5'], axis=1)

#Print the features of the train and test dataset
print(f"features_train: \n{features_train }\n")
print(f"lfeatures_test: \n{features_test }\n")

labels_train: 
       5
232  0.0
59   0.0
6    0.0
185  1.0
173  0.0
..   ...
188  0.0
71   0.0
106  1.0
270  0.0
102  0.0

[240 rows x 1 columns]

labels_test: 
       5
203  0.0
266  0.0
152  1.0
9    0.0
233  1.0
226  1.0
196  1.0
109  1.0
5    0.0
175  0.0
237  1.0
57   0.0
218  0.0
45   0.0
182  0.0
221  0.0
289  0.0
211  0.0
148  0.0
165  0.0
78   0.0
113  1.0
249  0.0
250  1.0
104  0.0
42   1.0
281  1.0
295  0.0
157  0.0
238  0.0
17   1.0
164  0.0
33   0.0
24   1.0
215  1.0
119  0.0
7    0.0
90   0.0
46   0.0
73   0.0
93   0.0
76   0.0
286  1.0
60   1.0
77   0.0
63   0.0
234  1.0
229  0.0
111  0.0
231  0.0
180  0.0
144  0.0
239  1.0
75   0.0
297  0.0
278  0.0
97   0.0
92   0.0
192  0.0
25   0.0

features_train: 
            0         1         2         3         4
232 -4.691545 -4.897106 -7.574577 -6.690183 -4.943022
59  -5.540908 -3.826580 -5.718433 -5.464299 -7.812865
6   -5.558902 -4.469969 -4.521977 -1.187704 -3.489468
185  3.530166  6.453537  5.518635 -5.984399 -3.861131
1

In [34]:
labelTrainFlat = labels_train.values.ravel()

#Fit one vs rest Gradient Boosting classification
gradientBoosting = GradientBoostingClassifier()
gradientBoosting = gradientBoosting.fit(features_train, labelTrainFlat)

#Fit RandomForestClassifier classification
randomForest = RandomForestClassifier()
randomForest = randomForest.fit(features_train,labelTrainFlat)

#Create a KNeighbors classification object
kNeighbors = KNeighborsClassifier()
kNeighbors = kNeighbors.fit(features_train,labelTrainFlat)

#Create an LogisticRegression object
logisticRegression = LogisticRegression(max_iter=5000)
logisticRegression = logisticRegression.fit(features_train,labelTrainFlat)

In [35]:
#Set the parameters of GradientBoosting for GridSearchCV
parametersGradientBoosting = [
    {'learning_rate': [0.44,0.45,0.46],'min_samples_leaf': [5,6,7],'min_samples_split': [7,8,9,10], 'n_estimators': [57,58,59,60]}
]

#Set the scoring parameters
scoringX = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}

#Preform Gridsearch to find best parameters
grid_searchGradientBoosting = GridSearchCV(gradientBoosting, parametersGradientBoosting, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

#Fit the GradientBoosting 
grid_searchGradientBoosting.fit(features_train, labelTrainFlat)

#Print GridSearchCV Results
print(f"Best parameters GradientBoosting: \n{grid_searchGradientBoosting.best_params_}\n")
print(f"Best estimator GradientBoosting: \n{grid_searchGradientBoosting.best_estimator_}\n")
print(f"Best score GradientBoosting: \n{grid_searchGradientBoosting.best_score_}\n")

Best parameters GradientBoosting: 
{'learning_rate': 0.44, 'min_samples_leaf': 5, 'min_samples_split': 7, 'n_estimators': 57}

Best estimator GradientBoosting: 
GradientBoostingClassifier(learning_rate=0.44, min_samples_leaf=5,
                           min_samples_split=7, n_estimators=57)

Best score GradientBoosting: 
1.0



In [36]:


#Set the parameters of KNeighbors for GridSearchCV
parametersKNeighbors = [
    {'n_neighbors': [1,2,3],'weights':['uniform', 'distance'],'algorithm':['auto'], 'p': [1,2,3]}
]

#Set the scoring parameters
scoringX = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}

#Preform KNeighbors to find best parameters
grid_searchKNeighbors = GridSearchCV(kNeighbors, parametersKNeighbors, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

#Fit the KNeighbors 
grid_searchKNeighbors.fit(features_train, labelTrainFlat)

#Print GridSearchCV Results
print(f"Best parameters KNeighbors: \n{grid_searchKNeighbors.best_params_}\n")
print(f"Best estimator KNeighbors: \n{grid_searchKNeighbors.best_estimator_}\n")
print(f"Best score KNeighbors: \n{grid_searchKNeighbors.best_score_}\n")

Best parameters KNeighbors: 
{'algorithm': 'auto', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}

Best estimator KNeighbors: 
KNeighborsClassifier(n_neighbors=1, p=1)

Best score KNeighbors: 
1.0



In [37]:


#Set the parameters of LogisticRegression for GridSearchCV
parametersLogisticRegression = [
    {'multi_class': ['ovr'],'penalty':['none','l2'], 'C': [1,2,3]}
]
scoringX = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}

#Preform LogisticRegression to find best parameters
grid_searchLogisticRegression = GridSearchCV(logisticRegression, parametersLogisticRegression, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

#Fit the LogisticRegression 
grid_searchLogisticRegression.fit(features_train, labelTrainFlat)

#Print LogisticRegression Results
print(f"Best parameters Logistic Regression: \n{grid_searchLogisticRegression.best_params_}\n")
print(f"Best estimator Logistic Regression: \n{grid_searchLogisticRegression.best_estimator_}\n")
print(f"Best score Logistic Regression: \n{grid_searchLogisticRegression.best_score_}\n")

Best parameters Logistic Regression: 
{'C': 1, 'multi_class': 'ovr', 'penalty': 'none'}

Best estimator Logistic Regression: 
LogisticRegression(C=1, max_iter=5000, multi_class='ovr', penalty='none')

Best score Logistic Regression: 
1.0



In [39]:
#Set the parameters of RandomForest for GridSearchCV
parametersRandomForest = [
    {'n_estimators': [145,150,155,190],'max_depth': [10,12], 'bootstrap': [True, False],
     'min_samples_split': [0.05,2], 'max_features': ['auto']}
]

#Preform Gridsearch to find best parameters
grid_searchRandomForest = GridSearchCV(randomForest, parametersRandomForest, cv=4, scoring = scoringX, return_train_score=True, n_jobs=-1, refit='bal_accuracy')

#Fit the RandomForest 
grid_searchRandomForest.fit(features_train, labelTrainFlat)

#Print GridSearchCV Results
print(f"Best parameters RandomForest: \n{grid_searchRandomForest.best_params_}\n")
print(f"Best estimator RandomForest: \n{grid_searchRandomForest.best_estimator_}\n")
print(f"Best score RandomForest: \n{grid_searchRandomForest.best_score_}\n")

Best parameters RandomForest: 
{'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_split': 0.05, 'n_estimators': 145}

Best estimator RandomForest: 
RandomForestClassifier(max_depth=10, max_features='auto',
                       min_samples_split=0.05, n_estimators=145)

Best score RandomForest: 
1.0



  warn(


In [40]:
#Get the results for all classifiers 
cross_val_resultsGB = grid_searchGradientBoosting.cv_results_
cross_val_resultsRF = grid_searchRandomForest.cv_results_
cross_val_resultsLR = grid_searchLogisticRegression.cv_results_
cross_val_resultsKN = grid_searchKNeighbors.cv_results_


#Print the results of all classiifiers
#GBC
print(f"Mean Test Accuracy for Gradient Boosting: \n{mean(cross_val_resultsGB['mean_test_accuracy'])}\n")
print(f"Balanced Test Accuracy for Gradient Boosting: \n{mean(cross_val_resultsGB['mean_test_bal_accuracy'])}\n")
print(f"Mean F1 Macro for Gradient Boosting: \n{mean(cross_val_resultsGB['mean_test_F1_macro'])}\n")

#RFC
print(f"Mean Test Accuracy for Random Forests: \n{mean(cross_val_resultsRF['mean_test_accuracy'])}\n")
print(f"Balanced Test Accuracy for Random Forests: \n{mean(cross_val_resultsRF['mean_test_bal_accuracy'])}\n")
print(f"Mean F1 Macro for Random Forests: \n{mean(cross_val_resultsRF['mean_test_F1_macro'])}\n")

#LRC
print(f"Mean Test Accuracy for Logistic Regression: \n{mean(cross_val_resultsLR['mean_test_accuracy'])}\n")
print(f"Balanced Test Accuracy for Logistic Regression: \n{mean(cross_val_resultsLR['mean_test_bal_accuracy'])}\n")
print(f"Mean F1 Macro for Logistic Regression: \n{mean(cross_val_resultsLR['mean_test_F1_macro'])}\n")

#KNC
print(f"Mean Test Accuracy for K Nearest Neighbours: \n{mean(cross_val_resultsKN['mean_test_accuracy'])}\n")
print(f"Balanced Test Accuracy for K Nearest Neighbours: \n{mean(cross_val_resultsKN['mean_test_bal_accuracy'])}\n")
print(f"Mean F1 Macro for Logistic K Nearest Neighbours: \n{mean(cross_val_resultsKN['mean_test_F1_macro'])}\n")

Mean Test Accuracy for Gradient Boosting: 
1.0

Balanced Test Accuracy for Gradient Boosting: 
1.0

Mean F1 Macro for Gradient Boosting: 
1.0

Mean Test Accuracy for Random Forests: 
1.0

Balanced Test Accuracy for Random Forests: 
1.0

Mean F1 Macro for Random Forests: 
1.0

Mean Test Accuracy for Logistic Regression: 
1.0

Balanced Test Accuracy for Logistic Regression: 
1.0

Mean F1 Macro for Logistic Regression: 
1.0

Mean Test Accuracy for K Nearest Neighbours: 
1.0

Balanced Test Accuracy for K Nearest Neighbours: 
1.0

Mean F1 Macro for Logistic K Nearest Neighbours: 
1.0

