1. Random Forest classification

In [2]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

data = pd.read_csv('ds3_normalized.csv')
X = data[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AP', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AW', 'AX', 'AY', 'AZ', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR']]
y = data['BS']

# Convert target variable to a one-dimensional array
y = y.ravel()

# Initialize the MLP Classifier
clf = MLPClassifier(hidden_layer_sizes=(20, 10), random_state=5, verbose=True, learning_rate_init=0.01)

# Define the number of folds (k)
num_folds = 10

# Initialize KFold with the specified number of folds
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracies = []
sensitivities = []
specificities = []
precisions = []
recalls = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)

    # Evaluate the model and calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    tp = np.sum((y_pred == 1) & (y_test == 1))
    tn = np.sum((y_pred == -1) & (y_test == -1))
    fp = np.sum((y_pred == -1) & (y_test == 1))
    fn = np.sum((y_pred == 1) & (y_test == -1))
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    accuracies.append(accuracy)
    sensitivities.append(sensitivity)
    specificities.append(specificity)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)

# Calculate the average metrics across all folds
mean_accuracy = np.mean(accuracies)
mean_sensitivity = np.mean(sensitivities)
mean_specificity = np.mean(specificities)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_f1_score = np.mean(f1_scores)

print('Mean accuracy =', mean_accuracy)
print('Mean sensitivity =', mean_sensitivity)
print('Mean specificity =', mean_specificity)
print('Mean precision =', mean_precision)
print('Mean recall =', mean_recall)
print('Mean f1-score =', mean_f1_score)


Iteration 1, loss = 0.05838202
Iteration 2, loss = 0.03388066
Iteration 3, loss = 0.02810062
Iteration 4, loss = 0.03048016
Iteration 5, loss = 0.02682925
Iteration 6, loss = 0.02729952
Iteration 7, loss = 0.02411756
Iteration 8, loss = 0.02716487
Iteration 9, loss = 0.02309559
Iteration 10, loss = 0.02251507
Iteration 11, loss = 0.02223553
Iteration 12, loss = 0.02275422
Iteration 13, loss = 0.02096732
Iteration 14, loss = 0.01999285
Iteration 15, loss = 0.02105753
Iteration 16, loss = 0.02177573
Iteration 17, loss = 0.01987187
Iteration 18, loss = 0.01911567
Iteration 19, loss = 0.02065341
Iteration 20, loss = 0.01965351
Iteration 21, loss = 0.01917630
Iteration 22, loss = 0.01934509
Iteration 23, loss = 0.01924165
Iteration 24, loss = 0.01923527
Iteration 25, loss = 0.01992322
Iteration 26, loss = 0.01954303
Iteration 27, loss = 0.01881877
Iteration 28, loss = 0.01886583
Iteration 29, loss = 0.01948347
Iteration 30, loss = 0.01846505
Iteration 31, loss = 0.01806314
Iteration 32, los

Iteration 62, loss = 0.01599423
Iteration 63, loss = 0.01514558
Iteration 64, loss = 0.01736880
Iteration 65, loss = 0.01550232
Iteration 66, loss = 0.01613183
Iteration 67, loss = 0.01527053
Iteration 68, loss = 0.01548325
Iteration 69, loss = 0.01578373
Iteration 70, loss = 0.01661978
Iteration 71, loss = 0.01562612
Iteration 72, loss = 0.01578155
Iteration 73, loss = 0.01476794
Iteration 74, loss = 0.01525808
Iteration 75, loss = 0.01638250
Iteration 76, loss = 0.01561332
Iteration 77, loss = 0.01511508
Iteration 78, loss = 0.01557894
Iteration 79, loss = 0.01523463
Iteration 80, loss = 0.01477837
Iteration 81, loss = 0.01499128
Iteration 82, loss = 0.01517547
Iteration 83, loss = 0.01536337
Iteration 84, loss = 0.01600868
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.05984926
Iteration 2, loss = 0.03823009
Iteration 3, loss = 0.03115289
Iteration 4, loss = 0.02867560
Iteration 5, loss = 0.02602245
Iteration 6, loss =

Iteration 20, loss = 0.02147838
Iteration 21, loss = 0.02359376
Iteration 22, loss = 0.02364367
Iteration 23, loss = 0.02292047
Iteration 24, loss = 0.02335331
Iteration 25, loss = 0.02177221
Iteration 26, loss = 0.02101720
Iteration 27, loss = 0.01951232
Iteration 28, loss = 0.02136317
Iteration 29, loss = 0.02159636
Iteration 30, loss = 0.02192984
Iteration 31, loss = 0.02176727
Iteration 32, loss = 0.02113155
Iteration 33, loss = 0.02171158
Iteration 34, loss = 0.01911291
Iteration 35, loss = 0.01783596
Iteration 36, loss = 0.01743676
Iteration 37, loss = 0.01797515
Iteration 38, loss = 0.02025816
Iteration 39, loss = 0.01815964
Iteration 40, loss = 0.01617043
Iteration 41, loss = 0.01749614
Iteration 42, loss = 0.01786424
Iteration 43, loss = 0.01728262
Iteration 44, loss = 0.02029719
Iteration 45, loss = 0.02011033
Iteration 46, loss = 0.02088515
Iteration 47, loss = 0.02162862
Iteration 48, loss = 0.02111137
Iteration 49, loss = 0.02060501
Iteration 50, loss = 0.01976423
Iteratio

In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('ds3_normalized.csv')
X = data[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AP', 'AQ', 'AR', 'AS', 'AT', 'AU', 'AV', 'AW', 'AX', 'AY', 'AZ', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR']]
y = data['BS']

# Convert target variable to a one-dimensional array
y = y.ravel()

# Initialize the Random Forest Classifier
forest = RandomForestClassifier(criterion='gini', n_estimators=5, random_state=1, n_jobs=2)

# Define the number of folds (k)
num_folds = 10

# Initialize KFold with the specified number of folds
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics
accuracies = []
sensitivities = []
specificities = []
precisions = []
recalls = []
f1_scores = []

# Perform k-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the model
    forest.fit(X_train, y_train)

    # Predict on the test set
    y_pred = forest.predict(X_test)

    # Evaluate the model and calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    tp = np.sum((y_pred == 1) & (y_test == 1))
    tn = np.sum((y_pred == -1) & (y_test == -1))
    fp = np.sum((y_pred == -1) & (y_test == 1))
    fn = np.sum((y_pred == 1) & (y_test == -1))
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1_score = 2 * (precision * recall) / (precision + recall)

    accuracies.append(accuracy)
    sensitivities.append(sensitivity)
    specificities.append(specificity)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1_score)

# Calculate the average metrics across all folds
mean_accuracy = np.mean(accuracies)
mean_sensitivity = np.mean(sensitivities)
mean_specificity = np.mean(specificities)
mean_precision = np.mean(precisions)
mean_recall = np.mean(recalls)
mean_f1_score = np.mean(f1_scores)

print('Mean accuracy =', mean_accuracy)
print('Mean sensitivity =', mean_sensitivity)
print('Mean specificity =', mean_specificity)
print('Mean precision =', mean_precision)
print('Mean recall =', mean_recall)
print('Mean f1-score =', mean_f1_score)


Mean accuracy = 0.9999113903300112
Mean sensitivity = 0.9999531699772826
Mean specificity = 0.9998568812983601
Mean precision = 0.9998905338079297
Mean recall = 0.9999531699772826
Mean f1-score = 0.9999218494370922


In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

data=pd.read_csv('ds3_normalized.csv')
X=data[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG','AH','AI','AJ','AK','AL','AM','AN','AO','AP','AQ','AR','AS','AT','AU','AV','AW','AX','AY','AZ','BA','BB','BC','BD','BE','BF','BG','BH','BI','BJ','BK','BL','BM','BN','BO','BP','BQ','BR']]
y=data['BS']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 

In [2]:
data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,BJ,BK,BL,BM,BN,BO,BP,BQ,BR,BS
0,-1.0,-1.0,-1.0,0.96315,-0.78457,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.88299,-0.61528,-0.5,-0.98798,-0.89144,-0.96915,-0.86827,-0.91658,-1
1,-1.0,-1.0,-1.0,0.87268,-0.89746,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.92415,-0.82435,-0.78,-1.0,-0.84727,-0.87931,-0.71333,-0.9153,-1
2,-1.0,-1.0,-1.0,0.99997,-0.79803,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.82964,-0.45823,-0.52,-1.0,-0.85864,-0.85558,-0.7,-0.90841,-1
3,-1.0,-1.0,-1.0,0.90047,-0.85293,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.9157,-0.60502,-0.572,-1.0,-0.85323,-0.89725,-0.74833,-0.9112,-1
4,-1.0,-1.0,-1.0,0.95589,-0.82393,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.82776,-0.6635,-0.612,-0.98987,-0.84832,-0.88457,-0.74667,-0.91184,-1


In [3]:
#---------------------------------------------------------- Random Forest Classification, Train
start = time.time()
forest = RandomForestClassifier(criterion='gini',
                                 n_estimators=5,
                                 random_state=1,
                                 n_jobs=2)

forest.fit(X_train, y_train)

end = time.time()
print('time of train = ' , end - start, ' sec')


time of train =  1.5369133949279785  sec


In [4]:
#--------------------------------------------------------- Random Forest Classification, Test
start = time.time()
y_pred = forest.predict(X_test)
end = time.time()
print('time of test = ' , end - start, ' sec')

#--------------------------------------------------------- Random Forest Classification, Evaluations
#print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

y_test=y_test.ravel()

time of test =  0.03931474685668945  sec


In [5]:
#ypred=ypred.ravel()

TP=0
TN=0
FP=0
FN=0

for i in range(0,len(y_test),1):
    if y_pred[i] == 1 and y_test[i] == 1:   TP=TP+1
    if y_pred[i] == -1 and y_test[i] == -1: TN=TN+1
    if y_pred[i] == -1 and y_test[i] == 1:  FP=FP+1
    if y_pred[i] == 1 and y_test[i] == -1:  FN=FN+1

print('TP = ' , TP)
print('TN = ' , TN)
print('FP = ' , FP)
print('FN = ' , FN)

accuracy=(TP+TN)/(TP+TN+FP+FN);
sensitivity=TP/(TP+FN)
specificity=TN/(TN+FP)
precision=TP/(TP+FP);
recall=TP/(TP+FN);
f1=2*(precision*recall)/(precision+recall);

print('accuracy = ' , accuracy)
print('sensitivity = ' , sensitivity)
print('specificity = ' , specificity)
print('precision = ' , precision)
print('recall = ' , recall)
print('f1-score = ' , f1)

TP =  12813
TN =  9753
FP =  4
FN =  1
accuracy =  0.9997784768065217
sensitivity =  0.9999219603558608
specificity =  0.9995900379214923
precision =  0.9996879144885699
recall =  0.9999219603558608
f1-score =  0.9998049237251765


2. MLP classification

In [7]:
import numpy as np
import pandas as pd
import time

data=pd.read_csv('ds3_normalized.csv')
X=data[['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG','AH','AI','AJ','AK','AL','AM','AN','AO','AP','AQ','AR','AS','AT','AU','AV','AW','AX','AY','AZ','BA','BB','BC','BD','BE','BF','BG','BH','BI','BJ','BK','BL','BM','BN','BO','BP','BQ','BR']]
y=data['BS']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)  

from sklearn.neural_network import MLPClassifier

start = time.time()
clf = MLPClassifier(hidden_layer_sizes=(20,10),
                    random_state=5,
                    verbose=True,
                    learning_rate_init=0.01)
# Fit data onto the model
clf.fit(X_train,y_train)

end = time.time()
print('time of train = ' , end - start, ' sec')

Iteration 1, loss = 0.06266024
Iteration 2, loss = 0.03729090
Iteration 3, loss = 0.03518242
Iteration 4, loss = 0.03210841
Iteration 5, loss = 0.03133717
Iteration 6, loss = 0.03036251
Iteration 7, loss = 0.02805827
Iteration 8, loss = 0.02585358
Iteration 9, loss = 0.02408878
Iteration 10, loss = 0.02449205
Iteration 11, loss = 0.02253264
Iteration 12, loss = 0.02202183
Iteration 13, loss = 0.02257307
Iteration 14, loss = 0.02145698
Iteration 15, loss = 0.02116277
Iteration 16, loss = 0.02109177
Iteration 17, loss = 0.02032637
Iteration 18, loss = 0.02066371
Iteration 19, loss = 0.02156367
Iteration 20, loss = 0.01993838
Iteration 21, loss = 0.01907146
Iteration 22, loss = 0.01908540
Iteration 23, loss = 0.01913366
Iteration 24, loss = 0.01864884
Iteration 25, loss = 0.01773710
Iteration 26, loss = 0.01897369
Iteration 27, loss = 0.01884766
Iteration 28, loss = 0.01889274
Iteration 29, loss = 0.01883625
Iteration 30, loss = 0.01895932
Iteration 31, loss = 0.01860651
Iteration 32, los

In [8]:
data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,BJ,BK,BL,BM,BN,BO,BP,BQ,BR,BS
0,-1.0,-1.0,-1.0,0.96315,-0.78457,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.88299,-0.61528,-0.5,-0.98798,-0.89144,-0.96915,-0.86827,-0.91658,-1
1,-1.0,-1.0,-1.0,0.87268,-0.89746,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.92415,-0.82435,-0.78,-1.0,-0.84727,-0.87931,-0.71333,-0.9153,-1
2,-1.0,-1.0,-1.0,0.99997,-0.79803,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.82964,-0.45823,-0.52,-1.0,-0.85864,-0.85558,-0.7,-0.90841,-1
3,-1.0,-1.0,-1.0,0.90047,-0.85293,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.9157,-0.60502,-0.572,-1.0,-0.85323,-0.89725,-0.74833,-0.9112,-1
4,-1.0,-1.0,-1.0,0.95589,-0.82393,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-0.82776,-0.6635,-0.612,-0.98987,-0.84832,-0.88457,-0.74667,-0.91184,-1


In [9]:
start = time.time()

ypred=clf.predict(X_test)

end = time.time()
print('time of test = ' , end - start, ' sec')


time of test =  0.25688695907592773  sec


In [10]:
y_test=y_test.ravel()

TP=0
TN=0
FP=0
FN=0

for i in range(0,len(y_test),1):
    if ypred[i] == 1 and y_test[i] == 1:   TP=TP+1
    if ypred[i] == -1 and y_test[i] == -1: TN=TN+1
    if ypred[i] == -1 and y_test[i] == 1:  FP=FP+1
    if ypred[i] == 1 and y_test[i] == -1:  FN=FN+1

print('TP = ' , TP)
print('TN = ' , TN)
print('FP = ' , FP)
print('FN = ' , FN)


TP =  12718
TN =  9739
FP =  99
FN =  15


In [11]:
accuracy=(TP+TN)/(TP+TN+FP+FN);
sensitivity=TP/(TP+FN)
specificity=TN/(TN+FP)
precision=TP/(TP+FP);
recall=TP/(TP+FN);
f1=2*(precision*recall)/(precision+recall);

print('accuracy = ' , accuracy)
print('sensitivity = ' , sensitivity)
print('specificity = ' , specificity)
print('precision = ' , precision)
print('recall = ' , recall)
print('f1-score = ' , f1)

accuracy =  0.9949492711886935
sensitivity =  0.9988219586900181
specificity =  0.9899369790607847
precision =  0.9922758835921043
recall =  0.9988219586900181
f1-score =  0.9955381604696674
