In [89]:
'''Felix Andersson, Janine de Vries, DV2626'''

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time


In [90]:
spam_data = pd.read_csv('spambase.data', delimiter=',')
spam_data.head()
spam_data.drop_duplicates(inplace=True)

x = spam_data.iloc[:, :-1]
y = spam_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


print("X:\n", x.head()) 
print("Y:\n", y.head())  

X:
       0  0.64  0.64.1  0.1  0.32   0.2   0.3   0.4   0.5   0.6  ...  0.40  \
0  0.21  0.28    0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...   0.0   
1  0.06  0.00    0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...   0.0   
2  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
3  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
4  0.00  0.00    0.00  0.0  1.85  0.00  0.00  1.85  0.00  0.00  ...   0.0   

   0.41   0.42  0.43  0.778   0.44   0.45  3.756   61   278  
0  0.00  0.132   0.0  0.372  0.180  0.048  5.114  101  1028  
1  0.01  0.143   0.0  0.276  0.184  0.010  9.821  485  2259  
2  0.00  0.137   0.0  0.137  0.000  0.000  3.537   40   191  
3  0.00  0.135   0.0  0.135  0.000  0.000  3.537   40   191  
4  0.00  0.223   0.0  0.000  0.000  0.000  3.000   15    54  

[5 rows x 57 columns]
Y:
 0    1
1    1
2    1
3    1
4    1
Name: 1, dtype: int64


In [None]:
'''10 fold cross validation'''

classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

metrics_table = {name: {'time': [], 'accuracy': [], 'f1': []} for name in classifiers}

skf = StratifiedKFold(n_splits=10)

for fold, (train_index, test_index) in enumerate(skf.split(x, y), 1):
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, algorithm in classifiers.items():
        start_time = time.time()
        algorithm.fit(X_train, y_train)
        time_training = time.time() - start_time  

        y_pred = algorithm.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        metrics_table[name]['time'].append(time_training)
        metrics_table[name]['accuracy'].append(accuracy)
        metrics_table[name]['f1'].append(f1)


In [None]:
'''create dataframes in order to display the results'''

output_accuracy = []
output_f1 = []
output_time = []

for fold_num in range(10):
    fold_data_accuracy = []
    fold_data_f1 = []
    fold_data_time = []
    
    for name in classifiers.keys():
        fold_data_accuracy.append(f"{metrics_table[name]['accuracy'][fold_num]:.4f}")
        fold_data_f1.append(f"{metrics_table[name]['f1'][fold_num]:.4f}")
        fold_data_time.append(f"{metrics_table[name]['time'][fold_num]:.4f}")
    
    output_accuracy.append(fold_data_accuracy)
    output_f1.append(fold_data_f1)
    output_time.append(fold_data_time)

avg_and_std_accuracy = []
avg_and_std_f1 = []
avg_and_std_time = []

for name in classifiers.keys():
    avg_accuracy = np.mean(metrics_table[name]['accuracy'])
    std_accuracy = np.std(metrics_table[name]['accuracy'])
    avg_and_std_accuracy.append([f"{avg_accuracy:.4f}", f"{std_accuracy:.4f}"])
    
    avg_f1 = np.mean(metrics_table[name]['f1'])
    std_f1 = np.std(metrics_table[name]['f1'])
    avg_and_std_f1.append([f"{avg_f1:.4f}", f"{std_f1:.4f}"])
    
    avg_time = np.mean(metrics_table[name]['time'])
    std_time = np.std(metrics_table[name]['time'])
    avg_and_std_time.append([f"{avg_time:.4f}", f"{std_time:.4f}"])

    
output_accuracy.append([f"avg {item[0]}" for item in avg_and_std_accuracy])
output_accuracy.append([f"stdev {item[1]}" for item in avg_and_std_accuracy])

output_f1.append([f"avg {item[0]}" for item in avg_and_std_f1])
output_f1.append([f"stdev {item[1]}" for item in avg_and_std_f1])

output_time.append([f"avg {item[0]}" for item in avg_and_std_time])
output_time.append([f"stdev {item[1]}" for item in avg_and_std_time])

accuracy_df = pd.DataFrame(output_accuracy, columns=list(classifiers.keys()))
f1_df = pd.DataFrame(output_f1, columns=list(classifiers.keys()))
time_df = pd.DataFrame(output_time, columns=list(classifiers.keys()))

accuracy_df.index = [f"Fold {i}" for i in range(1, 11)] + ['avg', 'stdev']
accuracy_df.index.name = 'Avrage'

f1_df.index = [f"Fold {i}" for i in range(1, 11)] + ['avg', 'stdev']
f1_df.index.name = 'Avrage'

time_df.index = [f"Fold {i}" for i in range(1, 11)] + ['avg', 'stdev']
time_df.index.name = 'Avrage'

In [None]:
"""
Print the tables and calculate the Friedman statistic
avrage rank = 2,
    k+1/2 = 2, k = 3
"""

print("Accuracy:")
print(accuracy_df)
svm,rf,knn = 0,0,0
for index, row in accuracy_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += rank.iloc[0]
    rf += rank.iloc[1]
    knn += rank.iloc[2]
svm = svm/10
rf = rf/10
knn = knn/10
sqrd_dif_j = 10*((svm-2)**2 + (rf-2)**2 + (knn-2)**2)
print(sqrd_dif_j)
svm,rf,knn = 0,0,0

for index, row in accuracy_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += (rank.iloc[0]-2)**2
    rf += (rank.iloc[1]-2)**2
    knn += (rank.iloc[2]-2)**2
sqrd_dif_ij = (1/20)*(svm + rf + knn)
friedman_statistic = sqrd_dif_j/sqrd_dif_ij 
print("\nFriedman statistic value: ",friedman_statistic)

Accuracy:
                  SVM Random Forest           KNN
Avrage                                           
Fold 1         0.6580        0.9454        0.7506
Fold 2         0.6627        0.9501        0.7648
Fold 3         0.7197        0.9240        0.7625
Fold 4         0.7648        0.9549        0.8076
Fold 5         0.6793        0.9477        0.8005
Fold 6         0.6936        0.9549        0.8029
Fold 7         0.7530        0.9596        0.7981
Fold 8         0.7316        0.9667        0.8242
Fold 9         0.7577        0.8884        0.7126
Fold 10        0.6667        0.8500        0.7619
avg        avg 0.7087    avg 0.9342    avg 0.7786
stdev    stdev 0.0396  stdev 0.0352  stdev 0.0320
18.2

Friedman statistic value:  18.2


In [None]:
"""
Print the tables and calculate the Friedman statistic

"""
print("\nF1 Score Table:")
print(f1_df)
svm,rf,knn = 0,0,0
for index, row in f1_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += rank.iloc[0]
    rf += rank.iloc[1]
    knn += rank.iloc[2]
svm = svm/10
rf = rf/10
knn = knn/10
sqrd_dif_j = 10*((svm-2)**2 + (rf-2)**2 + (knn-2)**2)
print(sqrd_dif_j)
svm,rf,knn = 0,0,0

for index, row in f1_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += (rank.iloc[0]-2)**2
    rf += (rank.iloc[1]-2)**2
    knn += (rank.iloc[2]-2)**2
sqrd_dif_ij = (1/20)*(svm + rf + knn)
print(sqrd_dif_ij)
friedman_statistic = sqrd_dif_j/sqrd_dif_ij 
print("\nFriedman statistic value: ",friedman_statistic)


F1 Score Table:
                  SVM Random Forest           KNN
Avrage                                           
Fold 1         0.6337        0.9451        0.7496
Fold 2         0.6388        0.9500        0.7645
Fold 3         0.6907        0.9231        0.7602
Fold 4         0.7514        0.9547        0.8053
Fold 5         0.6540        0.9475        0.8009
Fold 6         0.6761        0.9549        0.8031
Fold 7         0.7330        0.9595        0.7951
Fold 8         0.7085        0.9666        0.8225
Fold 9         0.7292        0.8892        0.7155
Fold 10        0.6344        0.8498        0.7619
avg        avg 0.6850    avg 0.9340    avg 0.7779
stdev    stdev 0.0420  stdev 0.0352  stdev 0.0310
18.2
1.0

Friedman statistic value:  18.2


In [None]:
"""
Print the tables and calculate the Friedman statistic

"""
print("\nTime Table:")
print(time_df)
svm,rf,knn = 0,0,0
for index, row in time_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += rank.iloc[0]
    rf += rank.iloc[1]
    knn += rank.iloc[2]
svm = svm/10
rf = rf/10
knn = knn/10
sqrd_dif_j = 10*((svm-2)**2 + (rf-2)**2 + (knn-2)**2)
print(sqrd_dif_j)
svm,rf,knn = 0,0,0

for index, row in time_df.iloc[:10].iterrows():
    rank = row.rank(ascending=True, method="min")
    svm += (rank.iloc[0]-2)**2
    rf += (rank.iloc[1]-2)**2
    knn += (rank.iloc[2]-2)**2
sqrd_dif_ij = (1/20)*(svm + rf + knn)
print(sqrd_dif_ij)
friedman_statistic = sqrd_dif_j/sqrd_dif_ij 
print("\nFriedman statistic value: ",friedman_statistic)


Time Table:
                  SVM Random Forest           KNN
Avrage                                           
Fold 1         0.3592        0.5105        0.0020
Fold 2         0.3770        0.5218        0.0022
Fold 3         0.3694        0.5139        0.0020
Fold 4         0.3847        0.5007        0.0021
Fold 5         0.3565        0.5019        0.0022
Fold 6         0.3931        0.5041        0.0021
Fold 7         0.3941        0.5008        0.0022
Fold 8         0.3967        0.5179        0.0022
Fold 9         0.3850        0.5008        0.0021
Fold 10        0.3752        0.4968        0.0023
avg        avg 0.3791    avg 0.5069    avg 0.0021
stdev    stdev 0.0135  stdev 0.0081  stdev 0.0001
20.0
1.0

Friedman statistic value:  20.0
