In [None]:
'''Felix Andersson, Janine de Vries, DV2626'''

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time


In [None]:
# Load the Spambase data set and split it into features and target.
# The UCI 'spambase.data' file ships without a header row, so header=None
# stops pandas from consuming the first sample as column labels (columns are
# then labelled 0..n-1, which is fine because only positional iloc is used).
spam_data = pd.read_csv('spambase.data', delimiter=',', header=None)

# Remove exact duplicate rows. Reassigning (instead of inplace=True) keeps the
# cell idempotent when it is re-run.
spam_data = spam_data.drop_duplicates()

# Every column but the last is a feature; the last column is the target.
x = spam_data.iloc[:, :-1]
y = spam_data.iloc[:, -1]

# Fixed-seed 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


print("X:\n", x.head())
print("Y:\n", y.head())

In [None]:
'''10-fold stratified cross-validation of SVM, Random Forest and KNN.

For every fold and every classifier this records the training time in
seconds, the accuracy and the weighted F1 score in ``result``.
'''

classifiers = {
    "SVM": SVC(),
    # Seeded so repeated runs of the notebook give the same forest.
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier()
}

# result[name][metric] is a list with one entry per fold, in fold order.
result = {name: {'time': [], 'accuracy': [], 'f1': []} for name in classifiers}

skf = StratifiedKFold(n_splits=10)

for train_idx, test_idx in skf.split(x, y):
    # Fold-local names, so the hold-out split (X_train, X_test, y_train,
    # y_test) is not silently replaced by the last fold.
    X_fold_train, X_fold_test = x.iloc[train_idx], x.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    for name, algorithm in classifiers.items():
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short intervals; only fit() is timed.
        start_time = time.perf_counter()
        algorithm.fit(X_fold_train, y_fold_train)
        time_training = time.perf_counter() - start_time

        y_pred = algorithm.predict(X_fold_test)
        accuracy = accuracy_score(y_fold_test, y_pred)
        f1 = f1_score(y_fold_test, y_pred, average='weighted')

        result[name]['time'].append(time_training)
        result[name]['accuracy'].append(accuracy)
        result[name]['f1'].append(f1)


In [None]:
'''Create DataFrames in order to display the results.'''


def _summarise_metric(metric, index_name='Average'):
    """Build the display table for one metric stored in ``result``.

    Parameters
    ----------
    metric : str
        Key inside ``result[name]`` ('accuracy', 'f1' or 'time').
    index_name : str
        Name given to the index of the resulting DataFrame.

    Returns
    -------
    rows : list of list of str
        One row per fold (values formatted to 4 decimals) followed by an
        'avg ...' row and a 'stdev ...' row.
    avg_and_std : list of [str, str]
        Formatted mean and standard deviation (np.std default) per classifier.
    table : pandas.DataFrame
        ``rows`` with one column per classifier, indexed
        'Fold 1' .. 'Fold n', 'avg', 'stdev'.
    """
    names = list(classifiers.keys())
    # Derive the fold count from the recorded results instead of assuming 10.
    n_folds = min(len(result[name][metric]) for name in names)

    rows = [
        [f"{result[name][metric][fold_num]:.4f}" for name in names]
        for fold_num in range(n_folds)
    ]

    avg_and_std = [
        [f"{np.mean(result[name][metric]):.4f}", f"{np.std(result[name][metric]):.4f}"]
        for name in names
    ]
    rows.append([f"avg {item[0]}" for item in avg_and_std])
    rows.append([f"stdev {item[1]}" for item in avg_and_std])

    table = pd.DataFrame(rows, columns=names)
    table.index = [f"Fold {i}" for i in range(1, n_folds + 1)] + ['avg', 'stdev']
    table.index.name = index_name
    return rows, avg_and_std, table


# Same names as before, so later cells keep working; all three tables now
# share the correctly spelled 'Average' index name.
output_accuracy, avg_and_std_accuracy, accuracy_df = _summarise_metric('accuracy')
output_f1, avg_and_std_f1, f1_df = _summarise_metric('f1')
output_time, avg_and_std_time, time_df = _summarise_metric('time')

In [None]:
"""
Print the accuracy table and calculate the Friedman statistic.
With k classifiers the average rank is (k + 1) / 2,
    i.e. (3 + 1) / 2 = 2 for k = 3.
"""

print("Accuracy:")
print(accuracy_df)

# Friedman statistic over the per-fold rows only: the trailing 'avg' and
# 'stdev' rows are summaries, not observations.
# The table stores formatted strings, so convert to float before ranking;
# ranking the strings would compare them lexicographically.
fold_scores = accuracy_df.drop(index=['avg', 'stdev']).astype(float)
n_folds, n_classifiers = fold_scores.shape
mean_rank = (n_classifiers + 1) / 2

# Rank the classifiers within each fold. Tied values share their average rank
# so that every fold's ranks still average to (k + 1) / 2.
ranks = fold_scores.rank(axis=1, ascending=True, method="average")

# n * sum_j (R_j - mean_rank)^2, with R_j the average rank of classifier j.
sqrd_dif_j = n_folds * ((ranks.mean(axis=0) - mean_rank) ** 2).sum()
# 1 / (n * (k - 1)) * sum_ij (r_ij - mean_rank)^2
sqrd_dif_ij = ((ranks - mean_rank) ** 2).to_numpy().sum() / (n_folds * (n_classifiers - 1))
friedman_statistic = sqrd_dif_j/sqrd_dif_ij
print("\nFriedman statistic value: ",friedman_statistic)

In [None]:
"""
Print the tables and calculate the Friedman statistic

"""
print("\nF1 Score Table:")
print(f1_df)

# Friedman statistic over the per-fold rows only: the trailing 'avg' and
# 'stdev' rows are summaries, not observations.
# The table stores formatted strings, so convert to float before ranking;
# ranking the strings would compare them lexicographically.
fold_scores = f1_df.drop(index=['avg', 'stdev']).astype(float)
n_folds, n_classifiers = fold_scores.shape
mean_rank = (n_classifiers + 1) / 2

# Rank the classifiers within each fold. Tied values share their average rank
# so that every fold's ranks still average to (k + 1) / 2.
ranks = fold_scores.rank(axis=1, ascending=True, method="average")

# n * sum_j (R_j - mean_rank)^2, with R_j the average rank of classifier j.
sqrd_dif_j = n_folds * ((ranks.mean(axis=0) - mean_rank) ** 2).sum()
# 1 / (n * (k - 1)) * sum_ij (r_ij - mean_rank)^2
sqrd_dif_ij = ((ranks - mean_rank) ** 2).to_numpy().sum() / (n_folds * (n_classifiers - 1))
friedman_statistic = sqrd_dif_j/sqrd_dif_ij
print("\nFriedman statistic value: ",friedman_statistic)

In [None]:
"""
Print the tables and calculate the Friedman statistic

"""
print("\nTime Table:")
print(time_df)

# Friedman statistic over the per-fold rows only: the trailing 'avg' and
# 'stdev' rows are summaries, not observations.
# The table stores formatted strings, so convert to float before ranking;
# ranking the strings would compare them lexicographically, which puts
# "10.0000" before "9.0000" as soon as a fit takes ten seconds or more.
fold_times = time_df.drop(index=['avg', 'stdev']).astype(float)
n_folds, n_classifiers = fold_times.shape
mean_rank = (n_classifiers + 1) / 2

# Rank the classifiers within each fold (rank 1 = shortest training time).
# Tied values share their average rank so that every fold's ranks still
# average to (k + 1) / 2.
ranks = fold_times.rank(axis=1, ascending=True, method="average")

# n * sum_j (R_j - mean_rank)^2, with R_j the average rank of classifier j.
sqrd_dif_j = n_folds * ((ranks.mean(axis=0) - mean_rank) ** 2).sum()
# 1 / (n * (k - 1)) * sum_ij (r_ij - mean_rank)^2
sqrd_dif_ij = ((ranks - mean_rank) ** 2).to_numpy().sum() / (n_folds * (n_classifiers - 1))
friedman_statistic = sqrd_dif_j/sqrd_dif_ij
print("\nFriedman statistic value: ",friedman_statistic)