In [17]:
'''Felix Andersson, Janine de Vries, DV2626'''

import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time

In [16]:
spam_data = pd.read_csv('spambase.data', delimiter=',')
spam_data.head()
spam_data.drop_duplicates(inplace=True)

x = spam_data.iloc[:, :-1]
y = spam_data.iloc[:, -1]
#X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


print("X:\n", x.head())  
print("Y:\n", y.head())

X:
       0  0.64  0.64.1  0.1  0.32   0.2   0.3   0.4   0.5   0.6  ...  0.40  \
0  0.21  0.28    0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...   0.0   
1  0.06  0.00    0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...   0.0   
2  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
3  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
4  0.00  0.00    0.00  0.0  1.85  0.00  0.00  1.85  0.00  0.00  ...   0.0   

   0.41   0.42  0.43  0.778   0.44   0.45  3.756   61   278  
0  0.00  0.132   0.0  0.372  0.180  0.048  5.114  101  1028  
1  0.01  0.143   0.0  0.276  0.184  0.010  9.821  485  2259  
2  0.00  0.137   0.0  0.137  0.000  0.000  3.537   40   191  
3  0.00  0.135   0.0  0.135  0.000  0.000  3.537   40   191  
4  0.00  0.223   0.0  0.000  0.000  0.000  3.000   15    54  

[5 rows x 57 columns]
Y:
 0    1
1    1
2    1
3    1
4    1
Name: 1, dtype: int64


In [23]:
classifiers = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier()
}

# Stratified 10-Fold Cross-Validation
results = []
fold_column = {name: [] for name in classifiers}

skf = StratifiedKFold(n_splits=10)

for fold, (train_index, test_index) in enumerate(skf.split(x, y), 1):
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for name, algorithm in classifiers.items():
        # Measure training time
        start_time = time.time()
        algorithm.fit(X_train, y_train)
        time_training = time.time() - start_time
        
        # Make predictions
        y_pred = algorithm.predict(X_test)

        # Measure accuracy and F1-score
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store fold results
        fold_column[name].append(accuracy)

# Prepare results in the desired format
output = []

# Add results for each fold
for fold_num in range(10):
    fold_data = []
    for name, scores in fold_column.items():
        fold_data.append(f"{scores[fold_num]:.4f}")  # Format accuracy to 4 decimals
    output.append(fold_data)

# Calculate average and std deviation
avg_and_std = []
for name, scores in fold_column.items():
    avg_accuracy = np.mean(scores)
    std_accuracy = np.std(scores)
    avg_and_std.append([f"{avg_accuracy:.4f}", f"{std_accuracy:.4f}"])

# Add avg and stdev to the output
output.append([f"avg {item[0]}" for item in avg_and_std])
output.append([f"stdev {item[1]}" for item in avg_and_std])

# Convert output to DataFrame for presentation
fold_df = pd.DataFrame(output, columns=list(classifiers.keys()))
fold_df.index = [f"Fold {i}" for i in range(1, 11)] + ['avg', 'stdev']
fold_df.index.name = 'Fold/Algorithm'

# Display the results
print(fold_df)


                         SVM Random Forest           KNN
Fold/Algorithm                                          
Fold 1                0.6580        0.9501        0.7506
Fold 2                0.6627        0.9525        0.7648
Fold 3                0.7197        0.9264        0.7625
Fold 4                0.7648        0.9525        0.8076
Fold 5                0.6793        0.9501        0.8005
Fold 6                0.6936        0.9572        0.8029
Fold 7                0.7530        0.9667        0.7981
Fold 8                0.7316        0.9691        0.8242
Fold 9                0.7577        0.8812        0.7126
Fold 10               0.6667        0.8524        0.7619
avg               avg 0.7087    avg 0.9358    avg 0.7786
stdev           stdev 0.0396  stdev 0.0368  stdev 0.0320
