In [9]:
'''Felix Andersson, Janine de Vries, DV2626'''

import pandas as pd
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import time


In [None]:
spam_data = pd.read_csv('spambase.data', delimiter=',')
spam_data.head()
spam_data.drop_duplicates(inplace=True)

x = spam_data.iloc[:, :-1]
y = spam_data.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


print("X:\n", x.head())  
print("Y:\n", y.head())  

X:
       0  0.64  0.64.1  0.1  0.32   0.2   0.3   0.4   0.5   0.6  ...  0.40  \
0  0.21  0.28    0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...   0.0   
1  0.06  0.00    0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...   0.0   
2  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
3  0.00  0.00    0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...   0.0   
4  0.00  0.00    0.00  0.0  1.85  0.00  0.00  1.85  0.00  0.00  ...   0.0   

   0.41   0.42  0.43  0.778   0.44   0.45  3.756   61   278  
0  0.00  0.132   0.0  0.372  0.180  0.048  5.114  101  1028  
1  0.01  0.143   0.0  0.276  0.184  0.010  9.821  485  2259  
2  0.00  0.137   0.0  0.137  0.000  0.000  3.537   40   191  
3  0.00  0.135   0.0  0.135  0.000  0.000  3.537   40   191  
4  0.00  0.223   0.0  0.000  0.000  0.000  3.000   15    54  

[5 rows x 57 columns]
Y:
 0    1
1    1
2    1
3    1
4    1
Name: 1, dtype: int64


In [None]:
svm = SVC()

start_time = time.time()
svm.fit(X_train, y_train)
svm_training_time = time.time() - start_time

svm_pred = svm.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_pred)
svm_f1_score = f1_score(y_test, svm_pred, average='weighted')

print(f"SVM Training Time: {svm_training_time:.4f} seconds")
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print(f"SVM F1 Score: {svm_f1_score:.4f}")

SVM Training Time: 0.3051 seconds
SVM Accuracy: 0.7067
SVM F1 Score: 0.6782


In [None]:
rf = RandomForestClassifier()

start_time = time.time()
rf.fit(X_train, y_train)
rf_training_time = time.time() - start_time

rf_pred = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_pred)
rf_f1_score = f1_score(y_test, rf_pred, average='weighted')

print(f"Random Forest Training Time: {rf_training_time:.4f} seconds")
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"Random Forest F1 Score: {rf_f1_score:.4f}")

Random Forest Training Time: 0.4508 seconds
Random Forest Accuracy: 0.9501
Random Forest F1 Score: 0.9499


In [None]:
knn = KNeighborsClassifier()

start_time = time.time()
knn.fit(X_train, y_train)
knn_training_time = time.time() - start_time

knn_pred = knn.predict(X_test)

knn_accuracy = accuracy_score(y_test, knn_pred)
knn_f1_score = f1_score(y_test, knn_pred, average='weighted')

print(f"KNN Training Time: {knn_training_time:.4f} seconds")
print(f"KNN Accuracy: {knn_accuracy:.4f}")
print(f"KNN F1 Score: {knn_f1_score:.4f}")

KNN Training Time: 0.0023 seconds
KNN Accuracy: 0.7732
KNN F1 Score: 0.7717


In [14]:
comparison_results = pd.DataFrame({
    'Algorithm': ['SVM', 'Random Forest', 'KNN'],
    'Training Time (seconds)': [svm_training_time, rf_training_time, knn_training_time],
    'Accuracy': [svm_accuracy, rf_accuracy, knn_accuracy],
    'F1 Score': [svm_f1_score, rf_f1_score, knn_f1_score]
})

print(comparison_results)

       Algorithm  Training Time (seconds)  Accuracy  F1 Score
0            SVM                 0.305090  0.706651  0.678249
1  Random Forest                 0.450766  0.950119  0.949934
2            KNN                 0.002342  0.773159  0.771691
