# Import dependencies

In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans, AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
import numpy as np

import torch
from torch.utils.tensorboard import SummaryWriter

from utils import clustering_classification, test_classifier, write, save_metrics_to_dict, encode_categorical_features, import_dataset, agg_clustering, writer_add_scalars

# For logging 

In [2]:
# TensorBoard writer shared by every logging call in this notebook.
# Alternative kept for reference only: pin an explicit, machine-specific log directory.
#writer = SummaryWriter(log_dir="C:\\awilde\\britta\\LTU\\AppliedAI\\runs")
# Without arguments the writer picks its default log directory
# (presumably below ./runs, which is where the TensorBoard cell points -- TODO confirm).
writer = SummaryWriter()
# Metric accumulators, one empty dict per metric name.
# The "*_dict" entries are averaged over the seeds, logged and reset for every dataset
# in the train/test loop; the "*_avg" entries are kept and feed the final ranking table.
metrics_dict = {
    metric_key: {}
    for metric_key in (
        "train_acc_dict",
        "train_f1_dict",
        "test_acc_dict",
        "test_f1_dict",
        "train_acc_avg",
        "train_f1_avg",
        "test_acc_avg",
        "test_f1_avg",
    )
}

In [3]:
# Launch tensorboard
# images=22 change this to how many datasets you use
%tensorboard --logdir runs/ --port 6006 --samples_per_plugin images=22
# If in use (Mac) use to find the process PID
% lsof -i :6006
# Kill the process with 
% kill -9 <PID>
# Then launch using bash with first command

UsageError: Line magic function `%tensorboard` not found.


# Config Params

In [4]:
# Fraction of every dataset that is held out as the test set (80/20 split).
TEST_SIZE = 0.2
# One full train/test run is executed per seed (41, 42, 43, 44, 45).
RANDOM_SEEDS = list(range(41, 46))
# Fold count handed to the clustering-based classifiers.
K_FOLDS = 5

# Our chosen datasets
TODO
We chose the datasets according to the following criteria:
- Datasets that need no special preprocessing, so that all of them can be handled by one shared pipeline.
- Rather small datasets, so that no high computational power is required.

In [5]:
# Datasets that seem useful: dataset name -> numeric id that is later passed to import_dataset
dataset_id = {
    "iris": 53,
    "heart_disease": 45,
    "wine_quality": 186,
    "breast_cancer_wisconsin_diagnostic": 17,
    "car_evaluation": 19,
    "spect_heart": 95,
    "spectf_heart": 96,
    "mushroom": 73,
    "statlog": 144,
    "credit_approval": 27,
    "zoo": 111,
    "balance_scale": 12,
    "ilpd": 225,
    "acute_inflamations": 184,
    "ecoli": 39,
    "mammographic_mass": 161,
    "hayes_roth": 44,
    "habermans_survival": 43,
    "congress_voting_records": 105,
    "balloons": 13,
    "lenses": 58,
    "fertility": 244,
}

# Alphabetical order; every entry becomes (dataset id, 1-based position used as logging id)
data_set_sorted = {
    dataset_name: (dataset_id[dataset_name], position)
    for position, dataset_name in enumerate(sorted(dataset_id), start=1)
}

print(data_set_sorted)

{'acute_inflamations': (184, 1), 'balance_scale': (12, 2), 'balloons': (13, 3), 'breast_cancer_wisconsin_diagnostic': (17, 4), 'car_evaluation': (19, 5), 'congress_voting_records': (105, 6), 'credit_approval': (27, 7), 'ecoli': (39, 8), 'fertility': (244, 9), 'habermans_survival': (43, 10), 'hayes_roth': (44, 11), 'heart_disease': (45, 12), 'ilpd': (225, 13), 'iris': (53, 14), 'lenses': (58, 15), 'mammographic_mass': (161, 16), 'mushroom': (73, 17), 'spect_heart': (95, 18), 'spectf_heart': (96, 19), 'statlog': (144, 20), 'wine_quality': (186, 21), 'zoo': (111, 22)}


# Import and preprocess datasets
For the preprocessing we will do the following steps:
1. Remove any missing values. The article states: "Given that our classifiers are not oriented to data with missing features, the missing inputs are treated as zero, which should not bias the comparison results." Like the article, we keep the handling of missing values simple: we drop them and focus on the full pipeline rather than on individual datasets. An alternative would have been interpolation.
2. Encode categorical data into numerical data. This we have to do to use the classifiers later on.
3. Split the data into a train and a test set. We use an 80/20 split.
4. Scale the data so that each feature has zero mean and a standard deviation of one. This is done with the `StandardScaler`.

# Testing different classifiers

## K-Nearest Neighbor classifier

In [6]:
# Hyperparameter grid for the k-nearest-neighbour classifier.
# Source: https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee
knn_params = [
    {
        'knn__n_neighbors': [3, 5, 7, 9],
        'knn__weights': ['uniform', 'distance'],
        'knn__leaf_size': [15, 20],
    },
]

## Support vector machine classifier

In [7]:
# Hyperparameter grid for the support vector machine (RBF kernel only).
# Values taken from: https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
svm_params = [
    {
        'svm__C': [0.1, 1, 10, 100, 1000],
        'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'svm__kernel': ['rbf'],
    },
]

## Logistic regression classifier

In [8]:
# Hyperparameter grid for logistic regression.
# Values taken from:
#   https://www.geeksforgeeks.org/how-to-optimize-logistic-regression-performance/
#   https://www.kaggle.com/code/enespolat/grid-search-with-logistic-regression
# Fix for failing fits (solver/penalty combinations):
#   https://stackoverflow.com/questions/69257261/fitfailedwarning-estimator-fit-failed-the-score-on-this-train-test-partition-f
#   https://scikit-learn.org/1.5/modules/linear_model.html#solvers%E2%80%99-details
log_reg_params = [
    {
        'log_reg__solver': ["lbfgs", "saga"],
        'log_reg__penalty': ['l2'],
        # regularisation strengths 1e-3 ... 1e3, one value per decade
        'log_reg__C': np.logspace(start=-3, stop=3, num=7),
        'log_reg__max_iter': [100, 1000, 2500],
    },
]

## Random Forest classifier

In [9]:
# Hyperparameter grid for the random forest (number of trees and maximum tree depth).
# Source: https://www.datacamp.com/tutorial/random-forests-classifier-python
random_forest_params = [
    {
        "random_forest__n_estimators": [100, 500],
        "random_forest__max_depth": [5, 10, 15],
    },
]

## Gaussian naive bayes classifier

In [10]:
# Hyperparameter grid for Gaussian naive Bayes: ten smoothing values from 1e0 down to 1e-9.
# Source: https://www.kaggle.com/code/akshaysharma001/naive-bayes-with-hyperpameter-tuning#Hyperparameter-Tuning-to-improve-Accuracy
gnb_params = [
    {
        'gnb__var_smoothing': np.logspace(start=0, stop=-9, num=10),
    },
]

## K-means unsupervised classifier

In [11]:
# No static grid for k-means: k-means++ initialisation is already used, and the number of
# clusters is set to the number of labels of the current dataset.
# The actual parameters are therefore assembled inside the train/test loop.
kmeans_params = list()

## Affinity propagation unsupervised classifier

In [12]:
# Parameter grid for affinity propagation (damping factor only).
# Source: https://letsdatascience.com/affinity-propagation-clustering/
affinity_propagation_params = [
    {
        "damping": [0.5, 0.7],
    },
]

# Train test loop

In [13]:
# Train/test loop.
# For every dataset and every random seed: split the data, run each classifier /
# clustering method, log the train and test confusion matrices and accumulate the
# accuracy and F1 scores. After all seeds of a dataset are done, the per-dataset scores
# are averaged over the seeds, written to TensorBoard and reset.
#
# The same "print, fit, write, save" sequence used to be copy-pasted once per method;
# it is now a single loop over a list of experiments. The call order and all arguments
# are unchanged. The loop no longer shadows the built-in `id`.

# Per-dataset metric dictionaries that are averaged, logged and then reset.
keys_to_update = ["train_acc_dict", "test_acc_dict", "train_f1_dict", "test_f1_dict"]

# data_set_sorted maps name -> (dataset id for import_dataset, 1-based logging id)
for i, (name, (source_id, log_step)) in enumerate(data_set_sorted.items()):
    print("\n" + "#"*100)
    print(f"Current dataset: {name}")
    ordinal_encoder = OrdinalEncoder()

    # Set up dataset
    X, y = import_dataset(source_id, ordinal_encoder)
    labels = np.unique(y)  # taken before encoding so the original label values are printed
    y = encode_categorical_features(y, ordinal_encoder)
    print(f"Dataset size: {len(X)}")
    print(f"Labels in dataset: {labels}")
    print("#"*100, end="\n\n")

    for RANDOM_SEED in RANDOM_SEEDS:

        print("*"*100)
        print("Current seed:", RANDOM_SEED)
        print("*"*100, end="\n\n")

        # split the dataset
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
        )
        # Argument order expected by test_classifier / clustering_classification / agg_clustering
        split = (X_train, y_train, X_test, y_test)

        # K-means: the number of clusters is the number of labels in the current dataset
        kmeans_params = [{"algorithm": ["lloyd", "elkan"]}]
        n_clusters = len(labels)
        kmeans_params.append({"n_clusters": [n_clusters]})

        # (classifier name, zero-argument callable that fits and evaluates it).
        # Each callable is invoked right below, inside this same iteration, so it sees the
        # current seed and split; a fresh estimator is created at call time.
        experiments = [
            ("knn",
             lambda: test_classifier(KNeighborsClassifier(), "knn", knn_params, *split)),
            ("svm",
             lambda: test_classifier(SVC(random_state=RANDOM_SEED), "svm", svm_params, *split)),
            ("log_reg",
             lambda: test_classifier(LogisticRegression(random_state=RANDOM_SEED), "log_reg",
                                     log_reg_params, *split)),
            ("random_forest",
             lambda: test_classifier(RandomForestClassifier(random_state=RANDOM_SEED), "random_forest",
                                     random_forest_params, *split)),
            ("gnb",
             lambda: test_classifier(GaussianNB(), "gnb", gnb_params, *split)),
            ("kmeans",
             lambda: clustering_classification(KMeans, "kmeans", kmeans_params,
                                               *split, RANDOM_SEED, K_FOLDS)),
            ("agglomerative_clustering",
             lambda: agg_clustering(*split, RANDOM_SEED)),
            ("affinity_propagation",
             lambda: clustering_classification(AffinityPropagation, "affinity_propagation",
                                               affinity_propagation_params,
                                               *split, RANDOM_SEED, K_FOLDS)),
        ]

        for clf_name, run_experiment in experiments:
            print("_"*100)
            print(f"Classifier: {clf_name}")
            train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = run_experiment()

            # Confusion matrices go to TensorBoard, the step is the dataset's logging id
            write(writer, name, clf_name, cm_train, "train", log_step)
            write(writer, name, clf_name, cm_test, "test", log_step)

            metrics_dict = save_metrics_to_dict(clf_name,
                                                train_acc,
                                                train_f1,
                                                test_acc,
                                                test_f1,
                                                metrics_dict,
                                                i,
                                                RANDOM_SEED
                                                )

    # Take average over the seeds
    for key in keys_to_update:
        metrics_dict[key] = {k: v / len(RANDOM_SEEDS) for k, v in metrics_dict[key].items()}

    # Write metrics to tensorboard, step is dataset id
    writer.add_scalars("Train accuracy", metrics_dict["train_acc_dict"], log_step)
    writer.add_scalars("Test accuracy", metrics_dict["test_acc_dict"], log_step)
    writer.add_scalars("Train f1", metrics_dict["train_f1_dict"], log_step)
    writer.add_scalars("Test f1", metrics_dict["test_f1_dict"], log_step)

    writer_add_scalars("Train average accuracy", writer, metrics_dict["train_acc_avg"], log_step)
    writer_add_scalars("Test average accuracy", writer, metrics_dict["test_acc_avg"], log_step)
    writer_add_scalars("Train average f1", writer, metrics_dict["train_f1_avg"], log_step)
    writer_add_scalars("Test average f1", writer, metrics_dict["test_f1_avg"], log_step)

    # reset the per-dataset dictionaries before the next dataset
    for key in keys_to_update:
        metrics_dict[key] = {}


writer.close()


####################################################################################################
Current dataset: acute_inflamations
Dataset size: 120
Labels in dataset: ['no' 'yes']
####################################################################################################

****************************************************************************************************
Current seed: 41
****************************************************************************************************

____________________________________________________________________________________________________
Classifier: knn
Cross validation best parameters: {'knn__leaf_size': 15, 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
knn, Train accuracy = 1.0, Test accuracy = 1.0
knn, Train f1-score = 1.0, Test f1-score = 1.0
____________________________________________________________________________________________________
Classifier: svm
Cross validation best parameters: {'svm__C': 0.1, 'svm_

In [14]:
# Ranking table: classifiers ordered by average test accuracy, best first.
table_rule = "_"*100
print(f"{'Rank':<5} {'Classifier':<30} {'Avg. test acc.':<17} {'Avg. train acc.':<17} {'Avg. test F1':<17} {'Avg train F1':<17}")
print(table_rule)

entries_by_test_acc = sorted(
    metrics_dict["test_acc_avg"].items(),
    key=lambda entry: entry[1],
    reverse=True,
)
# Entries whose key ends with "count" are not classifiers and get neither a row nor a rank.
ranked_entries = [entry for entry in entries_by_test_acc if not entry[0].endswith("count")]

for rank, (avg_key, test_acc) in enumerate(ranked_entries, start=1):
    train_acc = metrics_dict['train_acc_avg'][avg_key]
    test_f1 = metrics_dict['test_f1_avg'][avg_key]
    train_f1 = metrics_dict['train_f1_avg'][avg_key]
    # avg_key[:-4] drops the last four characters of the key to get the classifier name.
    # The 10 explicit spaces keep the columns aligned with the header.
    print(
        f"{rank:<5} {avg_key[:-4]:<30} {test_acc:<17.3f} {train_acc:<8.3f}"
        + " " * 10
        + f"{test_f1:<18.3f}{train_f1:<17.3f}"
    )
print(table_rule)

Rank  Classifier                     Avg. test acc.    Avg. train acc.   Avg. test F1      Avg train F1     
____________________________________________________________________________________________________
1     svm                            0.842             0.920             0.825             0.908            
2     random_forest                  0.836             0.960             0.819             0.956            
3     knn                            0.813             0.930             0.801             0.925            
4     log_reg                        0.808             0.846             0.784             0.826            
5     gnb                            0.779             0.818             0.763             0.807            
6     affinity_propagation           0.775             0.819             0.750             0.800            
7     kmeans                         0.718             0.730             0.653             0.667            
8     agglomerative_cluster