# Import dependencies

In [92]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans, AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
import numpy as np

import torch
from torch.utils.tensorboard import SummaryWriter

from clustering import clustering_classification, test_classifier, write, save_metrics_to_dict, encode_categorical_features, import_dataset, agg_clustering

# For logging 

In [93]:
#writer = SummaryWriter(log_dir="C:\\awilde\\britta\\LTU\\AppliedAI\\runs")
writer = SummaryWriter()
metrics_dict={
"train_acc_dict" : {},
"train_f1_dict" : {},
"test_acc_dict" : {},
"test_f1_dict" : {},
"train_acc_avg" : {},
"train_f1_avg" : {},
"test_acc_avg" : {},
"test_f1_avg" : {},
}

In [94]:
# Launch tensorboard
# images=21 change this to how many datasets you use
%tensorboard --logdir runs/ --port 6006 --samples_per_plugin images=22
# If in use (Mac) use to find the process PID
% lsof -i :6006
# Kill the process with 
% kill -9 <PID>
# Then launch using bash with first command

UsageError: Line magic function `%tensorboard` not found.


# Config Params

In [95]:
TEST_SIZE = 0.2
RANDOM_SEED = 42
K_FOLDS = 5

# Our chosen datasets
TODO
We have chosen the datasets based on the following criteria:
- datasets where we do not need to do any special preprocessing so that it is easy to do in only one pipeline
- rather small datasets to ensure we do not need high computational power

In [96]:
# Dataset that seems useful
dataset_id = {
    "iris": 53, 
    "heart_disease": 45, 
    "wine_quality": 186, 
    "breast_cancer_wisconsin_diagnostic": 17, 
    "car_evaluation": 19, 
    "spect_heart" : 95, 
    "spectf_heart" : 96,
    "mushroom": 73, 
    "statlog" : 144, 
    "credit_approval" : 27, 
    "zoo" : 111, 
    "balance_scale" : 12, 
    "ilpd" : 225, 
    "acute_inflamations" : 184, 
    "ecoli" : 39, 
    "mammographic_mass" : 161, 
    "hayes_roth" : 44, 
    "habermans_survival" : 43, 
    "congress_voting_records" : 105, 
    "balloons" : 13, 
    "lenses" : 58, 
    "fertility" : 244, 
}

# sort alphabetically and adds id for logging
data_set_sorted = {}
for i, name in enumerate(sorted(dataset_id.keys())):
    data_set_sorted[name] = (dataset_id[name], i+1)

print(data_set_sorted)

{'acute_inflamations': (184, 1), 'balance_scale': (12, 2), 'balloons': (13, 3), 'breast_cancer_wisconsin_diagnostic': (17, 4), 'car_evaluation': (19, 5), 'congress_voting_records': (105, 6), 'credit_approval': (27, 7), 'ecoli': (39, 8), 'fertility': (244, 9), 'habermans_survival': (43, 10), 'hayes_roth': (44, 11), 'heart_disease': (45, 12), 'ilpd': (225, 13), 'iris': (53, 14), 'lenses': (58, 15), 'mammographic_mass': (161, 16), 'mushroom': (73, 17), 'spect_heart': (95, 18), 'spectf_heart': (96, 19), 'statlog': (144, 20), 'wine_quality': (186, 21), 'zoo': (111, 22)}


# Import and preprocess datasets
For the preprocessing we will do the following steps:
1. Remove any missing values. In the article the following is written: "Given that our classifiers are not oriented to data with missing features, the missing inputs are treated as zero, which should not bias the comparison results." We therefore also decided to just remove missing values and to more focus on the full pipeline instead of single datasets. Another way could have been interpolation.
2. Encode categorical data into numerical data. This we have to do to use the classifiers later on.
3. Remove certain columns if they are highly correlated to others. <span style="color: red;">ALERT!</span>
4. Split the data into a train and a test set. We will use a 80/20 split.
5. Scale the data so that we have zero mean and standard deviation of one. This is done with the Standard scaler.

# Testing different classifiers

## K-Nearest Neighbor classifier

In [97]:
# https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee
knn_params = [{'knn__n_neighbors': [3, 5, 7, 9],
        'knn__weights': ['uniform', 'distance'],
        'knn__leaf_size': [15, 20]}]

## Support vector machine classifier

In [98]:
# params taken from here: https://www.geeksforgeeks.org/svm-hyperparameter-tuning-using-gridsearchcv-ml/
svm_params = [{'svm__C': [0.1, 1, 10, 100, 1000],  
        'svm__gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
        'svm__kernel': ['rbf']} ]

## Logistic regression classifier

In [99]:
# params taken from here: https://www.geeksforgeeks.org/how-to-optimize-logistic-regression-performance/
# and from here https://www.kaggle.com/code/enespolat/grid-search-with-logistic-regression
log_reg_params = [
    {'log_reg__penalty':['l1','l2'],
    'log_reg__C' : np.logspace(-3,3,7),
    'log_reg__max_iter'  : [100,1000,2500,5000]
}
]

## Random Forest classifier

In [100]:
# https://www.datacamp.com/tutorial/random-forests-classifier-python
random_forest_params = [
    {"random_forest__n_estimators": [100, 500],
     "random_forest__max_depth" : [5, 10, 15]
}
]

## Gaussian naive bayes classifier

In [101]:
# https://www.kaggle.com/code/akshaysharma001/naive-bayes-with-hyperpameter-tuning#Hyperparameter-Tuning-to-improve-Accuracy
gnb_params = [
    {'gnb__var_smoothing': np.logspace(0,-9, num=10)
    }
]

## K-means unsupervised classifier

In [102]:
# Already use k-means++ and we set the no. clusters to no. of labels
kmeans_params = []

## Affinity propagation unsupervised classifier

In [103]:
# https://letsdatascience.com/affinity-propagation-clustering/
affinity_propagation_params = [
    {"damping": [0.5, 0.7]}, 
    {"preference": [-50,-10,0,10,50]}
          ]

# Train test loop

In [104]:
for i, (name, id) in enumerate(data_set_sorted.items()):
        print("\n" + "*"*100)
        print(f"Current dataset: {name}")
        ordinal_encoder = OrdinalEncoder()

        # Set up dataset
        X, y = import_dataset(dataset_id[name], ordinal_encoder)
        labels = np.unique(y)
        y = encode_categorical_features(y, ordinal_encoder)
        print(f"Dataset size: {len(X)}")
        print(f"Labels in dataset: {labels}")

        # split the dataset
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify = y)
        print("*"*100, end="\n\n")

        # KNN
        clf_name = "knn"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        # https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee
        knn = KNeighborsClassifier()
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = test_classifier(knn, clf_name, knn_params, X_train, y_train, X_test, y_test)

        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1])

        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )
        
        # SVM 
        svm = SVC()
        clf_name = "svm"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = test_classifier(svm, clf_name, svm_params, X_train, y_train, X_test, y_test)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1] )
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )

        # Logistic regression
        log_reg = LogisticRegression()
        clf_name = "log_reg"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = test_classifier(log_reg, 
                                                                                    clf_name, 
                                                                                    log_reg_params, 
                                                                                    X_train, 
                                                                                    y_train, 
                                                                                    X_test, 
                                                                                    y_test)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1] )
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )
        
        # Random forest
        random_forest = RandomForestClassifier()
        clf_name = "random_forest"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = test_classifier(random_forest, 
                                                                                    clf_name, 
                                                                                    random_forest_params, 
                                                                                    X_train, 
                                                                                    y_train, 
                                                                                    X_test, 
                                                                                    y_test)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1] )
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )
        
        # Gaussian naive bayes
        gnb = GaussianNB()
        clf_name = "gnb"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = test_classifier(gnb, 
                                                                                    clf_name, 
                                                                                    gnb_params, 
                                                                                    X_train, 
                                                                                    y_train, 
                                                                                    X_test, 
                                                                                    y_test)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1] )
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )
        
       
        # K-means
        clf_name = "kmeans"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        kmeans_params = [{"algorithm": ["lloyd", "elkan"]}]
        n_clusters = len(labels)
        kmeans_params.append({"n_clusters": [n_clusters]})
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = clustering_classification(KMeans, clf_name, kmeans_params, X_train, y_train, X_test, y_test, RANDOM_SEED, K_FOLDS)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1] )
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )

        # Agglomerative clustering
        clf_name = "agglomerative_clustering"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test = agg_clustering(X_train, y_train, X_test, y_test, RANDOM_SEED)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1])
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )
        

        # Affinity propagation
        clf_name = "affinity_propagation"
        print("_"*100)
        print(f"Classifier: {clf_name}")
        train_acc, train_f1, test_acc, test_f1, cm_train, cm_test =  clustering_classification(AffinityPropagation, clf_name, affinity_propagation_params, X_train, y_train, X_test, y_test, RANDOM_SEED, K_FOLDS)
        write(writer, name, clf_name, cm_train, "train", id[1])
        write(writer, name, clf_name, cm_test, "test", id[1])
        metrics_dict = save_metrics_to_dict(clf_name,
                                            train_acc, 
                                            train_f1, 
                                            test_acc, 
                                            test_f1, 
                                            metrics_dict,
                                            i
                                            )

        # Write metrics to tensorboard, step is dataset id
        writer.add_scalars("Train accuracy", metrics_dict["train_acc_dict"], id[1])
        writer.add_scalars("Test accuracy", metrics_dict["test_acc_dict"], id[1])
        writer.add_scalars("Train f1",  metrics_dict["train_f1_dict"], id[1])
        writer.add_scalars("Test f1", metrics_dict["test_f1_dict"], id[1])
        writer.add_scalars("Train average accuracy", metrics_dict["train_acc_avg"], id[1])
        writer.add_scalars("Test average accuracy", metrics_dict["test_acc_avg"], id[1])
        writer.add_scalars("Train average f1", metrics_dict["train_f1_avg"], id[1])
        writer.add_scalars("Test average f1", metrics_dict["test_f1_avg"], id[1])


writer.close()


****************************************************************************************************
Current dataset: acute_inflamations
Dataset size: 120
Labels in dataset: ['no' 'yes']
****************************************************************************************************

____________________________________________________________________________________________________
Classifier: knn
Cross validation best parameters: {'knn__leaf_size': 15, 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}
knn, Train accuracy = 1.0, Test accuracy = 1.0
knn, Train f1-score = 1.0, Test f1-score = 1.0
____________________________________________________________________________________________________
Classifier: svm
Cross validation best parameters: {'svm__C': 0.1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}
svm, Train accuracy = 1.0, Test accuracy = 1.0
svm, Train f1-score = 1.0, Test f1-score = 1.0
__________________________________________________________________________________________

In [105]:
print(f"{'Rank':<5} {'Classifier':<30} {'Avg. test acc.':<17} {'Avg. train acc.':<17} {'Avg. test F1':<17} {'Avg train F1':<17}")
print("_"*100)
for i, clf in enumerate(sorted(metrics_dict["test_acc_avg"].items(), key=lambda x: x[1], reverse=True)):
    print(f"{i+1:<5} {clf[0][:-4]:<30} {clf[1]:<17.2f} {metrics_dict['train_acc_avg'][clf[0]]:<8.2f}\
          {metrics_dict['test_f1_avg'][clf[0]]:<18.2f}{metrics_dict['train_f1_avg'][clf[0]]:<17.2}")
print("_"*100)

Rank  Classifier                     Avg. test acc.    Avg. train acc.   Avg. test F1      Avg train F1     
____________________________________________________________________________________________________
1     random_forest                  0.87              0.99              0.86              0.99             
2     svm                            0.86              0.97              0.85              0.97             
3     knn                            0.85              0.96              0.85              0.96             
4     log_reg                        0.84              0.84              0.83              0.83             
5     gnb                            0.80              0.82              0.80              0.81             
6     affinity_propagation           0.79              0.77              0.77              0.75             
7     agglomerative_clustering       0.79              0.74              0.73              0.67             
8     kmeans               