In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.cluster import KMeans, AffinityPropagation
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

import torch
from torch.utils.tensorboard import SummaryWriter

from utils import clustering_classification, test_classifier, write, save_metrics_to_dict, encode_categorical_features, import_dataset, agg_clustering

In [2]:
TEST_SIZE = 0.2
RANDOM_SEED = 42
K_FOLDS = 5

In [5]:
dataset_id = {
    "iris": 53, 
    "heart_disease": 45, 
    "wine_quality": 186, 
    "breast_cancer_wisconsin_diagnostic": 17, 
    "car_evaluation": 19, 
    "spect_heart" : 95, 
    "spectf_heart" : 96,
    "mushroom": 73, 
    "statlog" : 144, 
    "credit_approval" : 27, 
    "zoo" : 111, 
    "balance_scale" : 12, 
    "ilpd" : 225, 
    "acute_inflamations" : 184, 
    "ecoli" : 39, 
    "mammographic_mass" : 161, 
    "hayes_roth" : 44, 
    "habermans_survival" : 43, 
    "congress_voting_records" : 105, 
    "balloons" : 13, 
    "lenses" : 58, 
    "fertility" : 244, 
}
# sort alphabetically and adds id for logging
data_set_sorted = {}
for i, name in enumerate(sorted(dataset_id.keys())):
    data_set_sorted[name] = (dataset_id[name], i+1)
print(data_set_sorted)

{'acute_inflamations': (184, 1), 'balance_scale': (12, 2), 'balloons': (13, 3), 'breast_cancer_wisconsin_diagnostic': (17, 4), 'car_evaluation': (19, 5), 'congress_voting_records': (105, 6), 'credit_approval': (27, 7), 'ecoli': (39, 8), 'fertility': (244, 9), 'habermans_survival': (43, 10), 'hayes_roth': (44, 11), 'heart_disease': (45, 12), 'ilpd': (225, 13), 'iris': (53, 14), 'lenses': (58, 15), 'mammographic_mass': (161, 16), 'mushroom': (73, 17), 'spect_heart': (95, 18), 'spectf_heart': (96, 19), 'statlog': (144, 20), 'wine_quality': (186, 21), 'zoo': (111, 22)}


In [None]:
def import_dataset(uci_id, encoder):
    # get the dataset
    dataset = fetch_ucirepo(id=uci_id) 
    # load data into dataframe for easier preprocessing
    df = pd.concat([dataset["data"]["features"],dataset["data"]["targets"] ],axis=1)
    # remove nan values
    initial_rows = df.shape[0]
    df.dropna(inplace=True)
    final_rows = df.shape[0]
    if initial_rows > final_rows:
        print(f"Rows were removed. Initial rows: {initial_rows}, Final rows: {final_rows}, Removed rows: {initial_rows-final_rows}")
    else:
        print("No rows were removed.")
    
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1:]
    # encode categorical data only for features not for target itself
    # https://stackoverflow.com/questions/29803093/check-which-columns-in-dataframe-are-categorical
    cols = X.columns
    num_cols = X._get_numeric_data().columns
    #print(num_cols)
    categorical_cols = list(set(cols) - set(num_cols))
    #print(categorical_cols)
    X.loc[:, categorical_cols] = encode_categorical_features(X[categorical_cols], encoder)
    
    # check if encoding has worked
    # https://stackoverflow.com/questions/26924904/check-if-dataframe-column-is-categorical
    for c in X.columns:
        if X[c].dtype.name == "category":
            print(f"WARNING: Column {c} still has categorical values!")
            
    # last column is target
    return X, y

In [4]:
for i, (name, id) in enumerate(data_set_sorted.items()):
        print("\n" + "*"*100)
        print(f"Current dataset: {name}")
        ordinal_encoder = OrdinalEncoder()

        # Set up dataset
        X, y = import_dataset(dataset_id[name], ordinal_encoder)
        labels = np.unique(y)
        y = encode_categorical_features(y, ordinal_encoder)
        print(f"Dataset size: {len(X)}")
        print(f"Labels in dataset: {labels}, amount: {len(labels)}")
        print(f"Number of features: {X.shape[1]}")
        #print(X.head)


****************************************************************************************************
Current dataset: acute_inflamations
No rows were removed.
Dataset size: 120
Labels in dataset: ['no' 'yes'], amount: 2
Number of features: 7

****************************************************************************************************
Current dataset: balance_scale
No rows were removed.
Dataset size: 625
Labels in dataset: ['B' 'L' 'R'], amount: 3
Number of features: 4

****************************************************************************************************
Current dataset: balloons
No rows were removed.
Dataset size: 16
Labels in dataset: ['F' 'T'], amount: 2
Number of features: 4

****************************************************************************************************
Current dataset: breast_cancer_wisconsin_diagnostic
No rows were removed.
Dataset size: 569
Labels in dataset: ['B' 'M'], amount: 2
Number of features: 30

*******************************