# Mini-Project
This is the main mini-project notebook handeling pre-processing, model definition, training, and evalutaion. 

### Imports
The first cell block handels some necessary imports.

In [47]:
# Import Generic dataset proceessing class.
from Project.generic_dataset import GenericDataset
from ucimlrepo import fetch_ucirepo
import numpy as np
import random

In [48]:
from sklearn.cluster import KMeans
def run_kmeans_random_on_ds(ds,*,validation = False, **kwargs):

    # Get training data.
    (X_train,L_train) = ds.get_train()

    # Fit data.
    kmeans = KMeans(n_clusters=ds.get_metadata()["num_classes"], random_state=0, n_init="auto", init="random").fit(X_train)

    # Find which training label is most correlated to which cluster.
    correlation_matrix = np.zeros((ds.get_metadata()["num_classes"],ds.get_metadata()["num_classes"]))
    for cluster, label in zip(list(kmeans.labels_),list(L_train)):
        correlation_matrix[cluster,label] += 1

    cluster2label = np.argmax(correlation_matrix,axis=1)

    # Fetch Test/Validation set.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()

    # Get predictions.
    predicted_cluster = kmeans.predict(X_test)
    L_pred = cluster2label[predicted_cluster]

    # Return accuracy.
    return ((sum(L_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0]).item()

In [49]:
from sklearn.cluster import KMeans
def run_kmeans_on_ds(ds,*,validation = False, **kwargs):

    # Get training data.
    (X_train,L_train) = ds.get_train()

    # Fit data.
    kmeans = KMeans(n_clusters=ds.get_metadata()["num_classes"], n_init="auto").fit(X_train)

    # Find which training label is most correlated to which cluster.
    correlation_matrix = np.zeros((ds.get_metadata()["num_classes"],ds.get_metadata()["num_classes"]))
    for cluster, label in zip(list(kmeans.labels_),list(L_train)):
        correlation_matrix[cluster,label] += 1

    cluster2label = np.argmax(correlation_matrix,axis=1)

    # Fetch Test/Validation set.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    
    # Get predictions.
    predicted_cluster = kmeans.predict(X_test)
    L_pred = cluster2label[predicted_cluster]

    # Return accuracy.
    return ((sum(L_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0]).item()

In [50]:
from sklearn.cluster import Birch
def run_birch_on_ds(ds,branching_factor=50, threshold=0.5,*,validation = False, **kwargs):

    # Get training data.
    (X_train,L_train) = ds.get_train()

    # Fit data.
    birch = Birch(n_clusters=ds.get_metadata()["num_classes"], branching_factor=branching_factor, threshold=threshold).fit(X_train)

    # Find which training label is most correlated to which cluster.
    correlation_matrix = np.zeros((len(np.unique(birch.labels_)),ds.get_metadata()["num_classes"]))
    for cluster, label in zip(list(birch.labels_),list(L_train)):
        correlation_matrix[cluster,label] += 1

    cluster2label = np.argmax(correlation_matrix,axis=1)

    # Fetch Test/Validation set.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    
    # Get predictions.
    predicted_cluster = birch.predict(X_test)
    L_pred = cluster2label[predicted_cluster]

    # Return accuracy.
    return ((sum(L_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0]).item()

In [51]:
import ANN
def run_ann_on_ds(ds,seed=0,nepochs=20,batch_size=3,learning_rate=0.05,layer_configuration=[100,100],*,validation = False, **kwargs):
    
    # Get training data, validation data, and test data. (if validation is true the validation set is used for evaluation)
    (X_train, L_train) = ds.get_train()
    (X_val, L_val) = ds.get_val()
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()

    # Get train and validation batches.
    train_data, train_labels, valid_data, valid_labels=ANN.prepare_for_backprop(batch_size, X_train, L_train, X_val, L_val, nclasses=ds.get_metadata()["num_classes"])

    # Find MLP with highest validation accuracy to avoid overfitting. (Early stopping)
    mlp = ANN.MultiLayerPerceptron(layer_config=[ds.get_metadata()["num_features"]] + layer_configuration + [ds.get_metadata()["num_classes"]], batch_size=batch_size,seed=seed)
    best_mlp = mlp.evaluate(train_data, train_labels, valid_data, valid_labels,
             eval_train=False, eval_test=False, num_epochs=nepochs, eta=learning_rate)

    # Get test bathces.
    test_data, test_labels = ANN.create_batches(X_test, L_test,
                                              batch_size=batch_size,
                                              create_bit_vector=True,
                                              nclasses=ds.get_metadata()["num_classes"])

    # Return accuracy of test batches on the best MLP.
    return best_mlp.get_accuracy(test_data, test_labels)

In [52]:
import som
def run_som_on_ds(ds,ndim=16,nepochs=10,*,validation = False, **kwargs):
    
    # Get training data.
    (X_train, L_train) = ds.get_train()
    L_train = list(L_train)

    # Get test data.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    L_test = list(L_test)

    # Train a som.
    trained_som = som.SOM(X_train,nepochs=nepochs,ndim=ndim)

    # Get confusion matrix on test set and get accuracy.
    _, conf = som.SOM_Test(trained_som,X_train,L_train,X_test,L_test,ds.get_metadata()["num_classes"],ndim=ndim)
    accuracy = (np.sum(np.eye(conf.shape[0])*conf)/np.sum(conf)).item()
    return accuracy

In [53]:
from sklearn.svm import SVC
def run_svm_on_ds(ds,kernel='rbf',*,validation = False, **kwargs):
    
    # Get training data.
    (X,Y) = ds.get_train()
    
    # Fit training data.
    clf = SVC(kernel=kernel).fit(X,Y)
    
    # Get test data.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    
    # Get predictions and return accuracy.
    Y_pred = clf.predict(X_test)
    return (sum(Y_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0].item()

In [54]:
from sklearn.neighbors import KNeighborsClassifier
def run_knn_on_ds(ds,n_neighbors=5,weights='uniform',*,validation = False, **kwargs):
    
    # Get training data.
    (X,Y) = ds.get_train()
    
    # Fit training data.
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights).fit(X,Y)
    
    # Get test data.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    
    # Get predictions and return accuracy.
    Y_pred = clf.predict(X_test)
    return (sum(Y_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0].item()

In [55]:
from sklearn.ensemble import RandomForestClassifier
def run_random_forest_on_ds(ds,n_estimators=100,max_depth=None,*,validation = False, **kwargs):

    # Get training data.
    (X_train,L_train) = ds.get_train()
    
    # Fit training data.
    clf = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth).fit(X_train,L_train)
    
    # Get test data.
    if validation:
        (X_test,L_test) = ds.get_val()
    else:
        (X_test,L_test) = ds.get_test()
    
    # Get predictions and return accuracy.
    L_pred = clf.predict(X_test)
    return (sum(L_pred.squeeze()==L_test.squeeze())/L_test.squeeze().shape)[0].item()

In [56]:
from time import perf_counter
from tqdm import tqdm

def validate(ds,func,seeds, display_bar = True,**kwargs):
    '''
        Runs model with multiple hyperparameters multiple times and finds the best parameters. And evaluates on the test set.
    Args:
        ds: Dataset to evaluate on.
        func: Function that returns accuracy for a certain model.
        seeds: Which seeds to use for evalutaion
        **kwargs: Dictionary with hyperparameters and list of values to run. 
    Return:
        accuracy: Returns the accuracy.
        best_kwargs: Returns the parameters with best accuracy.
    '''


    keys = list(kwargs.keys())
    indexes = [0]*len(keys)

    # Gets total number of permutations.
    prod = 1
    for e in [len(l) for l in list(kwargs.values())]:
        prod *= e
    if display_bar:
        bar = tqdm(total=prod)

    # Init best parameters.
    best_accuracy = 0
    best_time = float('inf')
    best_kwargs = {}

    while True:
        # Get permutation to try.
        new_kwargs = dict([(key,kwargs[key][index]) for key,index in zip(keys,indexes)])

        # Average the accuracy over 3 runs.
        t1 = perf_counter()
        sum_accuracy = 0
        samples_for_val = 3
        for seed in seeds[:samples_for_val]:
            np.random.seed(seed)
            random.seed(seed)
            new_kwargs['seed']=seed
            try:
                sum_accuracy += func(ds,**new_kwargs, validation = True)
            except ValueError as e:
                print("Value error in function,excluding from average: ", str(e))
        

        time = (perf_counter()-t1)/samples_for_val
        accuracy = sum_accuracy/samples_for_val

        # Update best if current accuracy is better than best, if equal take model with least time.
        if accuracy>best_accuracy: #or (accuracy==best_accuracy and time<best_time):
            best_accuracy=accuracy
            best_time=time
            best_kwargs = new_kwargs

        # Update progressbar.
        if display_bar:
            bar.update(1)

        # Break if no premutations.
        if len(indexes)==0:
            break
        
        # Increase index array in right way.
        indexes[0]+=1
        for i in range(len(indexes)-1):
            if indexes[i]>=len(kwargs[keys[i]]):
                indexes[i]=0
                indexes[i+1] += 1
        # Break loop when finished.
        if indexes[-1]>=len(kwargs[keys[-1]]):
            break
    
    # Average accuracies on best kwargs, for the test set.
    sum_accuracy = 0
    for seed in seeds:
        np.random.seed(seed)
        random.seed(seed)
        try:
            sum_accuracy += func(ds,**best_kwargs, validation = False)
        except ValueError as e:
            print("Value error in function,excluding from average: ", str(e))
    accuracy = sum_accuracy/len(seeds)
        
    return accuracy, best_kwargs, best_time

In [57]:
dataset_ids = [12, 14, 15, 16, 17, 19, 22, 27, 28, 30, 32, 33, 39, 42, 43, 44, 45, 50, 52, 53, 69, 70, 74, 78, 83, 90, 95, 96, 101, 107, 109, 110, 111, 143, 144, 145, 147, 149, 151, 161, 174, 176, 186, 212, 225, 244, 257, 267, 277, 292, 329, 336, 342, 379, 419, 426, 445, 451, 503, 519, 529, 544, 545, 547, 563, 565, 582, 697, 732, 759, 850, 857, 863, 878, 887, 915, 936]

print("number of datasets:",len(dataset_ids))
print()

sum_accuracy_SVM = 0
sum_accuracy_SOM = 0
sum_accuracy_ANN = 0
sum_accuracy_KMEANS = 0
sum_accuracy_KMEANS_RANDOM = 0
sum_accuracy_BIRCH = 0
sum_accuracy_RFC = 0
sum_accuracy_KNN = 0

results_accuracy = {}
seeds = [1234,2345,3456,4567,5678]
for i, id in enumerate(dataset_ids):
    
    np.random.seed(seeds[0])
    random.seed(seeds[0])
    print("id:",id)
    ds = GenericDataset(id, splits=(0.5, 0.3, 0.2), show_info=False)
    print("name:",ds.get_metadata()['name'],"\t","num_instances:",ds.get_metadata()['num_instances'],"\t","num_classes:",ds.get_metadata()['num_classes'],"\t","num_features:",ds.get_metadata()['num_features'])

    dataset_dict = {}
    
    accuracy, best_params, best_time = validate(ds,run_svm_on_ds,seeds, display_bar=False,**{'kernel':['rbf','poly','linear']})
    sum_accuracy_SVM += accuracy
    dataset_dict['SVM'] = [accuracy, best_params, best_time]
    print("SVM:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_som_on_ds,seeds, display_bar=False,**{'ndim':[8,16],'nepochs':[25]})
    sum_accuracy_SOM += accuracy
    dataset_dict['SOM'] = [accuracy, best_params, best_time]
    print("SOM:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_ann_on_ds,seeds, display_bar=False,**{'nepochs':[40],'batch_size':[7,10],'learning_rate':[0.05,0.10],'layer_configuration':[[100,100],[50]]})
    sum_accuracy_ANN += accuracy
    dataset_dict['ANN'] = [accuracy, best_params, best_time]
    print("ANN:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_kmeans_on_ds,seeds, display_bar=False)
    sum_accuracy_KMEANS += accuracy
    dataset_dict['KMEANS'] = [accuracy, best_params, best_time]
    print("KMEANS:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_kmeans_random_on_ds,seeds, display_bar=False)
    sum_accuracy_KMEANS_RANDOM += accuracy
    dataset_dict['KMEANS_RANDOM'] = [accuracy, best_params, best_time]
    print("KMEANS_RANDOM:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_birch_on_ds,seeds, display_bar=False,**{'branching_factor':[25,50,75], 'threshold':[0.25,0.5,0.75]})
    sum_accuracy_BIRCH += accuracy
    dataset_dict['BIRCH'] = [accuracy, best_params, best_time]
    print("BIRCH:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_random_forest_on_ds,seeds, display_bar=False,**{'max_depth':[None,3,5,7,15],'n_estimators':[200,100,10]})
    sum_accuracy_RFC += accuracy
    dataset_dict['RFC'] = [accuracy, best_params, best_time]
    print("RFC:",str(round(accuracy*1000)/10) + "%", end="\t")

    accuracy, best_params, best_time = validate(ds,run_knn_on_ds,seeds, display_bar=False,**{'n_neighbors':[3,5,7,11,17],'weights':['uniform','distance']})
    sum_accuracy_KNN += accuracy
    dataset_dict['KNN'] = [accuracy, best_params, best_time]
    print("KNN:",str(round(accuracy*1000)/10) + "%")

    results_accuracy[ds.get_metadata()['name']] = dataset_dict
    print()
    avg_accuracy_SVM = sum_accuracy_SVM/(i+1)
    avg_accuracy_SOM = sum_accuracy_SOM/(i+1)
    avg_accuracy_ANN = sum_accuracy_ANN/(i+1)
    avg_accuracy_KMEANS = sum_accuracy_KMEANS/(i+1)
    avg_accuracy_KMEANS_RANDOM = sum_accuracy_KMEANS_RANDOM/(i+1)
    avg_accuracy_BIRCH = sum_accuracy_BIRCH/(i+1)
    avg_accuracy_RFC = sum_accuracy_RFC/(i+1)
    avg_accuracy_KNN = sum_accuracy_KNN/(i+1)

    if i%5==0:
        print("---Supervised---")
        
        print("Average accuracy SVM:", str(round(avg_accuracy_SVM*1000)/10)+"%")
        print("Average accuracy RFC:", str(round(avg_accuracy_RFC*1000)/10)+"%")
        print("Average accuracy KNN:", str(round(avg_accuracy_KNN*1000)/10)+"%")
        print("Average accuracy ANN:", str(round(avg_accuracy_ANN*1000)/10)+"%")
    
        print("---Unsupervised---")
    
        print("Average accuracy SOM:", str(round(avg_accuracy_SOM*1000)/10)+"%")
        print("Average accuracy KMEANS:", str(round(avg_accuracy_KMEANS*1000)/10)+"%")
        print("Average accuracy KMEANS_RANDOM:", str(round(avg_accuracy_KMEANS_RANDOM*1000)/10)+"%")
        print("Average accuracy BIRCH:", str(round(avg_accuracy_BIRCH*1000)/10)+"%")
    
        print()

avg_accuracy_SVM = sum_accuracy_SVM/len(dataset_ids)
avg_accuracy_SOM = sum_accuracy_SOM/len(dataset_ids)
avg_accuracy_ANN = sum_accuracy_ANN/len(dataset_ids)
avg_accuracy_KMEANS = sum_accuracy_KMEANS/len(dataset_ids)
avg_accuracy_KMEANS_RANDOM = sum_accuracy_KMEANS_RANDOM/len(dataset_ids)
avg_accuracy_BIRCH = sum_accuracy_BIRCH/len(dataset_ids)
avg_accuracy_RFC = sum_accuracy_RFC/len(dataset_ids)
avg_accuracy_KNN = sum_accuracy_KNN/len(dataset_ids)

print("\n"*3)
print("---Supervised---")
print("Average accuracy SVM:", str(round(avg_accuracy_SVM*1000)/10)+"%")
print("Average accuracy Random Forest Classifier:", str(round(avg_accuracy_RFC*1000)/10)+"%")
print("Average accuracy KNN:", str(round(avg_accuracy_KNN*1000)/10)+"%")
print("Average accuracy ANN:", str(round(avg_accuracy_ANN*1000)/10)+"%")
print("---Unsupervised---")
print("Average accuracy SOM:", str(round(avg_accuracy_SOM*1000)/10)+"%")
print("Average accuracy KMEANS:", str(round(avg_accuracy_KMEANS*1000)/10)+"%")
print("Average accuracy KMEANS_RANDOM:", str(round(avg_accuracy_KMEANS_RANDOM*1000)/10)+"%")
print("Average accuracy BIRCH:", str(round(avg_accuracy_BIRCH*1000)/10)+"%")

number of datasets: 77

id: 12
name: Balance Scale 	 num_instances: 625 	 num_classes: 3 	 num_features: 4
SVM: 94.4%	SOM: 73.6%	ANN: 95.0%	KMEANS: 66.4%	KMEANS_RANDOM: 72.0%	BIRCH: 64.8%	RFC: 92.2%	KNN: 92.8%

---Supervised---
Average accuracy SVM: 94.4%
Average accuracy RFC: 92.2%
Average accuracy KNN: 92.8%
Average accuracy ANN: 95.0%
---Unsupervised---
Average accuracy SOM: 73.6%
Average accuracy KMEANS: 66.4%
Average accuracy KMEANS_RANDOM: 72.0%
Average accuracy BIRCH: 64.8%

id: 14
name: Breast Cancer 	 num_instances: 286 	 num_classes: 2 	 num_features: 39
SVM: 77.6%	SOM: 77.6%	ANN: 75.0%	KMEANS: 75.9%	KMEANS_RANDOM: 75.9%	BIRCH: 75.9%	RFC: 79.7%	KNN: 75.9%

id: 15
name: Breast Cancer Wisconsin (Original) 	 num_instances: 699 	 num_classes: 3 	 num_features: 9
SVM: 96.4%	SOM: 96.4%	ANN: 97.0%	KMEANS: 97.1%	KMEANS_RANDOM: 97.1%	BIRCH: 95.6%	RFC: 96.4%	KNN: 97.1%

id: 16
name: Breast Cancer Wisconsin (Prognostic) 	 num_instances: 198 	 num_classes: 2 	 num_features: 33
SVM: 76.9%

  1 / (1 + np.exp(-X)), # For positive values


ANN: 96.7%	KMEANS: 91.4%	KMEANS_RANDOM: 91.2%	BIRCH: 91.8%	RFC: 96.9%	KNN: 96.1%

id: 83
name: Primary Tumor 	 num_instances: 339 	 num_classes: 22 	 num_features: 17
SVM: 37.0%	SOM: 18.5%	ANN: 38.1%	KMEANS: 29.6%	KMEANS_RANDOM: 29.6%	BIRCH: 25.9%	RFC: 34.8%	KNN: 37.0%

id: 90
name: Soybean (Large) 	 num_instances: 307 	 num_classes: 15 	 num_features: 35
SVM: 85.2%	SOM: 50.0%	ANN: 79.6%	KMEANS: 57.8%	KMEANS_RANDOM: 57.4%	BIRCH: 70.4%	RFC: 89.3%	KNN: 81.5%

---Supervised---
Average accuracy SVM: 81.4%
Average accuracy RFC: 83.4%
Average accuracy KNN: 78.8%
Average accuracy ANN: 80.3%
---Unsupervised---
Average accuracy SOM: 68.2%
Average accuracy KMEANS: 66.7%
Average accuracy KMEANS_RANDOM: 66.6%
Average accuracy BIRCH: 66.7%

id: 95
name: SPECT Heart 	 num_instances: 267 	 num_classes: 2 	 num_features: 22
SVM: 74.1%	SOM: 63.0%	ANN: 80.0%	KMEANS: 72.2%	KMEANS_RANDOM: 72.2%	BIRCH: 72.2%	RFC: 81.5%	KNN: 85.2%

id: 96
name: SPECTF Heart 	 num_instances: 267 	 num_classes: 2 	 num_featur

In [58]:
keys = list(results_accuracy.keys())
models = list(results_accuracy[keys[0]].keys())
sums = dict([(model,0) for model in models])

for key in keys:
    for model in models:
        sums[model] += results_accuracy[key][model][0]

avg = dict([(model,sums[model]/len(results_accuracy)) for model in models])
print(avg)

{'SVM': 0.8124235137220487, 'SOM': 0.6882077838700866, 'ANN': 0.8068783755255902, 'KMEANS': 0.6746514312832327, 'KMEANS_RANDOM': 0.6722540107061312, 'BIRCH': 0.6731239221554253, 'RFC': 0.8273331976744414, 'KNN': 0.789264983736093}


In [59]:
import datetime
import json 
  
with open('result_'+datetime.datetime.now().strftime("%I_%M_%d_%m_%Y")+'.json', 'w') as file: 
     file.write(json.dumps(results_accuracy))