In [1]:
import os
import numpy as np
import random
import csv
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from dataset import dataset
from knn import knn
from enn import enn
from kmeans import kmeans

In [3]:
def process_all(user: str, shuffle_split: bool, data_dir=None):
    """
    Load, preprocess, save and return the six project datasets.

    Every dataset goes through the same pipeline: its dataset-specific
    preparation method(s), then shuffle(), sort(<task>), split() and fold().
    All six dataset objects are constructed before any preprocessing starts,
    and every stage walks the datasets in the same fixed order
    (abalone, cancer, fire, glass, machine, soybean).

    Parameters
    ----------
    user : str
        Account name used to build the default data directory
        '/home/<user>/CSCI_447/Project_2/Datasets'. Ignored when `data_dir`
        is given.
    shuffle_split : bool
        When true, shuffle_splits() is called on every dataset after folding.
    data_dir : str, optional
        Directory that contains the raw '*.data' files. Defaults to the
        per-user path described above, so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    tuple
        (abalone_data, cancer_data, fire_data, glass_data, machine_data,
        soybean_data), each a processed `dataset` instance.
    """
    if data_dir is None:
        data_dir = '/home/' + user + '/CSCI_447/Project_2/Datasets'

    # (save name, raw file name, dataset-specific preparation methods, task)
    specs = (
        ('abalone', 'abalone.data', ('continuize',), 'regression'),
        ('cancer', 'breast-cancer-wisconsin.data', ('remove_attribute', 'impute'), 'classification'),
        ('fire', 'forestfires.data', ('continuize',), 'regression'),
        ('glass', 'glass.data', ('remove_attribute',), 'classification'),
        ('machine', 'machine.data', ('continuize',), 'regression'),
        ('soybean', 'soybean-small.data', ('continuize',), 'classification'),
    )

    # NOTE(review): the meaning of the second constructor argument (False) is
    # defined in dataset.py and is not visible from this notebook.
    datasets = [
        dataset(os.path.join(data_dir, file_name), False)
        for _, file_name, _, _ in specs
    ]

    for data, (_, _, prep_steps, task) in zip(datasets, specs):
        for step in prep_steps:
            getattr(data, step)()
        data.shuffle()
        data.sort(task)
        data.split()
        data.fold()

    if shuffle_split:
        for data in datasets:
            data.shuffle_splits()

    for data, (save_name, _, _, _) in zip(datasets, specs):
        data.save(save_name)

    return tuple(datasets)

In [4]:
# Build all six datasets once (shuffle_splits() enabled); the model cells
# below reuse these objects.
(
    abalone_data,
    cancer_data,
    fire_data,
    glass_data,
    machine_data,
    soybean_data,
) = process_all(user='carlthedog3', shuffle_split=True)

In [None]:
# Test code to see if all methods are outputting a 10x2 array.
# Uses one classification dataset (cancer) and one regression dataset (fire).

cancer_kmeans = kmeans(cancer_data, 'classification')
fire_kmeans = kmeans(fire_data, 'regression')
cancer_knn = knn(cancer_data, 'classification')
fire_knn = knn(fire_data, 'regression')

cancer_kmeans.tune()
fire_kmeans.tune()
cancer_knn.tune(25)
fire_knn.tune(15)

# ENN is seeded with KNN's k_n and sigma, so it must be built only after
# knn.tune() has run; this matches the order used in the full KNN/ENN
# experiment cell. Building it earlier hands ENN the pre-tuning values.
cancer_enn = enn(cancer_data, 'classification', k_n=cancer_knn.k_n, sigma=cancer_knn.sigma)
fire_enn = enn(fire_data, 'regression', k_n=fire_knn.k_n, sigma=fire_knn.sigma)

cancer_enn.tune(15)
fire_enn.tune(10)

cancer_kmeans_results = cancer_kmeans.classify()
fire_kmeans_results = fire_kmeans.regress()
cancer_knn_results = cancer_knn.classify()
fire_knn_results = fire_knn.regress()
cancer_enn_results = cancer_enn.classify()
fire_enn_results = fire_enn.regress()

print(f"Cancer KNN Loss:\n{cancer_knn_results}")
print(f"Fire KNN Loss:\n{fire_knn_results}")
print(f"Cancer ENN Loss:\n{cancer_enn_results}")
print(f"Fire ENN Loss:\n{fire_enn_results}")
print(f"Cancer K-Means Loss:\n{cancer_kmeans_results}")
print(f"Fire K-Means Loss:\n{fire_kmeans_results}")

In [None]:

# K-Means on all six datasets: build the models, tune them, then evaluate.
cancer_kmeans = kmeans(cancer_data, 'classification')
glass_kmeans = kmeans(glass_data, 'classification')
soybean_kmeans = kmeans(soybean_data, 'classification')
abalone_kmeans = kmeans(abalone_data, 'regression')
fire_kmeans = kmeans(fire_data, 'regression')
machine_kmeans = kmeans(machine_data, 'regression')

cancer_kmeans.tune()
glass_kmeans.tune()
soybean_kmeans.tune()
abalone_kmeans.tune()
fire_kmeans.tune()
machine_kmeans.tune()

# Classification datasets use classify(); regression datasets use regress().
cancer_kmeans_results = cancer_kmeans.classify()
glass_kmeans_results = glass_kmeans.classify()
soybean_kmeans_results = soybean_kmeans.classify()
abalone_kmeans_results = abalone_kmeans.regress()
fire_kmeans_results = fire_kmeans.regress()
machine_kmeans_results = machine_kmeans.regress()

# Used to see if kmeans cancer ran without issues
print(f"Cancer K-Means Loss:\n{cancer_kmeans_results}")

# The remaining K-Means results are printed by the KNN/ENN cell below.
# They are disabled here with real comments rather than a trailing string
# literal, which Jupyter would echo as this cell's output value.
# print(f"Glass K-Means Loss:\n{glass_kmeans_results}")
# print(f"Soybean K-Means Loss:\n{soybean_kmeans_results}")
# print(f"Abalone K-Means Loss:\n{abalone_kmeans_results}")
# print(f"Fire K-Means Loss:\n{fire_kmeans_results}")
# print(f"Machine K-Means Loss:\n{machine_kmeans_results}")

In [None]:
# ---- KNN: one model per dataset (classification first, then regression) ----
cancer_knn, glass_knn, soybean_knn = (
    knn(cancer_data, 'classification'),
    knn(glass_data, 'classification'),
    knn(soybean_data, 'classification'),
)
abalone_knn, fire_knn, machine_knn = (
    knn(abalone_data, 'regression'),
    knn(fire_data, 'regression'),
    knn(machine_data, 'regression'),
)

cancer_knn.tune(25)
glass_knn.tune(15)
soybean_knn.tune(10)
abalone_knn.tune(5)
fire_knn.tune(15)
machine_knn.tune(10)

# ---- ENN: built after KNN tuning, seeded with each KNN model's k_n / sigma ----
cancer_enn, glass_enn, soybean_enn = (
    enn(cancer_data, 'classification', k_n=cancer_knn.k_n, sigma=cancer_knn.sigma),
    enn(glass_data, 'classification', k_n=glass_knn.k_n, sigma=glass_knn.sigma),
    enn(soybean_data, 'classification', k_n=soybean_knn.k_n, sigma=soybean_knn.sigma),
)
abalone_enn, fire_enn, machine_enn = (
    enn(abalone_data, 'regression', k_n=abalone_knn.k_n, sigma=abalone_knn.sigma),
    enn(fire_data, 'regression', k_n=fire_knn.k_n, sigma=fire_knn.sigma),
    enn(machine_data, 'regression', k_n=machine_knn.k_n, sigma=machine_knn.sigma),
)

cancer_enn.tune(15)
glass_enn.tune(10)
soybean_enn.tune(10)
abalone_enn.tune(5)
fire_enn.tune(10)
machine_enn.tune(25)

# K-Means construction and tuning are disabled in this cell; the K-Means
# objects and results used further down come from the K-Means cell above.
# cancer_kmeans = kmeans(cancer_data,'classification')
# glass_kmeans = kmeans(glass_data,'classification')
# soybean_kmeans = kmeans(soybean_data,'classification')
# abalone_kmeans = kmeans(abalone_data, 'regression')
# fire_kmeans = kmeans(fire_data, 'regression')
# machine_kmeans = kmeans(machine_data, 'regression')
#
# cancer_kmeans.tune()
# glass_kmeans.tune()
# soybean_kmeans.tune()
# abalone_kmeans.tune()
# fire_kmeans.tune()
# machine_kmeans.tune()

# ---- Evaluate: classify() for classification sets, regress() for regression ----
cancer_knn_results, glass_knn_results, soybean_knn_results = (
    cancer_knn.classify(),
    glass_knn.classify(),
    soybean_knn.classify(),
)
abalone_knn_results, fire_knn_results, machine_knn_results = (
    abalone_knn.regress(),
    fire_knn.regress(),
    machine_knn.regress(),
)

cancer_enn_results, glass_enn_results, soybean_enn_results = (
    cancer_enn.classify(),
    glass_enn.classify(),
    soybean_enn.classify(),
)
abalone_enn_results, fire_enn_results, machine_enn_results = (
    abalone_enn.regress(),
    fire_enn.regress(),
    machine_enn.regress(),
)

# cancer_kmeans_results = cancer_kmeans.classify()
# glass_kmeans_results = glass_kmeans.classify()
# soybean_kmeans_results = soybean_kmeans.classify()
# abalone_kmeans_results = abalone_kmeans.regress()
# fire_kmeans_results = fire_kmeans.regress()
# machine_kmeans_results = machine_kmeans.regress()

# ---- Report ----
print(f"Cancer KNN Loss:\n{cancer_knn_results}")
print(f"Glass KNN Loss:\n{glass_knn_results}")
print(f"Soybean KNN Loss:\n{soybean_knn_results}")
print(f"Abalone KNN Loss:\n{abalone_knn_results}")
print(f"Fire KNN Loss:\n{fire_knn_results}")
print(f"Machine KNN Loss:\n{machine_knn_results}")

print(f"Cancer ENN Loss:\n{cancer_enn_results}")
print(f"Glass ENN Loss:\n{glass_enn_results}")
print(f"Soybean ENN Loss:\n{soybean_enn_results}")
print(f"Abalone ENN Loss:\n{abalone_enn_results}")
print(f"Fire ENN Loss:\n{fire_enn_results}")
print(f"Machine ENN Loss:\n{machine_enn_results}")

# These rely on the *_kmeans_results variables produced by the K-Means cell.
print(f"Cancer K-Means Loss:\n{cancer_kmeans_results}")
print(f"Glass K-Means Loss:\n{glass_kmeans_results}")
print(f"Soybean K-Means Loss:\n{soybean_kmeans_results}")
print(f"Abalone K-Means Loss:\n{abalone_kmeans_results}")
print(f"Fire K-Means Loss:\n{fire_kmeans_results}")
print(f"Machine K-Means Loss:\n{machine_kmeans_results}")


In [None]:
def make_plots(f1_scores, loss_scores, dataset_names, figure_size, rotation_val, num_models=3):
    '''
    This function creates the boxplots shown in our submitted paper.

    Two figures are drawn, one from `f1_scores` and one from `loss_scores`.
    In each figure the boxes are grouped per dataset: the first `num_models`
    entries belong to the first dataset, the next `num_models` to the second,
    and so on. One x tick per dataset is placed at the centre of its group.

    Parameters
    ----------
    f1_scores, loss_scores : indexable sequence of score collections
        One entry per (dataset, model) pair, dataset-major. Each must have
        exactly len(dataset_names) * num_models entries.
    dataset_names : sequence of str
        Tick labels, one per dataset.
    figure_size : tuple
        Passed to plt.figure(figsize=...).
    rotation_val : number
        Rotation applied to the x tick labels.
    num_models : int, optional
        Number of boxes per dataset. Defaults to 3 (KNN, ENN, K-Means).

    Raises
    ------
    ValueError
        If either score sequence does not have
        len(dataset_names) * num_models entries.
    '''
    num_datasets = len(dataset_names)
    expected = num_datasets * num_models

    # Fail early with a readable message instead of matplotlib's generic
    # complaint about the positions length.
    for arg_name, scores in (('f1_scores', f1_scores), ('loss_scores', loss_scores)):
        if len(scores) != expected:
            raise ValueError(
                f"{arg_name} has {len(scores)} entries, but {num_datasets} dataset(s) "
                f"with {num_models} model(s) each require {expected}"
            )

    width = 0.25
    spacing = 1.0

    # Each dataset owns a slot of spacing * num_models on the x axis; its
    # boxes sit side by side from the slot's left edge.
    group_starts = [i * spacing * num_models for i in range(num_datasets)]
    positions = [start + m * width for start in group_starts for m in range(num_models)]
    tick_positions = [start + (num_models - 1) * width / 2 for start in group_starts]

    _grouped_boxplot(
        f1_scores, positions, width, tick_positions, dataset_names,
        figure_size, rotation_val,
        face_color='lightblue', line_color='blue',
        ylabel='Average F1 Scores Across Classes',
        title='Average F1 Scores Across Datasets (Higher is Better)',
    )
    _grouped_boxplot(
        loss_scores, positions, width, tick_positions, dataset_names,
        figure_size, rotation_val,
        face_color='lightcoral', line_color='red',
        ylabel='0/1 Loss Score',
        title='0/1 Loss Scores Across Datasets (Lower is Better)',
    )


def _grouped_boxplot(scores, positions, width, tick_positions, tick_labels,
                     figure_size, rotation_val, face_color, line_color, ylabel, title):
    '''Draw and show one grouped boxplot figure with per-dataset tick labels.'''
    data = [scores[i] for i in range(len(scores))]

    plt.figure(figsize=figure_size)
    plt.boxplot(data, positions=positions, widths=width, patch_artist=True,
                boxprops=dict(facecolor=face_color), medianprops=dict(color=line_color),
                whiskerprops=dict(color=line_color), capprops=dict(color=line_color),
                flierprops=dict(markerfacecolor=line_color, marker='o'))
    # Replace the per-box ticks with one label per dataset.
    plt.xticks(tick_positions, tick_labels, rotation=rotation_val)
    plt.xlabel('Datasets')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.show()

# Example usage (kept as comments so that running this cell only defines
# make_plots; a trailing string literal would be echoed as the cell output):
# f1_scores = [np.random.rand(10) for _ in range(9)]  # Mock data for 9 arrays
# loss_scores = [np.random.rand(10) for _ in range(9)]  # Mock data for 9 arrays
# dataset_names = ['Dataset 1', 'Dataset 2', 'Dataset 3']
# make_plots(f1_scores, loss_scores, dataset_names, (10, 6), 45)