In [111]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_mutual_info_score
import random

# random.seed() only controls Python's built-in `random` module. NumPy keeps
# its own, separate global RNG state, so it is seeded here as well; otherwise
# any code drawing from np.random.* produces different results on every run.
random.seed(7)
np.random.seed(7)

In [112]:
# Location of the training CSV. Update this to wherever the file lives on
# your machine before running the notebook.
# Alternative dataset, kept for reference:
#TRAIN_PATH = "../augmented_train/reformatted_train_106800_samples.csv"
TRAIN_PATH = "train_15000_samples_fft_0_to_10_hz_consensus_1.0_balanced.csv"

# Load the CSV into a DataFrame, then expose it as a NumPy array so the
# following cells can slice it positionally.
data = pd.read_csv(filepath_or_buffer=TRAIN_PATH)
data_np = data.to_numpy()

In [113]:
# Split the raw array into features (every column except the last) and the
# label (last column), encode the labels as integers, then build a
# reproducible 80/10/10 train/validation/test split.
num_input = data_np.shape[1] - 1

# The label column holds strings, so slices of data_np are presumably
# object-dtype; cast the features to float once here instead of leaving the
# conversion to every downstream call.
# NOTE(review): assumes every feature column is numeric — confirm.
x = data_np[:, 0:num_input].astype(np.float64)
y = data_np[:, num_input]

# Map labels to numerical values
label_mapping = {'GPD': 0, 'GRDA': 1, 'LPD': 2, 'LRDA': 3, 'Seizure': 4, 'Other': 5}

# Fail with the full list of offending labels rather than a bare KeyError on
# the first one encountered.
unknown_labels = set(y) - set(label_mapping)
if unknown_labels:
    raise KeyError(f"Labels missing from label_mapping: {sorted(map(str, unknown_labels))}")
y = np.array([label_mapping[label] for label in y])

# Shuffle the dataset based on sample indices.
# A locally seeded Generator is used instead of the global np.random state:
# the split is then identical on every run AND on every re-execution of this
# cell, so the reported scores are reproducible.
split_rng = np.random.default_rng(7)
shuffled_indices = split_rng.permutation(x.shape[0])

# Choose the first 80% as training set, next 10% as validation and the rest as testing
train_split_idx = int(0.80 * x.shape[0])
val_split_idx = int(0.90 * x.shape[0])

train_indices = shuffled_indices[0:train_split_idx]
val_indices = shuffled_indices[train_split_idx:val_split_idx]
test_indices = shuffled_indices[val_split_idx:]

# Select the examples from x and y to construct our training, validation, testing sets
x_train, y_train = x[train_indices, :], y[train_indices]
x_val, y_val = x[val_indices, :], y[val_indices]
x_test, y_test = x[test_indices, :], y[test_indices]

In [114]:
# Fit a single KMeans model and score it on the train/validation/test splits.
n_clusters = 6
kmeans = KMeans(n_clusters=n_clusters, random_state=7)
kmeans.fit(x_train)

# Training silhouette uses the cluster assignments produced during fitting.
train_labels = kmeans.labels_
score_train = silhouette_score(x_train, train_labels)

# Validation samples are assigned to clusters by the already-fitted model.
val_labels = kmeans.predict(x_val)
score_val = silhouette_score(x_val, val_labels)

summary_template = '{} Clusters - Training silhouette score: {:0.5f}  Validation silhouette score: {:0.5f}'
print(summary_template.format(n_clusters, score_train, score_val))

print(f"The best model has {n_clusters} clusters")

# Held-out evaluation: the silhouette score measures cluster geometry only,
# while the two label-based scores compare cluster assignments against the
# ground-truth classes in y_test.
test_labels = kmeans.predict(x_test)

test_silhouette = silhouette_score(x_test, test_labels)
test_adj_rand = adjusted_rand_score(y_test, test_labels)
# Note: this is the *adjusted* mutual information, despite the shorter label
# used in the printed message.
test_mi = adjusted_mutual_info_score(y_test, test_labels)

# Report the three test metrics, one per line.
for description, value in (
    ("The test silhouette score for the best model:", test_silhouette),
    ("The test adjusted rand score for the best model:", test_adj_rand),
    ("The test mutual information score for the best model:", test_mi),
):
    print(description, value)

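# As the results below show, the adjusted Rand and adjusted mutual information scores are both close to zero, so there is no indication that the clustering algorithm learned the same classes as our ground truths, even though the silhouette scores are high.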

6 Clusters - Training silhouette score: 0.95097  Validation silhouette score: 0.95582
The best model has 6 clusters
The test silhouette score for the best model: 0.9560952240580038
The test adjusted rand score for the best model: 0.0005908856647565565
The test mutual information score for the best model: 0.009274057293535348
