<a href="https://colab.research.google.com/github/Ashail33/Masters-work/blob/master/Call_all_functions_and_run_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score, adjusted_rand_score, normalized_mutual_info_score
from scipy.spatial.distance import cdist
import time
import psutil
import os
import tracemalloc
from google.colab import drive
import glob
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
import json

#evaluation code
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    After the block exits, ``interval`` holds the elapsed time in (fractional)
    seconds as reported by ``time.monotonic()``.
    """

    def __enter__(self):
        """Remember the monotonic clock reading and hand back the timer."""
        self.start_time = time.monotonic()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Store the elapsed time; returns ``None`` so exceptions propagate."""
        finished_at = time.monotonic()
        self.interval = finished_at - self.start_time


class MemoryMonitor:
    """Context manager that snapshots the current process's memory on exit.

    ``process`` is the ``psutil.Process`` handle for this interpreter and
    ``memory_info`` is whatever ``process.memory_info()`` returns when the
    ``with`` block finishes.  No snapshot is taken on entry.
    """

    def __enter__(self):
        """Bind a ``psutil.Process`` for the running PID and return ``self``."""
        current_pid = os.getpid()
        self.process = psutil.Process(current_pid)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Record the memory info at exit; exceptions are not suppressed."""
        snapshot = self.process.memory_info()
        self.memory_info = snapshot


class DiskMonitor:
    """Context manager that reports system-wide disk I/O during its body.

    After the ``with`` block exits, ``disk_read_bytes`` and
    ``disk_write_bytes`` hold the difference between the counters sampled on
    exit and on entry.  The raw samples stay available as
    ``disk_read_start`` / ``disk_write_start`` / ``disk_read_end`` /
    ``disk_write_end``.

    The counters come from ``psutil.disk_io_counters()`` and are therefore
    system-wide, not limited to the current process.
    """

    @staticmethod
    def _snapshot():
        """Return ``(read_bytes, write_bytes)`` from a single counter sample.

        Reading both fields from one ``disk_io_counters()`` call keeps them
        consistent with each other.  ``disk_io_counters()`` can return
        ``None`` when no disks are visible to psutil; in that case both
        counters are reported as 0 so the resulting deltas are 0 instead of
        raising ``AttributeError``.
        """
        counters = psutil.disk_io_counters()
        if counters is None:
            return 0, 0
        return counters.read_bytes, counters.write_bytes

    def __enter__(self):
        """Sample the counters at the start of the block and return ``self``."""
        self.disk_read_start, self.disk_write_start = self._snapshot()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Sample the counters again and store the deltas.

        Returns ``None``, so an exception raised inside the block propagates.
        """
        self.disk_read_end, self.disk_write_end = self._snapshot()
        self.disk_read_bytes = self.disk_read_end - self.disk_read_start
        self.disk_write_bytes = self.disk_write_end - self.disk_write_start


def compactness(X, labels):
    """Return the summed distance from every sample to its nearest centroid.

    One centroid (the mean of the member rows) is computed per distinct value
    in ``labels``.  Each sample then contributes its distance, as computed by
    ``cdist`` with default settings, to the *closest* of those centroids.

    NOTE: the closest centroid is not necessarily the centroid of the cluster
    the sample was assigned to.

    Args:
        X: array of samples, one row per sample.
        labels: NumPy array of cluster labels aligned with the rows of ``X``.

    Returns:
        The sum of the per-sample minimum distances.
    """
    cluster_ids = set(labels.flatten())
    centroids = np.array([
        X[np.nonzero(labels == cluster_id)].mean(axis=0)
        for cluster_id in cluster_ids
    ])
    nearest_centroid_distance = cdist(X, centroids).min(axis=1)
    return nearest_centroid_distance.sum()


def separation(X, labels):
    """Return the sum of all pairwise distances between cluster centroids.

    One centroid (the mean of the member rows) is computed per distinct value
    in ``labels``.  The full centroid-by-centroid distance matrix is summed,
    so every unordered pair of centroids is counted twice (the matrix is
    symmetric) and the zero diagonal adds nothing.

    Args:
        X: array of samples, one row per sample.
        labels: NumPy array of cluster labels aligned with the rows of ``X``.

    Returns:
        The sum of the entries of the centroid distance matrix.
    """
    centroid_rows = []
    for cluster_id in set(labels.flatten()):
        members = X[np.nonzero(labels == cluster_id)]
        centroid_rows.append(members.mean(axis=0))
    centroids = np.array(centroid_rows)
    return cdist(centroids, centroids).sum()



def dunn_index(X, labels):
    """Return the Dunn index of a clustering.

    The Dunn index is the smallest distance between two samples that belong
    to different clusters divided by the largest cluster diameter, where a
    cluster's diameter is the largest distance between two of its members.
    Larger values indicate compact, well-separated clusters.

    Every distinct value in ``labels`` is treated as its own cluster, so the
    labels do not need to be the contiguous integers ``0..k-1``.

    Args:
        X: array of samples, one row per sample.
        labels: cluster label per sample; flattened to 1-D before use.

    Returns:
        The Dunn index, or ``np.inf`` when every cluster has zero diameter
        (for example when all clusters are single points).

    Raises:
        ValueError: if ``labels`` contains fewer than two distinct clusters,
            for which the index is undefined.
    """
    labels = np.asarray(labels).ravel()
    if np.unique(labels).size < 2:
        raise ValueError("Dunn index requires at least two clusters.")

    pairwise_distances = cdist(X, X)
    # True where the row sample and the column sample share a cluster.  The
    # diagonal is always True, so the intra-cluster selection is never empty.
    same_cluster = labels[:, None] == labels[None, :]

    min_inter_cluster_distance = np.min(pairwise_distances[~same_cluster])
    max_cluster_diameter = np.max(pairwise_distances[same_cluster])

    if max_cluster_diameter == 0:
        return np.inf
    return min_inter_cluster_distance / max_cluster_diameter

# code to evaluate the clustering algorithms
def evaluate_clustering(algorithm, dataset, true_labels=None, n_runs=10):
    """Run ``algorithm.fit_predict(dataset)`` repeatedly and summarise the runs.

    Each run records its runtime (``Timer``), the disk I/O deltas
    (``DiskMonitor``) and the memory currently traced by ``tracemalloc``.
    The internal quality metrics (``cp``, ``sp``, ``db``, ``silhouette``,
    ``calinski_harabasz``, ``dvi``) are scored from ``dataset`` and the
    predicted labels on every run.  The external metrics (``ari``, ``nmi``)
    compare the prediction with ``true_labels`` and are skipped when
    ``true_labels`` is ``None``.

    Args:
        algorithm: object exposing ``fit_predict(dataset)`` that returns one
            cluster label per sample.
        dataset: feature matrix handed to ``fit_predict`` and to the internal
            metrics.
        true_labels: optional reference labels; flattened to 1-D before use.
        n_runs: number of times the algorithm is fitted; must be >= 1.

    Returns:
        dict with these entries:

        * ``memory_usage``, ``memory_peak``, ``runtime``, ``disk_read``,
          ``disk_write``: lists with one value per run.  ``memory_peak`` is
          the tracemalloc peak since tracing started; it is not reset
          between runs.
        * ``<metric>_mean`` / ``_std`` / ``_max`` / ``_min`` for every scored
          metric: single-element lists.  A metric that raised ``ValueError``
          in a run contributes ``nan`` for that run.
        * ``memory_min`` / ``memory_max`` / ``memory_mean`` / ``memory_std``:
          single-element lists computed from ``memory_usage``.
        * ``runtime_mean`` / ``runtime_std`` / ``runtime_max`` /
          ``runtime_min``: plain scalars.

        Aggregates are converted to built-in ``int`` / ``float`` so the
        result can be passed to ``json.dump`` directly.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1.")

    results = defaultdict(list)

    metric_function_map = {
        'cp': compactness,
        'sp': separation,
        'db': davies_bouldin_score,
        'silhouette': silhouette_score,
        'calinski_harabasz': calinski_harabasz_score,
        'ari': adjusted_rand_score,
        'nmi': normalized_mutual_info_score,
        'dvi': dunn_index,
    }
    # These compare predicted labels with the reference labels; every other
    # metric is scored from the data and the predicted labels.
    external_metrics = {'ari', 'nmi'}

    reference_labels = None
    if true_labels is not None:
        reference_labels = np.asarray(true_labels).ravel()

    active_metrics = {
        metric_name: metric_function
        for metric_name, metric_function in metric_function_map.items()
        if metric_name not in external_metrics or reference_labels is not None
    }

    per_run_scores = []

    # Only start (and later stop) tracing when nobody else is already tracing,
    # so a caller's own tracemalloc session is left untouched.
    started_tracing = not tracemalloc.is_tracing()
    if started_tracing:
        tracemalloc.start()

    try:
        for _ in range(n_runs):
            with Timer() as t, DiskMonitor() as d:
                labels = algorithm.fit_predict(dataset)

            # Memory usage tracking
            current_memory, peak_memory = tracemalloc.get_traced_memory()
            results['memory_usage'].append(current_memory)
            results['memory_peak'].append(peak_memory)

            results['runtime'].append(t.interval)
            results['disk_read'].append(d.disk_read_bytes)
            results['disk_write'].append(d.disk_write_bytes)

            predicted_labels = np.asarray(labels).ravel()
            metric_results = {}

            # Calculate clustering performance metrics
            for metric_name, metric_function in active_metrics.items():
                try:
                    if metric_name in external_metrics:
                        value = metric_function(reference_labels, predicted_labels)
                    else:
                        value = metric_function(dataset, predicted_labels)
                except ValueError as e:
                    # e.g. a run that produced a single cluster; keep going
                    # and mark the metric as unavailable for this run.
                    print(f"Error calculating {metric_name}: {e}")
                    value = np.nan
                metric_results[metric_name] = value

            per_run_scores.append(metric_results)
    finally:
        # Always release the tracer we started, even if a run raised.
        if started_tracing:
            tracemalloc.stop()

    # Aggregate the results over all runs
    for metric_name in active_metrics:
        metric_values = [run_result[metric_name] for run_result in per_run_scores]
        results[f"{metric_name}_mean"].append(float(np.mean(metric_values)))
        results[f"{metric_name}_std"].append(float(np.std(metric_values)))
        results[f"{metric_name}_max"].append(float(np.max(metric_values)))
        results[f"{metric_name}_min"].append(float(np.min(metric_values)))

    # Calculate min, max, mean, and std of memory usage over all runs.
    # int()/float() turn NumPy scalars (np.int64 is not JSON serialisable)
    # into built-in numbers.
    memory_usages = np.array(results['memory_usage'])
    memory_stats = {
        'memory_min': int(np.min(memory_usages)),
        'memory_max': int(np.max(memory_usages)),
        'memory_mean': float(np.mean(memory_usages)),
        'memory_std': float(np.std(memory_usages)),
    }

    # Save memory stats
    for stat_name, stat_value in memory_stats.items():
        results[stat_name].append(stat_value)

    results['runtime_mean'] = float(np.mean(results['runtime']))
    results['runtime_std'] = float(np.std(results['runtime']))
    results['runtime_max'] = float(np.max(results['runtime']))
    results['runtime_min'] = float(np.min(results['runtime']))

    return dict(results)

In [None]:
#algorithm 1: weighted consensus clustering 

from sklearn.base import BaseEstimator, ClusterMixin
import numpy as np
import itertools
from sklearn.cluster import SpectralClustering

class ConsensusClustering(BaseEstimator, ClusterMixin):
    """Weighted consensus clustering built on a co-association matrix.

    Every base model clusters a random subsample (drawn with replacement) of
    the data.  Each pair of drawn samples that a model places in the same
    cluster adds that model's normalised weight to the pair's entry in a
    symmetric co-association matrix.  The final labels come from
    ``SpectralClustering`` run on that matrix as a precomputed affinity.

    Parameters:
        n_clusters: number of clusters requested from the base models and
            from the final spectral clustering step.
        model_funcs: sequence of model builders.  A class is instantiated as
            ``cls(n_clusters=..., random_state=...)``; any other callable is
            called as ``factory(n_clusters, random_state)``.  The result is
            either a callable mapping data to labels or an object with
            ``fit_predict``.
        weights: optional per-model weights (any sequence of numbers);
            equal weights are used when ``None``.
        sampling_rate: fraction of ``n_samples`` drawn for each base model.
        random_state: seed or ``RandomState`` used for the subsampling and
            the final spectral clustering, and forwarded to the builders.

    Attributes set by ``fit``:
        weights_: the weights normalised to sum to 1.
        final_labels_: consensus label per sample.
        labels_: same array as ``final_labels_`` (scikit-learn convention).
    """

    def __init__(self, n_clusters, model_funcs, weights=None, sampling_rate=0.8, random_state=None):
        self.n_clusters = n_clusters
        self.model_funcs = model_funcs
        self.weights = weights
        self.sampling_rate = sampling_rate
        self.random_state = random_state

    def _build_model(self, model_func):
        """Create one base model from an entry of ``model_funcs``.

        Estimator classes such as ``KMeans`` only accept ``random_state`` as
        a keyword, so classes are instantiated with keyword arguments.
        Other callables keep the positional ``(n_clusters, random_state)``
        convention.
        """
        if isinstance(model_func, type):
            return model_func(n_clusters=self.n_clusters, random_state=self.random_state)
        return model_func(self.n_clusters, self.random_state)

    def fit(self, X, y=None):
        """Build the co-association matrix and derive the consensus labels.

        Args:
            X: samples, indexable by an integer index array along axis 0.
            y: ignored; present for scikit-learn API compatibility.

        Returns:
            self

        Raises:
            ValueError: if the number of weights differs from the number of
                models.
        """
        # Local import so this block does not depend on the file's import list.
        from sklearn.utils import check_random_state

        n_samples = X.shape[0]
        n_models = len(self.model_funcs)

        # Work on a local copy: constructor parameters are left untouched and
        # list/tuple weights are accepted.
        if self.weights is None:
            weights = np.ones(n_models)
        else:
            weights = np.asarray(self.weights, dtype=float)

        if len(weights) != n_models:
            raise ValueError("The number of weights must be equal to the number of models.")

        weights = weights / np.sum(weights)
        self.weights_ = weights

        # With random_state=None this is NumPy's global RandomState, i.e. the
        # same stream np.random.choice would use.
        rng = check_random_state(self.random_state)
        n_sampled = int(n_samples * self.sampling_rate)

        co_association_matrix = np.zeros((n_samples, n_samples))

        for model_func, weight in zip(self.model_funcs, weights):
            model = self._build_model(model_func)
            sampled_indices = rng.choice(n_samples, n_sampled, replace=True)
            sampled_data = X[sampled_indices]

            if callable(model):
                labels = model(sampled_data)
            else:
                labels = model.fit_predict(sampled_data)
            labels = np.asarray(labels).ravel()

            for label in range(self.n_clusters):
                # Sampling is with replacement, so an original index can occur
                # several times in one cluster; ``counts`` holds how often.
                members, counts = np.unique(sampled_indices[labels == label], return_counts=True)
                if members.size == 0:
                    continue
                # Equivalent to adding ``weight`` to [x, y] and [y, x] for
                # every pair of draws in the cluster: an off-diagonal entry
                # grows by weight * c_x * c_y, a diagonal entry by
                # weight * c_x * (c_x - 1).
                co_association_matrix[np.ix_(members, members)] += weight * np.outer(counts, counts)
                co_association_matrix[members, members] -= weight * counts

        spectral_clustering = SpectralClustering(
            n_clusters=self.n_clusters,
            affinity='precomputed',
            random_state=self.random_state,
        )
        self.final_labels_ = spectral_clustering.fit_predict(co_association_matrix)
        self.labels_ = self.final_labels_

        return self

    def predict(self, X=None):
        """Return the consensus labels computed by the last ``fit`` call.

        ``X`` is ignored; no labels are computed for new samples.
        """
        return self.final_labels_

    def fit_predict(self, X, y=None):
        """Fit on ``X`` and return the consensus labels."""
        return self.fit(X, y).final_labels_
        
# Example usage.
# NOTE: ``X`` must already hold the feature matrix before this cell is run; it
# is not defined anywhere above this point in the file.
# Each entry of ``model_funcs`` is wrapped in a small factory that forwards
# ``n_clusters`` and ``random_state`` by keyword, because the scikit-learn
# estimators only accept ``random_state`` as a keyword argument.
consensus_model = ConsensusClustering(
    n_clusters=3,
    model_funcs=[
        lambda n_clusters, random_state: KMeans(n_clusters=n_clusters, random_state=random_state),
        lambda n_clusters, random_state: SpectralClustering(n_clusters=n_clusters, random_state=random_state),
    ],
)
consensus_model.fit(X)
labels = consensus_model.predict()

In [None]:


# Experiment driver: evaluates every model on every CSV dataset found in the
# Google Drive folder and stores the results as JSON.

# Mount Google Drive
drive.mount('/content/gdrive', force_remount=False)

# Set the path to the 'Masters_data' folder in your Google Drive
base_path = '/content/gdrive/MyDrive/'
folder_name = 'Masters_data'
folder_path = os.path.join(base_path, folder_name)

# Define the models
models = {
    'KMeans': {
        'id': 1,
        'function': KMeans(n_clusters=3)
    },
    'DBSCAN': {
        'id': 2,
        'function': DBSCAN(eps=0.5)
    },
    'SpectralClustering': {
        'id': 3,
        'function': SpectralClustering(n_clusters=3)
    },
    'GaussianMixture': {
        'id': 4,
        'function': GaussianMixture(n_components=3)
    }
}


def _json_default(value):
    """Convert NumPy values that ``json`` cannot serialise on its own.

    ``json`` only calls this for objects it does not know how to encode,
    e.g. ``np.int64`` scalars or arrays inside the evaluation results.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# The aggregate file collects the results of every (dataset, model) pair.
# Load whatever an earlier session left behind once, then extend it.
final_results_file_path = os.path.join(folder_path, 'final_evaluation_results.json')
if os.path.exists(final_results_file_path):
    with open(final_results_file_path, 'r', encoding='utf-8') as f:
        final_results = json.load(f)
else:
    final_results = []

# Find all CSV files in the folder (sorted so the processing order is stable)
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Loop through all CSV files
for csv_file in csv_files:
    # Use the CSV name as the ID of the dataset
    dataset_id = os.path.splitext(os.path.basename(csv_file))[0]

    # Load the dataset
    df = pd.read_csv(csv_file)

    # Extract features and true labels
    X = df.drop(columns='label').values
    y_true = df['label'].values

    # Loop through each model
    for model_name, model_info in models.items():
        model = model_info['function']

        # Evaluate the model on the dataset
        results = evaluate_clustering(algorithm=model, dataset=X, true_labels=y_true, n_runs=10)

        # Add dataset and model information to the results
        results['dataset_id'] = dataset_id
        results['model_id'] = model_info['id']
        results['model'] = model_name

        # Write the results to a new file
        results_file_path = os.path.join(folder_path, f'results_{dataset_id}_{model_info["id"]}.json')
        with open(results_file_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, default=_json_default)

        # Append the results to the final_evaluation_results file.  It is
        # rewritten after every evaluation so finished work survives a later
        # failure.
        final_results.append(results)

        with open(final_results_file_path, 'w', encoding='utf-8') as f:
            json.dump(final_results, f, default=_json_default)
