In [None]:

# Dataset identifiers of the form "<scaler>-<imputer>": every enabled scaler
# is paired with every imputer, giving an unordered set of 24 names.
datasets = {
    f"{scaler}-{imputer}"
    for imputer in (
        "KNN",
        "softImputerRegressor",
        "iterativeImputerRegressor",
        "simpleRegressor",
    )
    for scaler in (
        "standardScaler",
        "minmaxScaler",
        "robustScaler",
        "maxabsScaler",
        # "yeoJohnsonScaler" is disabled for every imputer
        "unitVectorScaler",
        "mquantileTransformScaler",
    )
}


# Names of the dimensionality-reduction techniques, held as an unordered set
# of strings (listed alphabetically here; a set keeps no order).
dim_reduction_techniques = set([
    'FactorAnalysis',
    'FastICA',
    'Isomap',
    'LocallyLinearEmbedding',
    'MDS',
    'PCA',
    'TruncatedSVD',
    'UMAP',
    't-SNE',
])

In [None]:
# ML for clustering
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, OPTICS, SpectralClustering, MeanShift, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.mixture import GaussianMixture

# Function to prepend a string to each row
def prepend_string(row, prefix):
    """Return ``prefix`` followed by the string form of ``row``.

    ``row`` is converted with ``str``; ``prefix`` is concatenated as-is, so it
    must itself support ``+`` with a string.
    """
    row_text = str(row)
    return prefix + row_text


def kmeans_AL(df, n_clusters=3, random_state=None):
    """Cluster the rows of ``df`` with K-Means and attach the labels to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``KMeans.fit``.
    n_clusters : int, default 3
        Number of clusters requested from ``KMeans``.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. ``None`` leaves the run unseeded, which
        is what this function did before the parameter existed; pass an int
        to make the labels reproducible.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with a ``labels`` column and a ``labels_tagged`` column
        ("Group <label>"), ``labels`` is the fitted label array,
        ``unique_labels`` is ``np.unique(labels)`` and ``num_clusters`` is its
        length.
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    kmeans.fit(data)
    labels = kmeans.labels_
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)

def agglomerative_clustering_AL(df, n_clusters=3):
    """Cluster the rows of ``df`` with agglomerative clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``fit_predict``.
    n_clusters : int, default 3
        Number of clusters requested from ``AgglomerativeClustering``.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with ``labels`` and ``labels_tagged`` ("Group <label>")
        columns, ``unique_labels`` is ``np.unique(labels)`` and
        ``num_clusters`` is its length.
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
    labels = hierarchical.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)

def DBSCAN_AL(df, eps=0.3, min_samples=5):
    """Cluster the rows of ``df`` with DBSCAN and attach the labels to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``fit_predict``.
    eps : float, default 0.3
        Neighbourhood radius forwarded to ``DBSCAN``. The useful value depends
        on the scale of the features, so it is exposed rather than fixed.
    min_samples : int, default 5
        Forwarded to ``DBSCAN``.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with ``labels`` and ``labels_tagged`` ("Group <label>")
        columns, ``unique_labels`` is ``np.unique(labels)`` and
        ``num_clusters`` is its length.

    Notes
    -----
    scikit-learn's DBSCAN gives noise points the label -1. That label is not
    filtered out here, so when noise is present it is counted in
    ``num_clusters`` and tagged "Group -1".
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)

def mean_shift(df):
    """Cluster the rows of ``df`` with ``MeanShift()`` (no arguments passed).

    Returns ``(data, labels, unique_labels, num_clusters)``: a copy of ``df``
    extended with ``labels`` and ``labels_tagged`` ("Group <label>") columns,
    the label array, its unique values, and how many unique values there are.
    """
    clustered = df.copy()
    assignments = MeanShift().fit_predict(clustered)
    clustered['labels'] = assignments
    clustered['labels_tagged'] = clustered['labels'].apply(prepend_string, args=("Group ",))
    distinct = np.unique(assignments)
    return clustered, assignments, distinct, len(distinct)
    
def gaussian_AL(df, n_components=3, random_state=None):
    """Cluster the rows of ``df`` with a Gaussian mixture model.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``fit_predict``.
    n_components : int, default 3
        Number of mixture components requested from ``GaussianMixture``.
    random_state : int or None, default None
        Seed forwarded to ``GaussianMixture``. ``None`` leaves the run
        unseeded, as before; pass an int to make the labels reproducible.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with ``labels`` and ``labels_tagged`` ("Group <label>")
        columns, ``unique_labels`` is ``np.unique(labels)`` and
        ``num_clusters`` is its length.
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    gmm = GaussianMixture(n_components=n_components, random_state=random_state)
    labels = gmm.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)

def spectral_AL(df, n_clusters=3, random_state=None):
    """Cluster the rows of ``df`` with spectral clustering.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``fit_predict``.
    n_clusters : int, default 3
        Number of clusters requested from ``SpectralClustering``.
    random_state : int or None, default None
        Seed forwarded to ``SpectralClustering``. ``None`` leaves the run
        unseeded, as before; pass an int to make the labels reproducible.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with ``labels`` and ``labels_tagged`` ("Group <label>")
        columns, ``unique_labels`` is ``np.unique(labels)`` and
        ``num_clusters`` is its length.
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    spectral = SpectralClustering(n_clusters=n_clusters, random_state=random_state)
    labels = spectral.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)

def optics_AL(df, min_samples=5, xi=0.05):
    """Cluster the rows of ``df`` with OPTICS and attach the labels to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; all of its columns are passed to ``fit_predict``.
    min_samples : int, default 5
        Forwarded to ``OPTICS``.
    xi : float, default 0.05
        Forwarded to ``OPTICS``.

    Returns
    -------
    tuple
        ``(data, labels, unique_labels, num_clusters)`` where ``data`` is a
        copy of ``df`` with ``labels`` and ``labels_tagged`` ("Group <label>")
        columns, ``unique_labels`` is ``np.unique(labels)`` and
        ``num_clusters`` is its length.

    Notes
    -----
    scikit-learn's OPTICS gives noise points the label -1. That label is not
    filtered out here, so when noise is present it is counted in
    ``num_clusters`` and tagged "Group -1".
    """
    # Work on a copy so the caller's frame is not modified
    data = df.copy()
    optics = OPTICS(min_samples=min_samples, xi=xi)
    labels = optics.fit_predict(data)
    data['labels'] = labels
    data['labels_tagged'] = data['labels'].apply(prepend_string, args=("Group ",))
    unique_labels = np.unique(labels)
    return data, labels, unique_labels, len(unique_labels)




def create_dir(directory_path):
    """Create ``directory_path`` (with any missing parents) if it is absent.

    Prints whether the directory was created or was already present. Nothing
    is returned.
    """
    import os

    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists.")
        return

    # exist_ok=True closes the gap between the check above and the creation:
    # if something else creates the directory in between, this no longer
    # raises FileExistsError.
    os.makedirs(directory_path, exist_ok=True)
    print(f"Directory '{directory_path}' created successfully.")


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import random
import altair as alt

# Initialize ML techniques
# Maps a display name to the name of the notebook-level clustering function to
# run; the function object is looked up by that name inside the loop below.
ML_techniques = {
    'kmeans_AL': 'kmeans_AL',
    'DBSCAN_AL': 'DBSCAN_AL',
    'mean_shift': 'mean_shift',
    'gaussian_AL': 'gaussian_AL',
    'optics_AL': 'optics_AL',
    'agglomerative_clustering_AL': 'agglomerative_clustering_AL',
}

charts = []  # nothing is appended to this list in this cell

# Define specific colors for each label (the same palette for every chart)
color_tag = ['#93C4F6', '#005EB8', '#D9DE84', '#636B05']

# Coordinates the clustering is scored on: the same two columns that are
# plotted. The 'labels' column is deliberately left out; scoring a clustering
# on features that contain its own labels inflates every metric.
metric_columns = ["PCA1", "PCA2"]

for i, dataset in enumerate(datasets):
    # NOTE(review): `dm` is not used anywhere below, so each pass over
    # dim_reduction_techniques repeats the same work and overwrites the same
    # output files. Confirm whether the input should be selected per technique.
    for dm in dim_reduction_techniques:
        for technique_name, technique in ML_techniques.items():
            chart_dir = "cluster_charts/" + dataset + "_" + technique_name + "/charts/"
            dataset_dir = "cluster_datasets/" + dataset + "_" + technique_name + "/datasets/"
            create_dir(chart_dir)
            create_dir(dataset_dir)

            # Resolve the clustering function by name without eval().
            cluster_fn = globals()[technique]

            # NOTE(review): `datasets` is subscripted by name here, so at this
            # point it has to be a mapping from dataset name to a DataFrame
            # containing PCA1/PCA2 columns. A plain set of names cannot be
            # indexed - confirm where that mapping is built.
            clustered_data, labels, comp, compp = cluster_fn(datasets[dataset])
            print(compp)

            # The metrics and the chart are produced only when more than one
            # distinct label came back.
            if compp <= 1:
                continue

            # One legend entry per distinct tag, in sorted order.
            color_scale = alt.Scale(
                domain=np.unique(clustered_data["labels_tagged"]).tolist(),
                range=color_tag,
            )
            clustered_data.to_csv(dataset_dir + technique_name + '_clustering.csv')

            # Evaluate clustering using different metrics
            features = clustered_data[metric_columns]
            silhouette = silhouette_score(features, labels)
            db_index = davies_bouldin_score(features, labels)
            ch_score = calinski_harabasz_score(features, labels)

            # Plot the data points, coloured by cluster tag
            chart = alt.Chart(clustered_data).mark_point().encode(
                x='PCA1:Q',
                y='PCA2:Q',
                color=alt.Color('labels_tagged:N', scale=color_scale, legend=alt.Legend(title="Clusters")),  # Use the defined color scale
                tooltip=['PCA1', 'PCA2', 'labels_tagged'],  # Add tooltip information
            ).properties(title=f'{technique_name} on {dataset.replace("_", " ")}')

            chart.save(chart_dir + dataset + 'HIGH.png', engine="vl-convert", ppi=200, format='png')

            # Print the evaluation results
            print(f"Silhouette Score: {silhouette}")
            print(f"Davies-Bouldin Index: {db_index}")
            print(f"Calinski-Harabasz Index: {ch_score}")