In [48]:
# Data manipulation and computation
import numpy as np
import pandas as pd
import h5py
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, pairwise_distances, v_measure_score
from sknetwork.clustering import Louvain
from scipy import sparse
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the dimension-reduced feature files (HDF5), one per feature extractor.
pge_path, vgg16_path = (
    'colon_nct_feature/pge_dim_reduced_feature.h5',
    'colon_nct_feature/vgg16_dim_reduced_feature.h5',
)

# Load and normalize data
def load_and_normalize_data(path, key):
    """Read one dataset from an HDF5 file and rescale it with ``MinMaxScaler``.

    Args:
        path: Location of the HDF5 file; it is opened read-only.
        key: Name of the dataset to read from the file.

    Returns:
        The result of ``MinMaxScaler().fit_transform`` on the full dataset.
    """
    scaler = MinMaxScaler()
    with h5py.File(path, 'r') as source:
        # ``[...]`` copies the whole dataset into memory, so the array stays
        # usable after the file handle is closed.
        raw_features = source[key][...]
    return scaler.fit_transform(raw_features)

# (result key, source file, dataset name inside the file) for every feature set.
dataset_sources = [
    ('pge_pca', pge_path, 'pca_feature'),
    ('pge_umap', pge_path, 'umap_feature'),
    ('vgg16_pca', vgg16_path, 'pca_feature'),
    ('vgg16_umap', vgg16_path, 'umap_feature'),
]

# Normalized feature matrix for each entry above, in the same order.
datasets = {
    name: load_and_normalize_data(feature_path, feature_key)
    for name, feature_path, feature_key in dataset_sources
}

In [49]:
from sklearn.metrics import euclidean_distances

def create_similarity_matrix(data, sigma=None):
    """Build a Gaussian-kernel similarity matrix from pairwise distances.

    Each entry is ``exp(-d_ij ** 2 / (2 * sigma ** 2))``, where ``d_ij`` is the
    value ``pairwise_distances`` returns for samples ``i`` and ``j``.

    Note that with the default bandwidth no distance exceeds ``sigma``, so
    every similarity is at least ``exp(-0.5)`` and the resulting graph is fully
    connected with low contrast between weights. Pass a smaller ``sigma`` to
    sharpen the kernel.

    Args:
        data: Sample matrix accepted by ``pairwise_distances``.
        sigma: Kernel bandwidth. ``None`` (the default) uses the largest
            pairwise distance, which is what this function always did before
            the parameter existed.

    Returns:
        A ``scipy.sparse.csr_matrix`` holding the pairwise similarities.

    Raises:
        ValueError: If ``sigma`` is given explicitly and is not positive.
    """
    distances = pairwise_distances(data)

    if sigma is None:
        sigma = np.max(distances)
    elif sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    if sigma == 0:
        # Only reachable with the default bandwidth when all samples coincide:
        # every distance is 0, so every similarity is exp(0) = 1. Computing the
        # kernel directly would evaluate 0 / 0 and fill the matrix with NaN.
        similarities = np.ones_like(distances, dtype=float)
    else:
        similarities = np.exp(-np.square(distances) / (2.0 * sigma ** 2))

    return sparse.csr_matrix(similarities)

def create_adjacency_matrix(data):
    """Return the negated pairwise-distance matrix of ``data`` in CSR format.

    NOTE(review): distances are non-negative, so every stored weight is <= 0.
    Confirm that whatever consumes this matrix accepts non-positive edge
    weights before using it as a graph adjacency.
    """
    negated_distances = np.negative(pairwise_distances(data))
    return sparse.csr_matrix(negated_distances)

def extract_labels(h5_content):
    """Derive one class label per sample from the stored file paths.

    The label is the third ``'/'``-separated component of every entry of
    ``h5_content['file_name']``.

    Args:
        h5_content: Mapping-like object (for example an open ``h5py.File``)
            with a ``'file_name'`` entry holding path strings, stored either
            as ``str`` or as ``bytes``.

    Returns:
        A 1-D numpy string array with one label per file name.

    Raises:
        IndexError: If a path has fewer than three ``'/'``-separated components.
    """
    # ravel() rather than squeeze(): a single-entry dataset would squeeze down
    # to a 0-d array, which cannot be iterated.
    raw_names = np.asarray(h5_content['file_name']).ravel()

    # Stored strings may come back as bytes. str(b'a/b/c') produces "b'a/b/c'",
    # which corrupts the first and last path components, so decode instead.
    paths = [
        name.decode('utf-8') if isinstance(name, bytes) else str(name)
        for name in raw_names
    ]
    return np.array([path.split('/')[2] for path in paths], dtype=str)

# Extracting labels for each dataset.
# The files are opened in `with` blocks so the HDF5 handles are closed as soon
# as the labels have been copied into numpy arrays.
with h5py.File(pge_path, mode='r') as pge_file:
    pge_labels = extract_labels(pge_file)

with h5py.File(vgg16_path, mode='r') as vgg16_file:
    vgg16_labels = extract_labels(vgg16_file)

# Keyed by the prefix of the dataset names (the part before the first '_').
labels_dict = {
    'pge': pge_labels,
    'vgg16': vgg16_labels,
}

In [50]:
# Adjust the resolution values based on your tests for each dataset
resolutions = [0.9, 1, 0.8]
modularity_options = ['Dugue', 'Newman', 'Potts']

# One record per (dataset, resolution, modularity) combination.
louvain_results = []

for dataset_name, data in datasets.items():
    # Dataset names are '<prefix>_<suffix>'; the ground-truth labels are
    # looked up by the prefix.
    base_name = dataset_name.split('_')[0]
    true_labels = labels_dict[base_name]

    # The similarity graph depends only on the data, so build it once per
    # dataset instead of once per parameter combination.
    adjacency_matrix = create_similarity_matrix(data)

    for resolution in resolutions:
        for modularity in modularity_options:
            louvain_model = Louvain(resolution=resolution, modularity=modularity, random_state=0)
            labels = louvain_model.fit_predict(adjacency_matrix)

            num_clusters = len(np.unique(labels))
            print(f"{dataset_name} - Resolution: {resolution}, Modularity: {modularity}, Clusters: {num_clusters}")

            # Handling extreme cases: the scores are skipped when there is a
            # single cluster or one cluster per sample.
            if num_clusters == 1 or num_clusters == len(data):
                silhouette = None
                v_measure = None
            else:
                silhouette = silhouette_score(data, labels)
                v_measure = v_measure_score(true_labels, labels)

            louvain_results.append({
                'dataset': dataset_name,
                'resolution': resolution,
                'modularity': modularity,
                'silhouette_score': silhouette,
                'v_measure_score': v_measure,
                'num_clusters': num_clusters
            })

# Collect all records into a table, persist it, and show it.
results_df = pd.DataFrame(louvain_results)
results_df.to_csv('results_df.csv', index=False)
print(results_df)

pge_pca - Resolution: 0.9, Modularity: Dugue, Clusters: 1
pge_pca - Resolution: 0.9, Modularity: Newman, Clusters: 1
pge_pca - Resolution: 0.9, Modularity: Potts, Clusters: 11
pge_pca - Resolution: 1, Modularity: Dugue, Clusters: 4
pge_pca - Resolution: 1, Modularity: Newman, Clusters: 4
pge_pca - Resolution: 1, Modularity: Potts, Clusters: 742
pge_pca - Resolution: 0.8, Modularity: Dugue, Clusters: 1
pge_pca - Resolution: 0.8, Modularity: Newman, Clusters: 1
pge_pca - Resolution: 0.8, Modularity: Potts, Clusters: 1
pge_umap - Resolution: 0.9, Modularity: Dugue, Clusters: 1
pge_umap - Resolution: 0.9, Modularity: Newman, Clusters: 1


KeyboardInterrupt: 