In [1]:
import h5py
import numpy as np
import random
import matplotlib.pyplot as plt

from scipy import sparse
from sklearn.metrics import silhouette_score, v_measure_score
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sknetwork.clustering import Louvain  # search for scikit-network when trying to install

In [2]:
FEATURE_TYPES = ['pca', 'umap']
REPRESENTATIONS = ['pge', 'resnet50', 'inceptionv3', 'vgg16']

# Load Data Content
pge_path = 'colon_nct_feature/pge_dim_reduced_feature.h5'
resnet50_path = 'colon_nct_feature/resnet50_dim_reduced_feature.h5'
inceptionv3_path = 'colon_nct_feature/inceptionv3_dim_reduced_feature.h5'
vgg16_path = 'colon_nct_feature/vgg16_dim_reduced_feature.h5'

pge_content = h5py.File(pge_path, mode='r')
resnet50_content = h5py.File(resnet50_path, mode='r')
inceptionv3_content = h5py.File(inceptionv3_path, mode='r')
vgg16_content = h5py.File(vgg16_path, mode='r')

data_content = {
    'pge': pge_content,
    'resnet50': resnet50_content,
    'inceptionv3': inceptionv3_content,
    'vgg16': vgg16_content
}

In [3]:
# Feature Extraction
pge_pca_feature = data_content['pge']['pca_feature'][...]
resnet50_pca_feature = data_content['resnet50']['pca_feature'][...]
inceptionv3_pca_feature = data_content['inceptionv3']['pca_feature'][...]
vgg16_pca_feature = data_content['vgg16']['pca_feature'][...]

pge_umap_feature = data_content['pge']['umap_feature'][...]
resnet50_umap_feature = data_content['resnet50']['umap_feature'][...]
inceptionv3_umap_feature = data_content['inceptionv3']['umap_feature'][...]
vgg16_umap_feature = data_content['vgg16']['umap_feature'][...]

In [None]:
def get_labels(self, representation):
        # tissue type as available ground-truth: labels
        filename = np.squeeze(self.data_content[representation]['file_name'])
        filename = np.array([str(x) for x in filename])
        labels = np.array([x.split('/')[2] for x in filename])
        return labels

In [4]:
# Dataset Preparation
datasets = {}
for representation in REPRESENTATIONS:
     for feature_type in FEATURE_TYPES:
        if feature_type == 'pca':
            datasets[representation] = {
                'pca': data_content[representation]['pca_feature'][...],
                'label': data_content[representation]['label'][...]
            }
        elif feature_type == 'umap':
            datasets[representation] = {
                'umap': data_content[representation]['umap_feature'][...],
                'label': data_content[representation]['label'][...]
            }
        else:
          datasets[representation] = {}

KeyError: "Unable to open object (object 'label' doesn't exist)"

In [None]:
# Populate datasets dictionary
# for representation in REPRESENTATIONS

In [None]:
# Labels Extraction
pge_filename = np.squeeze(data_content['pge']['file_name'])
pge_filename = np.array([str(x) for x in pge_filename])
pge_labels = np.array([x.split('/')[2] for x in pge_filename])

resnet50_filename = np.squeeze(data_content['resnet50']['file_name'])
resnet50_filename = np.array([str(x) for x in resnet50_filename])
resnet50_labels = np.array([x.split('/')[2] for x in resnet50_filename])

inceptionv3_filename = np.squeeze(data_content['inceptionv3']['file_name'])
inceptionv3_filename = np.array([str(x) for x in inceptionv3_filename])
inceptionv3_labels = np.array([x.split('/')[2] for x in inceptionv3_filename])

vgg16_filename = np.squeeze(data_content['vgg16']['file_name'])
vgg16_filename = np.array([str(x) for x in vgg16_filename])
vgg16_labels = np.array([x.split('/')[2] for x in vgg16_filename])

In [None]:
# Testing Data
random.seed(0)
selected_index = random.sample(list(np.arange(len(pge_pca_feature))), 200)
pge_test_data = pge_pca_feature[selected_index]
pge_test_label = pge_labels[selected_index]

selected_index = random.sample(list(np.arange(len(resnet50_pca_feature))), 200)
resnet50_test_data = resnet50_pca_feature[selected_index]
resnet50_test_label = resnet50_labels[selected_index]

selected_index = random.sample(list(np.arange(len(inceptionv3_pca_feature))), 200)
inceptionv3_test_data = inceptionv3_pca_feature[selected_index]
inceptionv3_test_label = inceptionv3_labels[selected_index]

selected_index = random.sample(list(np.arange(len(vgg16_pca_feature))), 200)
vgg16_test_data = vgg16_pca_feature[selected_index]
vgg16_test_label = vgg16_labels[selected_index]

In [None]:
# Louvain Clustering
def louvain_clustering(test_data, resolution=0.9, modularity='Newman', random_state=0):
    louvain_model = Louvain(resolution=resolution, modularity=modularity, random_state=random_state)
    adjacency_matrix = sparse.csr_matrix(MinMaxScaler().fit_transform(-pairwise_distances(test_data)))
    louvain_assignment = louvain_model.fit_transform(adjacency_matrix)
    labels = louvain_model.fit_predict(adjacency_matrix)
    # Check if there is at least two clusters
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        print("Skipping clustering as it resulted in less than two clusters.")
        return None, None
    return louvain_assignment, labels

In [None]:
from scipy import sparse
from sklearn.metrics import silhouette_score, v_measure_score
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sknetwork.clustering import Louvain  # search for scikit-network when trying to install

def get_labels(data_content, representation):
    filename = np.squeeze(data_content[representation]['file_name'])
    filename = np.array([str(x) for x in filename])
    labels = np.array([x.split('/')[2] for x in filename])
    return labels



# Clustering each dataset using Louvain
clustered_datasets = {}
for representation in REPRESENTATIONS:
    clustered_datasets[representation] = {}
    for feature_type in FEATURE_TYPES:
        dataset = datasets[representation][feature_type]
        clusters = louvain_clustering(dataset)
        clustered_datasets[representation][feature_type] = clusters
        print(dataset.shape)
        print(clusters)
        print(f"Clustering for {representation} - {feature_type} completed.")


# Evaluation
for representation in REPRESENTATIONS:
    for feature_type in FEATURE_TYPES:
        clusters = clustered_datasets[representation][feature_type]
        dataset = datasets[feature_type][representation]
        
        # Silhouette Score
        silhouette_avg = silhouette_score(dataset, clusters)
        print(f"Silhouette Score for {representation} - {feature_type}: {silhouette_avg}")

        # Vmeasure Score
        labels = get_labels(data_content[representation], representation)
        vmeasure = v_measure_score(labels, clusters)
        print(f"Vmeasure Score for {representation} - {feature_type}: {vmeasure}")

        # Visualization (assuming 2D data for simplicity)
        pca = PCA(n_components=2)
        umap_model = umap.UMAP(n_components=2)
        reduced_data = pca.fit_transform(dataset)
        umap_data = umap_model.fit_transform(dataset)

        plt.figure(figsize=(12, 4))
        plt.subplot(1, 3, 1)
        plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', s=10)
        plt.title('PCA Clusters')

        plt.subplot(1, 3, 2)
        plt.scatter(umap_data[:, 0], umap_data[:, 1], c=clusters, cmap='viridis', s=10)
        plt.title('UMAP Clusters')

        plt.subplot(1, 3, 3)
        plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='viridis', s=10)
        plt.title('Ground Truth')

        plt.suptitle(f'Clustering Visualization - {representation} - {feature_type}')
        plt.show()

KeyError: 'pca'

In [None]:
import h5py
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, v_measure_score
from sklearn.preprocessing import MinMaxScaler
from sknetwork.clustering import Louvain
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from scipy import sparse

# Load data content
def load_data_content():
    pge_path = 'colon_nct_feature/pge_dim_reduced_feature.h5'
    vgg16_path = 'colon_nct_feature/vgg16_dim_reduced_feature.h5'

    pge_content = h5py.File(pge_path, mode='r')
    vgg16_content = h5py.File(vgg16_path, mode='r')

    return {
        'pge': pge_content,
        'vgg16': vgg16_content
    }

data_content = load_data_content()

def get_labels(data_content, representation):
    filename = np.squeeze(data_content[representation]['file_name'])
    filename = np.array([str(x) for x in filename])
    labels = np.array([x.split('/')[2] for x in filename])
    return labels




# Visualize Clusters
def visualize_clusters(data, clusters, title):
    pca = PCA(n_components=2)
    umap_model = TSNE(n_components=2)
    reduced_data = pca.fit_transform(data)
    umap_data = umap_model.fit_transform(data)

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 3, 1)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', s=10)
    plt.title('PCA Clusters')

    plt.subplot(1, 3, 2)
    plt.scatter(umap_data[:, 0], umap_data[:, 1], c=clusters, cmap='viridis', s=10)
    plt.title('UMAP Clusters')

    plt.subplot(1, 3, 3)
    plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', s=10)
    plt.title('Ground Truth')

    plt.suptitle(title)
    plt.show()

# Analyze Louvain Clustering Performance for 'pge'
pge_data = data_content['pge']['umap_feature'][...]
pge_labels = get_labels(data_content, 'pge')

best_silhouette = -1
best_resolution = 0

for resolution in np.arange(0.9, 1.0, 0.8):
    _, clusters = louvain_clustering(pge_data, resolution=resolution)
    silhouette_avg = silhouette_score(pge_data, clusters)
    #if clusters is None:
     #   continue

    print(f"Resolution: {resolution}, Silhouette Score: {silhouette_avg}")

    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_resolution = resolution

# Visualize the best-performing clustering for 'pge'
best_pge_clusters = louvain_clustering(pge_data, resolution=best_resolution)[1]
visualize_clusters(pge_data, best_pge_clusters, f'Best Louvain Clustering for PGE (Resolution={best_resolution})')

# Analyze Louvain Clustering Performance for 'vgg16'
vgg_data = data_content['vgg16']['umap_feature'][...]
vgg_labels = get_labels(data_content, 'vgg16')

best_silhouette = -1
best_resolution = 0

for resolution in np.arange(0.9, 1.0, 0.8):
    _, clusters = louvain_clustering(vgg_data, resolution=resolution)
    silhouette_avg = silhouette_score(vgg_data, clusters)
    #if clusters is None:
     #   continue

    print(f"Resolution: {resolution}, Silhouette Score: {silhouette_avg}")

    if silhouette_avg > best_silhouette:
        best_silhouette = silhouette_avg
        best_resolution = resolution

# Visualize the best-performing clustering for 'vgg16'
best_vgg_clusters = louvain_clustering(vgg_data, resolution=best_resolution)[1]
visualize_clusters(vgg_data, best_vgg_clusters, f'Best Louvain Clustering for VGG16 (Resolution={best_resolution})')
