In [3]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, rand_score, homogeneity_score, fowlkes_mallows_score, adjusted_mutual_info_score
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    The contingency matrix of ``y_true`` versus ``y_pred`` is built, the
    largest count in every column is taken, and the sum of those maxima is
    divided by the total number of counted samples.

    Parameters
    ----------
    y_true : array-like
        Reference labels, one per sample.
    y_pred : array-like
        Labels assigned by the clustering, one per sample.

    Returns
    -------
    Ratio of the summed per-column maxima to the total count.
    """
    # cross-tabulation of reference labels against assigned labels
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # best-represented count within each column of the table
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

In [4]:
# Benchmark name; it appears twice in the clustering directory layout below.
data = 'trotr'
# NOTE(review): absolute, machine-specific paths. They are kept unchanged so
# existing runs behave the same; consider a configurable base directory.
data_path = f'C:/Users/Francesco Periti/Downloads/{data}/{data}'
gold_path = 'C:/Users/Francesco Periti/Downloads/data/my_clusters'

# Metric name -> scoring function, called as f(gold_labels, predicted_labels).
# The insertion order fixes the column order of the resulting dataframe.
metric_functions = {
    'adjusted_rand_index': adjusted_rand_score,
    'rand_index': rand_score,
    'normalized_mutual_info': normalized_mutual_info_score,
    'adjusted_mutual_info': adjusted_mutual_info_score,
    'purity': purity_score,
    'homogeneity': homogeneity_score,
    'fowlkes_mallows': fowlkes_mallows_score,
}

# list of targets (one gold-label file per entry of the gold directory)
targets = sorted(os.listdir(gold_path))

# wrapper for our stats dataframe: one record per clustering configuration
stats = list()
for clustering in sorted(os.listdir(data_path)):
    # the 'data' entry is skipped; every other entry is treated as a clustering
    if clustering == 'data': continue

    # per-target scores, keyed by metric name
    metrics_dict = defaultdict(list)
    for target in targets:
        # clustering result
        df_clustering = pd.read_csv(f'{data_path}/{clustering}/clusters/{target}', sep='\t').sort_values('identifier')

        # gold label
        df_gold = pd.read_csv(f'{gold_path}/{target}', sep='\t').sort_values('identifier')

        # select only annotated target usages
        df_clustering = df_clustering[df_clustering.identifier.isin(df_gold.identifier.values)]

        # The labels are compared position by position, so both frames must
        # list exactly the same identifiers in the same order. Without this
        # check a gold usage with no prediction surfaces as an obscure length
        # error inside the metric, and duplicated identifiers can pair the
        # wrong rows without any error at all.
        if not np.array_equal(df_gold.identifier.values, df_clustering.identifier.values):
            raise ValueError(
                f'Gold and predicted usages are not aligned for target {target!r} '
                f'in clustering {clustering!r}: {len(df_gold)} gold vs '
                f'{len(df_clustering)} predicted identifiers after filtering'
            )

        gold_labels = df_gold.cluster.values
        predicted_labels = df_clustering.cluster.values

        # compute metrics
        for metric_name, score in metric_functions.items():
            metrics_dict[metric_name].append(score(gold_labels, predicted_labels))

    # store metrics: average each metric over all targets
    record = dict(clustering=clustering)
    record.update({m: np.mean(metrics_dict[m]) for m in metrics_dict})
    stats.append(record)

# convert metrics to dataframe
df = pd.DataFrame(stats)

# store dataframe, best adjusted Rand index first (the row index is written
# as the first, unnamed column because to_csv keeps the index by default)
df.sort_values('adjusted_rand_index', ascending=False).to_csv('C:/Users/Francesco Periti/Downloads/clustering_stats.tsv', sep='\t')

In [5]:
# Display the per-clustering scores averaged over all targets.
df

Unnamed: 0,clustering,adjusted_rand_index,rand_index,normalized_mutual_info,adjusted_mutual_info,purity,homogeneity,fowlkes_mallows
0,correlation_1.9,0.26365,0.595609,0.371145,0.287476,0.676156,0.339871,0.644547
1,correlation_2.0,0.223357,0.596789,0.378797,0.2481,0.69264,0.378455,0.596755
2,correlation_2.1,0.202401,0.599575,0.378439,0.229021,0.706215,0.393654,0.569488
3,correlation_2.2,0.224995,0.62209,0.403331,0.243981,0.743989,0.438517,0.577642
4,correlation_2.3,0.163397,0.625613,0.367742,0.205053,0.762692,0.500007,0.564096
5,correlation_2.4,0.162383,0.639737,0.386308,0.20798,0.774155,0.51995,0.54108
6,correlation_2.5,0.194044,0.665042,0.431451,0.237576,0.802863,0.580874,0.55201
7,correlation_2.6,0.180819,0.656635,0.437961,0.232715,0.812516,0.599775,0.526759
8,correlation_2.7,0.168893,0.657789,0.42906,0.218893,0.808243,0.592052,0.510305
9,correlation_2.8,0.15561,0.655304,0.414364,0.195149,0.803718,0.584013,0.490508
