In [1]:
import pandas as pd
import numpy as np
import math

In [11]:
# Load the space-separated, headerless label file into two parallel lists.
# t = ground truth
# c = clusters from algo
filepath = './data/sample1.txt'
df = pd.read_csv(
    filepath,
    names=['y', 'y_hat'],
    header=None,
    sep=' ',
)
t, c = df['y'].tolist(), df['y_hat'].tolist()

df.head()

Unnamed: 0,y,y_hat
0,2,1
1,0,2
2,2,0
3,1,1
4,2,2


In [12]:
# calculating nmi

def getClustering(cluster_list):
    """Group object positions by the cluster label assigned to them.

    Args:
        cluster_list: sequence where element ``k`` is the label of object ``k``.

    Returns:
        dict mapping each label to the list of positions carrying that label,
        in ascending position order. Keys appear in first-seen order.
    """
    members_by_label = {}
    for position, label in enumerate(cluster_list):
        members_by_label.setdefault(label, []).append(position)
    return members_by_label

def marginalProb(clustering, n):
    """Return the fraction of objects in each cluster.

    Args:
        clustering: dict mapping a cluster label to the list of its members.
        n: total number of objects used as the denominator.

    Returns:
        list of ``len(members) / n`` values, ordered by ascending cluster label.
    """
    return [len(clustering[label]) / n for label in sorted(clustering)]

def entropy(marginal_prob):
    """Compute the Shannon entropy (natural log) of a probability vector.

    Args:
        marginal_prob: iterable of probabilities.

    Returns:
        ``-sum(p * ln(p))`` over the entries. Zero entries are skipped,
        following the convention that ``0 * ln(0)`` contributes nothing;
        without this, ``math.log(0)`` would raise.

    Raises:
        ValueError: if an entry is negative (``math.log`` domain error).
    """
    return -sum(p * math.log(p) for p in marginal_prob if p != 0)
    
# Build both partitions (label -> member positions) and their marginals.
n = len(c)
c_clustering = getClustering(c)
t_clustering = getClustering(t)
c_mprob = marginalProb(c_clustering, n)
t_mprob = marginalProb(t_clustering, n)

# calculating mutual info
# Marginals are looked up by label here. c_mprob / t_mprob are positional
# lists ordered by sorted label, so indexing them with the label itself is
# only correct when the labels happen to be exactly 0..k-1.
c_prob_by_label = {label: len(members) / n for label, members in c_clustering.items()}
t_prob_by_label = {label: len(members) / n for label, members in t_clustering.items()}

# Member sets are built once rather than once per (i, j) pair.
c_members = {label: set(members) for label, members in c_clustering.items()}
t_members = {label: set(members) for label, members in t_clustering.items()}

mi_ea_cluster = []
nijs = []  # overlap size of every (c cluster, t cluster) pair, zeros included
for i in c_clustering.keys():
    for j in t_clustering.keys():
        nij = len(c_members[i] & t_members[j])
        nijs.append(nij)
        pij = nij/n
        if pij != 0:
            mi_ea_cluster.append(pij * math.log(pij / (c_prob_by_label[i]*t_prob_by_label[j])))
mutualInfo = sum(mi_ea_cluster)

# Normalise by the geometric mean of the two entropies.
normalizer = math.sqrt(entropy(c_mprob) * entropy(t_mprob))
if normalizer == 0:
    # A partition with at most one cluster has zero entropy, making the ratio 0/0.
    # Two trivial partitions are treated as a perfect match; otherwise the
    # mutual information is zero and so is the score.
    nmi = 1.0 if len(c_clustering) <= 1 and len(t_clustering) <= 1 else 0.0
else:
    nmi = mutualInfo / normalizer

nmi

0.8488486176845911

In [13]:
# calculating jaccard coeff
# Pair counting over unordered pairs of objects:
#   tp = pairs grouped together in both partitions
#   fn = pairs together in t but not in c
#   fp = pairs together in c but not in t
#   tn = pairs separated in both partitions

# sum_ij C(nij, 2) == (sum_ij nij^2 - n) / 2, because the nij sum to n.
nij_sq = [x**2 for x in nijs]
tp = 1/2 * (sum(nij_sq) - n)

same_t_pairs = sum(math.comb(len(members), 2) for members in t_clustering.values())
fn = same_t_pairs - tp

same_c_pairs = sum(math.comb(len(members), 2) for members in c_clustering.values())
fp = same_c_pairs - tp

# The four counts partition all C(n, 2) object pairs, not the n objects.
tn = math.comb(n, 2) - (tp+fn+fp)

jaccard = tp/(tp+fn+fp)
jaccard

0.8769590495449949

In [14]:
# Emit both scores on one line, each formatted to 3 decimal places
# and separated by a single space.
# Order: NMI Jaccard
print(" ".join(format(score, ".3f") for score in (nmi, jaccard)))

0.849 0.877
