In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Embeddings as stored on disk (presumably one row per response -- TODO confirm
# against whatever wrote the file).
E = np.load("response_embeddings.npy")

# Same data viewed as 375 groups of 20 vectors with 384 components each.
# reshape raises if the file does not hold exactly 375 * 20 * 384 values.
E_grouped = np.reshape(E, (375, 20, 384))

# write a function that computes and stores the cosine similarity matrices in an np array

def compute_cosine_matrices(e):
    """Return the pairwise cosine-similarity matrix of every group in ``e``.

    Parameters
    ----------
    e : array-like, shape (..., k, d)
        Groups of ``k`` vectors with ``d`` components. The notebook passes a
        3-D array (n_groups, k, d); a single 2-D group (k, d) also works.

    Returns
    -------
    np.ndarray, shape (..., k, k)
        Entry ``[..., i, j]`` is the cosine of the angle between vectors
        ``i`` and ``j`` of that group.

    Notes
    -----
    Rows are L2-normalised here, so the result is a true cosine matrix even
    when the input is not unit-norm (a plain ``e @ e.T`` is only a cosine
    matrix for unit-norm rows). All-zero rows are left as zeros, which gives
    them similarity 0 with everything instead of NaN.
    """
    e = np.asarray(e)
    norms = np.linalg.norm(e, axis=-1, keepdims=True)
    # Divide zero-norm rows by 1 so they stay zero rather than becoming NaN.
    unit = e / np.where(norms > 0, norms, 1.0)
    return unit @ np.swapaxes(unit, -1, -2)

# One (20 x 20) pairwise similarity matrix per group of E_grouped.
cosine_mats = compute_cosine_matrices(E_grouped)

In [2]:
# write a function that computes the clusters for the given response ensembles

def cluster_gen(e, tau=0.85):
    """Cluster each group of embeddings and return the label arrays.

    Every slice ``e[i]`` is clustered independently with average-linkage
    agglomerative clustering on cosine distance. The number of clusters is
    not fixed; merging is cut at a linkage distance of ``1 - tau``.

    Parameters
    ----------
    e : np.ndarray
        Array whose first axis indexes the groups; ``e[i]`` is the
        (n_samples, n_features) matrix handed to the clusterer.
    tau : float, default 0.85
        Similarity threshold; converted to the distance threshold ``1 - tau``.

    Returns
    -------
    list of np.ndarray
        One label array per group, in the same order as ``e``.
    """
    def _labels_for(group):
        # A fresh estimator per group, configured exactly the same each time.
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=1 - tau,
            metric="cosine",
            linkage="average",
        )
        return model.fit_predict(group)

    return [_labels_for(e[idx]) for idx in range(e.shape[0])]

# now compute and store the entropy

def semantic_entropy(labels):
    """Shannon entropy, in bits, of the empirical distribution of ``labels``.

    Parameters
    ----------
    labels : array-like
        Cluster label of each sample. Labels only need to be comparable;
        they do not have to be contiguous or non-negative.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the labels that actually occur. Returns
        0.0 for a single cluster and for empty input.

    Notes
    -----
    Occurrences are counted with ``np.unique`` rather than ``np.bincount``:
    ``bincount`` creates zero-count bins for unused label values, and a zero
    probability turns the sum into NaN via ``0 * log2(0)``.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        return np.float64(0.0)
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    # "+ 0.0" normalises the -0.0 produced for a single cluster to 0.0.
    return -(probs * np.log2(probs)).sum() + 0.0

# Cluster every group with the default threshold, then reduce each label
# array to a single entropy value.
clusters = cluster_gen(E_grouped)
H_sem = np.array(list(map(semantic_entropy, clusters)))

In [3]:
# compute the centroid cosine dispersion

def centroid_cosine_dispersion(e):
    """Mean cosine distance of each group's vectors from the group centroid.

    Parameters
    ----------
    e : array-like, shape (n_groups, k, d)
        ``k`` vectors of dimension ``d`` per group.

    Returns
    -------
    np.ndarray, shape (n_groups,)
        For each group, ``mean_i(1 - cos(e_i, centroid))`` where the centroid
        is the arithmetic mean of the group's vectors.

    Notes
    -----
    Both the vectors and the centroid are normalised inside the cosine, so
    the result is a genuine cosine distance even when the input rows are not
    unit-norm; for unit-norm rows it equals ``1 - e_i . centroid/|centroid|``.
    A cosine involving a zero-norm vector or centroid is taken to be 0
    (distance 1) instead of producing NaN.
    """
    e = np.asarray(e)
    centroids = e.mean(axis=1)                                   # (n, d)
    dots = np.einsum("nkd,nd->nk", e, centroids)                 # (n, k)
    # Product of |e_i| and |centroid| for every vector of every group.
    denom = np.linalg.norm(e, axis=-1) * np.linalg.norm(
        centroids, axis=-1, keepdims=True
    )
    # Entries with a zero denominator keep the pre-filled cosine of 0.
    cos = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)
    return np.mean(1 - cos, axis=1)

# One dispersion score per group (first axis of E_grouped).
dispersions = centroid_cosine_dispersion(E_grouped)

In [26]:
# now compute the mean Mahalanobis distance
# first fit the reference distribution

from sklearn.covariance import LedoitWolf

# NOTE(review): `df` is not defined in any cell visible here (pandas is
# imported but nothing is loaded), so this cell relies on kernel state and
# fails on Restart & Run All until the load step is added. Presumably `df`
# has one row per row of `E` and label 0 marks a correct response (going by
# the variable name) -- TODO confirm.
mask_correct = (df["label"] == 0).to_numpy()

# Reference set: the rows of E selected by the boolean mask.
E_ref = E[mask_correct]

# Reference mean, plus the precision (inverse covariance) matrix taken from a
# Ledoit-Wolf estimator fitted on the reference set.
mu_ref = E_ref.mean(axis=0)
lw = LedoitWolf().fit(E_ref)
Sigma_inv = lw.precision_

def mean_mahalanobis(e, mu, precision=None):
    """Mean Mahalanobis distance of each group's vectors from ``mu``.

    Parameters
    ----------
    e : array-like, shape (n_groups, k, d)
        ``k`` vectors of dimension ``d`` per group.
    mu : array-like, shape (d,)
        Reference mean subtracted from every vector.
    precision : array-like, shape (d, d), optional
        Precision (inverse covariance) matrix. Defaults to the notebook-level
        ``Sigma_inv`` so existing two-argument calls keep working.

    Returns
    -------
    np.ndarray, shape (n_groups,)
        For each group, the mean over its vectors of
        ``sqrt((x - mu)^T P (x - mu))``.
    """
    if precision is None:
        # Fall back to the precision matrix fitted earlier in the notebook.
        precision = Sigma_inv
    diffs = np.asarray(e) - mu
    # Full quadratic form. The two axes of the precision matrix need distinct
    # subscripts: writing "dd" makes einsum read only its diagonal and
    # silently drops every off-diagonal term.
    qf = np.einsum("nkd,df,nkf->nk", diffs, precision, diffs)
    # Round-off can push a value that should be ~0 slightly below zero.
    return np.mean(np.sqrt(np.maximum(qf, 0.0)), axis=1)

# Per-group mean distance of the grouped embeddings from the reference mean.
Mbar = mean_mahalanobis(E_grouped, mu_ref)

In [28]:
# now do the cluster counts and similarity variance

def cluster_count(clusters):
    """Count the distinct labels in each label array.

    Parameters
    ----------
    clusters : iterable of array-like
        One collection of cluster labels per group.

    Returns
    -------
    np.ndarray
        Number of unique labels for each entry, in input order.
    """
    n_unique = []
    for labels in clusters:
        n_unique.append(np.unique(labels).size)
    return np.array(n_unique)

# K[i] is the number of distinct cluster labels found for group i.
K = cluster_count(clusters)


def similarity_variance(cosine_mats):
    """Variance of the strictly-upper-triangular entries of each matrix.

    Parameters
    ----------
    cosine_mats : np.ndarray, shape (n_q, k, k)
        Stack of square similarity matrices.

    Returns
    -------
    np.ndarray, shape (n_q,), dtype float64
        Population variance (``np.var`` default) of the ``k * (k - 1) / 2``
        entries above the diagonal of each matrix; the diagonal and the
        lower triangle are ignored.
    """
    side = cosine_mats.shape[1]
    upper = np.triu_indices(side, k=1)
    per_matrix = [np.var(mat[upper]) for mat in cosine_mats]
    return np.array(per_matrix, dtype=float)

# One variance of the off-diagonal (upper-triangle) similarities per group.
S_var = similarity_variance(cosine_mats)

In [33]:
# finally, construct the feature matrix X and save to file

# Feature matrix: one row per group, columns in the order
# [H_sem, dispersions, Mbar, K, S_var].
X = np.column_stack((H_sem, dispersions, Mbar, K, S_var))

np.save("X.npy", X)