Notebook for experiments with Spectral clustering algorithm


In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn.cluster import SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, jaccard_score, hamming_loss
import matplotlib.pyplot as plt

In [3]:
# Read data
responsories_all = pd.read_csv('../data/all-ci-responsories.csv', usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('../data/all-ci-antiphons.csv', usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('../data/sources-with-provenance-ids-and-two-centuries.csv', usecols=['provenance_id', 'drupal_path'])
feasts = pd.read_csv('../data/feast.csv', usecols=['id', 'name'])

data = pd.concat([responsories_all, antiphons_all])

In [4]:
# Metrics for measuring similarity of two sets ('chant sharingness')
def intersection_size(a : list, b : list):
    '''
    Function returns size of intersection of two sets
    '''
    return len(set(a).intersection(set(b)))

def Jaccard_metrics(a : list, b : list):
    '''
    Function returns value of Jaccard metrics applied on two sets
    '''
    if len(set(a).union(set(b))) != 0:
        return (len(set(a).intersection(set(b))) / len(set(a).union(set(b))))
    else:
        return 0

In [5]:
# Source translate to int for smooth matrix indexing 
from collections import OrderedDict

source_dict = OrderedDict()
i = 0
for id in sources['drupal_path']:
    source_dict[id] = i
    i += 1
    

def translate_source(source_id):
    return source_dict[source_id]

In [6]:
def get_closeness_matrix_all(compare_func, sources):
    source_chants_dict = {}
    chants = data

    for source_id in sources:
        filt_source = chants['source_id'] == source_id
        source_chants_dict[source_id] = (chants[filt_source]['cantus_id']).tolist()
    
    closeness_matrix = np.zeros([len(sources), len(sources)])
    for s_i in sources:
        for s_j in sources:
            closeness_matrix[source_dict[s_i], source_dict[s_j]] = compare_func(source_chants_dict[s_i], source_chants_dict[s_j])
    
    return closeness_matrix

In [7]:
dist_mat = get_closeness_matrix_all(Jaccard_metrics, sources['drupal_path'].tolist()) #['http://cantus.sk/source/14828','http://cantus.sk/source/14018', 'http://cantus.uwaterloo.ca/source/123730','http://cantus.uwaterloo.ca/source/123727', 'http://cantus.uwaterloo.ca/source/649450', 'http://cantus.uwaterloo.ca/source/123595', 'http://cantus.uwaterloo.ca/source/123606', 'http://cantus.uwaterloo.ca/source/123750', 'http://cantus.uwaterloo.ca/source/123718'])
print(dist_mat)
print('sym', (dist_mat==dist_mat.T).all())

[[1.         0.25943971 0.24829739 ... 0.32778076 0.         0.02472527]
 [0.25943971 1.         0.68959869 ... 0.35355781 0.00933707 0.04789272]
 [0.24829739 0.68959869 1.         ... 0.339254   0.         0.04142186]
 ...
 [0.32778076 0.35355781 0.339254   ... 1.         0.         0.00185874]
 [0.         0.00933707 0.         ... 0.         1.         0.10516605]
 [0.02472527 0.04789272 0.04142186 ... 0.00185874 0.10516605 1.        ]]
sym True


In [8]:
CLUSTERS_OPTIONS = [1, 2, 3, 4, 5, 6, 7]

In [9]:
sources = sources['drupal_path'].tolist()
community_variants = []
clustering_scores = {}
for n_clusters in CLUSTERS_OPTIONS:
    clustering_scores[n_clusters] = []
    for _ in range(20):
        distance_matrix = get_closeness_matrix_all(Jaccard_metrics, sources)
        clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', assign_labels='cluster_qr')
        clustering.fit(distance_matrix)
        community_variants.append(clustering.labels_)
    rand = []
    jaccard = []
    mutual_info = []
    hamming = []
    all_pairs = [(a, b) for idx, a in enumerate(community_variants) for b in community_variants[idx + 1:]]
    for pair in all_pairs:
        rand.append(adjusted_rand_score(pair[0], pair[1]))
        jaccard.append(jaccard_score(pair[0], pair[1], average='micro'))
        mutual_info.append(adjusted_mutual_info_score(pair[0], pair[1]))
        hamming.append(hamming_loss(pair[0], pair[1]))
    clustering_scores[n_clusters].append((np.mean(rand), np.mean(jaccard), np.mean(mutual_info), np.mean(hamming)))

In [11]:
for i in CLUSTERS_OPTIONS:
    print(i)
    print('mean jaccard', np.mean([x[1] for x in clustering_scores[i]]))
    print('mean rand', np.mean([x[0] for x in clustering_scores[i]]))
    print('mean mutual', np.mean([x[2] for x in clustering_scores[i]]))
    print('mean hamming', np.mean([x[3] for x in clustering_scores[i]]))

1
mean jaccard 1.0
mean rand 1.0
mean mutual 1.0
mean hamming 0.0
2
mean jaccard 0.5061728395061729
mean rand 0.48717948717948717
mean mutual 0.48717948717948717
mean hamming 0.4761904761904762
3
mean jaccard 0.33609235293807815
mean rand 0.48147228277650606
mean mutual 0.4856667303127618
mean hamming 0.6507263922518159
4
mean jaccard 0.30297529637618
mean rand 0.38499740201090954
mean mutual 0.43603317406527464
mean hamming 0.6662522603978299
5
mean jaccard 0.26472256254563314
mean rand 0.3368402242855735
mean mutual 0.4082884204814024
mean hamming 0.6933621933621933
6
mean jaccard 0.22204849209141383
mean rand 0.32149350217226647
mean mutual 0.40891707520931136
mean hamming 0.7385454181672669
7
mean jaccard 0.20871410491488873
mean rand 0.3077762718996142
mean mutual 0.4086648988512147
mean hamming 0.7478710908823961
