Creating a data file for a Gephi network:
* nodes = sources
* edges = weighted by the number of chants shared between two sources (intersection size or Jaccard index)

In [78]:
import pandas as pd

# Load the chant tables; cantus_id is read as a string column
responsories_all = pd.read_csv(
    'all-ci-responsories.csv',
    usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'],
    dtype={'cantus_id': "str"},
)
antiphons_all = pd.read_csv(
    'all-ci-antiphons.csv',
    usecols=['cantus_id', 'siglum', 'source_id', 'feast_id'],
    dtype={'cantus_id': "str"},
)

# Load the lookup tables for sources and feasts
sources = pd.read_csv(
    'sources-with-provenance-ids-and-two-centuries.csv',
    usecols=['provenance_id', 'drupal_path'],
)
feasts = pd.read_csv('feast.csv', usecols=['id', 'name'])

# NOTE: attaching `sources` to the chant tables is disabled; as written the
# merges below only name left_on and give no key for the right-hand table.
#respo_with_sources = pd.merge(responsories_all, sources, how='inner', left_on='source_id')
#antipho_with_sources = pd.merge(antiphons_all, sources, how='inner', left_on='source_id')

# Stack responsories and antiphons into one chant table
data = pd.concat([responsories_all, antiphons_all])

In [79]:
# Keep only sources with at least 100 chant records, to avoid working with fragments etc
freq_of_sources = data['source_id'].value_counts()
bigger_sources = freq_of_sources[freq_of_sources >= 100].index.tolist()
sources_f = sources.loc[sources['drupal_path'].isin(bigger_sources)]
# Number of chant records, then number of sources kept
print(len(data), len(sources_f), sep='\n')

375275
64


In [80]:
# Translate each kept source to a consecutive int for smooth matrix indexing
from collections import OrderedDict

# Built in a single expression so that no loop variables leak into the notebook
# namespace (a top-level loop variable named `id` would shadow the built-in `id`).
# Keys keep the order of sources_f['drupal_path']; values count up from 0.
source_dict = OrderedDict(
    (source_path, position)
    for position, source_path in enumerate(sources_f['drupal_path'])
)
    

def translate_source(source_id):
    '''
    Function returns the integer stored in source_dict for the given source

    Raises KeyError when source_id is not a key of source_dict.
    '''
    matrix_index = source_dict[source_id]
    return matrix_index

In [81]:
# Metrics for measuring similarity of two sets ('chant sharingness')
def intersection_size(a : list, b : list):
    '''
    Function returns the number of distinct elements present in both a and b
    '''
    shared = set(a) & set(b)
    return len(shared)

def Jaccard_metrics(a : list, b : list):
    '''
    Function returns value of Jaccard metrics applied on two sets

    Duplicates in a and b are ignored. The result is
    len(intersection) / len(union) as a float; when both inputs are
    empty the union is empty and 0.0 is returned.
    '''
    # Build each set once and reuse it for both union and intersection
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Nothing to compare; return a float so the result type is always the same
        return 0.0
    return len(set_a & set_b) / union_size

In [82]:
# Get matrix for building network in Gephi
import numpy as np

def get_data_for_one_feast(feast_id, compare_func):
    '''
    Function returns a square matrix of pairwise source similarity for one feast

    feast_id: value matched against data['feast_id']
    compare_func: callable taking two lists of cantus ids and returning a number
                  (e.g. intersection_size or Jaccard_metrics)

    Rows and columns are addressed through translate_source. Every unordered
    pair of sources is evaluated exactly once, so only one cell per pair is
    filled; its mirror cell and the diagonal keep their initial 0.
    The matrix is also printed before it is returned.
    '''
    chants_of_feast = data[data['feast_id'] == feast_id]

    # Materialise the source ids once; they are reused for the per-source
    # lookup, the matrix size and the pair loop below.
    source_ids = sources_f['drupal_path'].tolist()

    # Cantus ids recorded for this feast in each source (empty list if none)
    source_chants_dict = {}
    for source_id in source_ids:
        filt_source = chants_of_feast['source_id'] == source_id
        source_chants_dict[source_id] = chants_of_feast.loc[filt_source, 'cantus_id'].tolist()

    data_matrix = np.zeros([len(source_ids), len(source_ids)])
    for idx, s_i in enumerate(source_ids):
        row = translate_source(s_i)
        # Pair s_i only with the sources listed after it
        for s_j in source_ids[idx + 1:]:
            data_matrix[row, translate_source(s_j)] = compare_func(
                source_chants_dict[s_i], source_chants_dict[s_j]
            )

    print(data_matrix)
    return data_matrix
    

In [83]:
def get_data_of_more_feasts(ids_list):
    '''
    Placeholder, not implemented yet: ids_list is ignored and None is returned

    TODO: presumably meant to handle several feast ids at once - confirm intent.
    '''
    return None

In [84]:
def build_data_csv(data_matrix, feast):
    '''
    Function writes data_matrix to '<feast>_source_feast_gephi.csv'

    The file is ';'-separated. The header line starts with an empty cell
    followed by every source in sources_f['drupal_path']; each following line
    holds one source and the values of its matrix row (row looked up through
    translate_source, columns 0 .. len(sources_f) - 1).
    An existing file of the same name is overwritten.
    '''
    out_path = feast + '_source_feast_gephi.csv'
    with open(out_path, 'w') as out:
        source_ids = sources_f['drupal_path'].tolist()
        n_columns = len(sources_f)
        out.write(';' + ';'.join(source_ids) + '\n')
        for source_id in source_ids:
            row_idx = translate_source(source_id)
            cells = [str(data_matrix[row_idx, col]) for col in range(n_columns)]
            out.write(';'.join([source_id] + cells) + '\n')

In [85]:
# build_data_csv derives the file name from the label alone, so each metric gets
# its own label; with a shared label the second export overwrites the first.
build_data_csv(get_data_for_one_feast("feast_1531", intersection_size), "feast_1531_intersection")
build_data_csv(get_data_for_one_feast("feast_1531", Jaccard_metrics), "feast_1531_jaccard")

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
