Creating a data file for a Gephi network:
* nodes = sources
* edges = weighted by the number of chants shared between two sources (intersection size or Jaccard index)

In [57]:
import pandas as pd

# Reading csv files
# cantus_id is read as a string in both chant tables.
responsories_all = pd.read_csv('all-ci-responsories.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('all-ci-antiphons.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('source.csv', usecols=['id', 'provenance_id', 'drupal_path'])
feasts = pd.read_csv('feast.csv', usecols=['id', 'name'])

# Add sources to data
# Both chant tables have the same columns, so they are joined to `sources`
# on the same key. Previously the antiphons were joined on sources['id']
# while the responsories were joined on sources['drupal_path'].
# NOTE(review): assumes chant 'source_id' holds the source's drupal_path
# (the rest of the notebook identifies sources by 'drupal_path') - confirm
# against the csv files.
respo_with_sources = pd.merge(responsories_all, sources, how='inner', left_on='source_id', right_on='drupal_path')
antipho_with_sources = pd.merge(antiphons_all, sources, how='inner', left_on='source_id', right_on='drupal_path')

# Merge responsories and antiphons into one table of chants
data = pd.concat([respo_with_sources, antipho_with_sources])

In [58]:
# Keep only sources with at least 100 chants in `data`, to avoid working with fragments etc
freq_of_sources = data['drupal_path'].value_counts()
bigger_sources = freq_of_sources.index[freq_of_sources.values >= 100].tolist()
sources_f = sources.loc[sources['drupal_path'].isin(bigger_sources)]

In [59]:
# Translate each source to an int for smooth matrix indexing
from collections import OrderedDict

# Maps drupal_path -> position of the source in sources_f.
# Built with enumerate() inside a generator expression so that no loop
# variable leaks into the notebook namespace (the previous loop variable
# was called `id` and shadowed the builtin for all later cells).
source_dict = OrderedDict(
    (source_path, index)
    for index, source_path in enumerate(sources_f['drupal_path'])
)
    

def translate_source(source_id):
    """Return the integer stored for `source_id` in `source_dict`.

    Raises KeyError when `source_id` has no entry in `source_dict`.
    """
    matrix_index = source_dict[source_id]
    return matrix_index

In [60]:
def intersection_size(a, b):
    """Return how many distinct elements occur in both `a` and `b`."""
    first, second = set(a), set(b)
    shared = first.intersection(second)
    return len(shared)

In [61]:
def Jaccard_metrics(a, b):
    """Return the Jaccard index of `a` and `b`: |A & B| / |A | B|.

    Both arguments are treated as sets, so duplicates are ignored.
    The result is a float between 0.0 and 1.0. When both inputs are
    empty the union is empty as well; 0.0 is returned in that case
    (nothing is shared) instead of dividing by zero.
    """
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        return 0.0
    # Sizes, not the sets themselves, are divided (set / set is a TypeError).
    return len(set_a & set_b) / union_size

In [62]:
# Get datafile for building network in Gephi
import numpy as np


def get_data_for_one_feast(feast_id, compare_func):
    """Build the source-by-source comparison matrix for one feast.

    Args:
        feast_id: value matched against the 'feast_id' column of `data`.
        compare_func: callable taking two lists of cantus_ids (one per
            source) and returning the value stored for that pair of
            sources, e.g. `intersection_size`.

    Returns:
        A square numpy array with one row/column per entry of `sources_f`,
        indexed through `translate_source`. Only pairs (s_i, s_j) where s_i
        precedes s_j in `sources_f` are filled; every other cell stays 0.
    """
    source_ids = sources_f['drupal_path'].tolist()
    chants_of_feast = data[data['feast_id'] == feast_id]

    # Collect the cantus_ids of every kept source in a single pass over the
    # feast's chants instead of re-filtering the frame once per source.
    # Sources without any chant for this feast keep an empty list.
    source_chants_dict = {source_id: [] for source_id in source_ids}
    for source_id, cantus_id in zip(chants_of_feast['drupal_path'], chants_of_feast['cantus_id']):
        if source_id in source_chants_dict:
            source_chants_dict[source_id].append(cantus_id)

    data_matrix = np.zeros([len(source_ids), len(source_ids)])
    # Visit each unordered pair of sources once (s_j always comes after s_i).
    for idx, s_i in enumerate(source_ids):
        row = translate_source(s_i)
        for s_j in source_ids[idx + 1:]:
            data_matrix[row, translate_source(s_j)] = compare_func(source_chants_dict[s_i], source_chants_dict[s_j])

    return data_matrix
    

In [63]:
def get_data_or_more_feasts(ids_list):
    """Placeholder that is not implemented yet.

    Currently ignores `ids_list` and returns None.
    """
    return None

In [66]:
def build_data_csv(data_matrix):
    """Write `data_matrix` to 'source_feast_gephi.csv' as a ';'-separated matrix.

    The first line is an empty cell followed by the drupal_path of every
    source in `sources_f`. Each following line starts with one source's
    drupal_path, followed by that source's row of `data_matrix` (looked up
    through `translate_source`).

    Args:
        data_matrix: 2-D array indexable as data_matrix[row, col], with at
            least len(sources_f) columns.
    """
    source_ids = sources_f['drupal_path'].tolist()
    columns = range(len(source_ids))

    # Build all lines first so the file is written in one go and is not left
    # half-written if a lookup fails part-way through.
    lines = [';' + ';'.join(source_ids) + '\n']
    for source_id in source_ids:
        row = translate_source(source_id)
        cells = ''.join(';' + str(data_matrix[row, col]) for col in columns)
        lines.append(source_id + cells + '\n')

    # Explicit encoding so the output does not depend on the platform default.
    with open('source_feast_gephi.csv', 'w', encoding='utf-8') as f:
        f.writelines(lines)

In [67]:
# Compute the matrix for one feast using the intersection size, then write the csv
feast_matrix = get_data_for_one_feast("feast_0438", intersection_size)
build_data_csv(feast_matrix)
#build_data_csv(get_data_for_one_feast("feast_0357", Jaccard_metrics))