Data for Flourish  
Lets create 'source - feast matrix' and 'sorurce - feast network' in format that suits Flourish visualisation software


In [1]:
import pandas as pd

# Reading csv files
responsories_all = pd.read_csv('all-ci-responsories.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('all-ci-antiphons.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('source.csv', usecols=['id', 'title', 'provenance_id', 'drupal_path'])
feasts = pd.read_csv('feast.csv', usecols=['id', 'name'])

# Add sources to data 
respo_with_sources = pd.merge(responsories_all, sources, how='inner', left_on='source_id', right_on='drupal_path')
antipho_with_sources = pd.merge(antiphons_all, sources, how='inner', left_on='source_id', right_on='id')

# Merge responsories and antiphons
data = pd.concat([respo_with_sources, antipho_with_sources])

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Filter sources to avoid working with fragments etc
freq_of_sources = data['drupal_path'].value_counts()
bigger_sources = freq_of_sources.drop(freq_of_sources[freq_of_sources.values < 100].index).index.tolist()
sources_f = sources[sources['drupal_path'].isin(bigger_sources)]

In [None]:
# Filter feasts from filtered sources
freq_of_feasts = data['feast_id'].value_counts()
print("number of all feasts in bigger sources:", len(freq_of_feasts))
bigger_feasts = freq_of_feasts.drop(freq_of_feasts[freq_of_feasts.values < 10].index).index.tolist()
print("number of all bigger feasts in bigger sources:", len(bigger_sources))
feasts_f = feasts[feasts['id'].isin(bigger_feasts)]

number of all feasts in bigger sources: 911
number of all bigger feasts in bigger sources: 131


In [None]:
# Get data without small sources and feasts
data_f = data[data['drupal_path'].isin(sources_f['drupal_path'])]
data_f = data_f[data_f['feast_id'].isin(feasts_f['id'])]

In [None]:
# Dicts for feast and source ids
from collections import OrderedDict
source_dict = OrderedDict()
i = 0
for id in sources_f['drupal_path']:
    source_dict[id] = i
    i += 1

feast_dict = OrderedDict()
j = 0
for id in feasts_f['id']:
    feast_dict[id] = j
    j += 1

In [None]:
# Functions translating feast_ids to indexis
def translate_feast(feast_id):
    return feast_dict[feast_id] 

def translate_source(source_id):
    return source_dict[source_id]

In [None]:
import numpy as np

# Make matrix sources vs feast
complete_chart = np.zeros([len(sources_f), len(feasts_f)])
# Fill the chart (aka Bitmapa)
for row in data_f.index:
    complete_chart[translate_source(data_f['drupal_path'][row]), translate_feast(data_f['feast_id'][row])] = 1

In [None]:
# Get ready pandas data frame of 'source - feast' matrix
import itertools

len_f = len(feasts_f)
s_column = list(itertools.chain.from_iterable([len_f * [s] for s in sources_f['title'].tolist()]))
f_column = len(sources_f) * feasts_f['name'].tolist()

presence_column = list(itertools.chain.from_iterable([row for row in complete_chart]))

chart_source_feast = pd.DataFrame({'sources' : s_column,
                                   'feasts' : f_column,
                                   'presence': presence_column})


In [None]:
# Get desired csv of 'source - feast' matrix
chart_source_feast.to_csv('flourish_source_feast_chart.csv')