Data for Flourish  
Lets create 'source - feast matrix' and 'sorurce - feast network' in format that suits Flourish visualisation software


In [1]:
import pandas as pd

# Reading csv files
responsories_all = pd.read_csv('all-ci-responsories.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('all-ci-antiphons.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('source.csv', usecols=['id', 'title', 'provenance_id', 'drupal_path'])
feasts = pd.read_csv('feast.csv', usecols=['id', 'name'])

# Add sources to data 
respo_with_sources = pd.merge(responsories_all, sources, how='inner', left_on='source_id', right_on='drupal_path')
antipho_with_sources = pd.merge(antiphons_all, sources, how='inner', left_on='source_id', right_on='id')

# Merge responsories and antiphons
data = pd.concat([respo_with_sources, antipho_with_sources])

In [2]:
# Filter sources to avoid working with fragments etc
freq_of_sources = data['drupal_path'].value_counts()
bigger_sources = freq_of_sources.drop(freq_of_sources[freq_of_sources.values < 100].index).index.tolist()
sources_f = sources[sources['drupal_path'].isin(bigger_sources)]

In [3]:
# Filter feasts from filtered sources
freq_of_feasts = data['feast_id'].value_counts()
print("number of all feasts in bigger sources:", len(freq_of_feasts))
bigger_feasts = freq_of_feasts.drop(freq_of_feasts[freq_of_feasts.values < 10].index).index.tolist()
print("number of all bigger feasts in bigger sources:", len(bigger_sources))
feasts_f = feasts[feasts['id'].isin(bigger_feasts)]

number of all feasts in bigger sources: 911
number of all bigger feasts in bigger sources: 131


In [4]:
# Get data without small sources and feasts
data_f = data[data['drupal_path'].isin(sources_f['drupal_path'])]
data_f = data_f[data_f['feast_id'].isin(feasts_f['id'])]

In [5]:
# Dicts for feast and source ids
from collections import OrderedDict
source_dict = OrderedDict()
i = 0
for id in sources_f['drupal_path']:
    source_dict[id] = i
    i += 1

feast_dict = OrderedDict()
j = 0
for id in feasts_f['id']:
    feast_dict[id] = j
    j += 1

In [6]:
# Functions translating feast_ids to indexis
def translate_feast(feast_id):
    return feast_dict[feast_id] 

def translate_source(source_id):
    return source_dict[source_id]

In [7]:
# Heatmap part
import numpy as np

# Make matrix sources vs feast
complete_chart = np.zeros([len(sources_f), len(feasts_f)])
# Fill the chart (aka Bitmapa)
for row in data_f.index:
    complete_chart[translate_source(data_f['drupal_path'][row]), translate_feast(data_f['feast_id'][row])] = 1

In [8]:
# Get ready pandas data frame of 'source - feast' matrix
import itertools

len_f = len(feasts_f)
s_column = list(itertools.chain.from_iterable([len_f * [s] for s in sources_f['title'].tolist()]))
f_column = len(sources_f) * feasts_f['name'].tolist()

presence_column = list(itertools.chain.from_iterable([row for row in complete_chart]))

chart_source_feast = pd.DataFrame({'sources' : s_column,
                                   'feasts' : f_column,
                                   'presence': presence_column})


In [9]:
# Get desired csv of 'source - feast' matrix
chart_source_feast.to_csv('flourish_source_feast_chart.csv')

In [14]:
# Network part
# We have some feast in common, my dear source friend
len_s = len(sources_f)
s1_id_column = list(itertools.chain.from_iterable([len_s * [s] for s in sources_f['drupal_path'].tolist()]))
s2_id_column = len_s * sources_f['drupal_path'].tolist()
s1_tit_column = list(itertools.chain.from_iterable([len_s * [s] for s in sources_f['title'].tolist()]))
s2_tit_column = len_s * sources_f['title'].tolist()
shared_column = []
for i in range(len(s1_id_column)):
    shared_column.append(sum(
        np.logical_and(complete_chart[translate_source(s1_id_column[i])], complete_chart[translate_source(s2_id_column[i])])))

print(shared_column)
print(len(shared_column), len_s)

chart_shared_feasts_of_sources = pd.DataFrame({'source1' : s1_tit_column,
                                               'source2' : s2_tit_column,
                                               'shared' : shared_column})

[145, 99, 25, 93, 74, 64, 105, 117, 21, 90, 112, 17, 100, 40, 47, 34, 29, 40, 93, 109, 99, 6, 90, 41, 80, 70, 21, 69, 37, 26, 52, 111, 37, 43, 45, 49, 8, 57, 45, 76, 61, 96, 44, 48, 20, 26, 54, 59, 72, 8, 54, 93, 44, 39, 57, 122, 16, 21, 14, 15, 86, 55, 59, 46, 59, 65, 122, 90, 13, 121, 27, 85, 110, 59, 81, 99, 46, 53, 36, 62, 97, 123, 30, 60, 105, 24, 101, 73, 29, 71, 90, 13, 86, 79, 91, 66, 27, 49, 43, 98, 93, 97, 55, 68, 85, 53, 85, 31, 88, 42, 47, 37, 59, 97, 21, 91, 112, 30, 82, 67, 76, 109, 76, 38, 60, 44, 25, 52, 51, 121, 92, 99, 118, 22, 91, 74, 60, 94, 100, 22, 87, 97, 17, 86, 40, 43, 32, 29, 39, 92, 97, 87, 5, 84, 39, 68, 50, 20, 57, 35, 19, 55, 97, 38, 39, 43, 43, 7, 56, 34, 73, 53, 90, 42, 40, 17, 27, 52, 57, 65, 7, 48, 90, 39, 39, 52, 101, 15, 22, 12, 15, 80, 46, 51, 42, 57, 55, 102, 87, 15, 101, 23, 83, 99, 50, 78, 102, 40, 47, 36, 61, 91, 99, 30, 49, 86, 25, 94, 62, 29, 66, 79, 12, 81, 74, 86, 59, 23, 47, 37, 88, 89, 90, 53, 64, 82, 45, 77, 29, 85, 40, 43, 30, 52, 88, 18

In [15]:
chart_shared_feasts_of_sources.to_csv('sources_shared_feasts_flourish.csv')