In [1]:
import pandas as pd

# Reading csv files
responsories_all = pd.read_csv('all-ci-responsories.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})
antiphons_all = pd.read_csv('all-ci-antiphons.csv', usecols=['id', 'cantus_id', 'siglum', 'source_id', 'feast_id'], dtype={'cantus_id':"str"})

sources = pd.read_csv('source.csv', usecols=['id', 'provenance_id', 'drupal_path'])
feasts = pd.read_csv('feast.csv', usecols=['id', 'name'])

# Do I want anthiphones and responsories = together staci

In [2]:
# Add sources to data 
respo_with_sources = pd.merge(responsories_all, sources, how='inner', left_on='source_id', right_on='drupal_path')
antipho_with_sources = pd.merge(antiphons_all, sources, how='inner', left_on='source_id', right_on='id')

In [3]:
# Add feasts to data
respo_with_all = pd.merge(respo_with_sources, feasts, how='inner', left_on='feast_id', right_on='id')
antipho_with_all = pd.merge(antipho_with_sources, feasts, how='inner', left_on='feast_id', right_on='id')

In [4]:
# Merge responsories and antiphons
data = pd.concat([respo_with_all, antipho_with_all])

In [5]:
# Filter sources to avoid working with fragments etc
freq_of_sources = data['drupal_path'].value_counts()
bigger_sources = freq_of_sources.drop(freq_of_sources[freq_of_sources.values < 100].index).index.tolist()
sources_f = sources[sources['drupal_path'].isin(bigger_sources)]

In [6]:
# Get data without small sources
data_f = data[data['drupal_path'].isin(sources_f['drupal_path'])]

In [7]:
# Dicts for feast and source ids
from collections import OrderedDict
source_dict = OrderedDict()
i = 0
for id in sources_f['drupal_path']:
    source_dict[id] = i
    i += 1

feast_dict = OrderedDict()
j = 0
for id in feasts['id']:
    feast_dict[id] = j
    j += 1

In [8]:
def translate_feast(feast_id):
    return feast_dict[feast_id] 

def translate_source(source_id):
    return source_dict[source_id]

In [9]:
import numpy as np
# Make matrix sources vs feast
complete_chart = np.zeros([len(sources_f), len(feasts)])
# Fill the chart (aka Bitmapa)
for row in data_f.index:
    complete_chart[translate_source(data_f['drupal_path'][row]), translate_feast(data_f['feast_id'][row])] = 1

In [10]:
# Take out (and store) feasts, that are in all sources
sums = np.sum(complete_chart, axis = 0)
num_of_sources = len(sources_f)

# List of feasts shared among all sources
shared = [i for i in range(len(sums)) if sums[i] == num_of_sources]

print("completely shared feasts:", shared)   # NO SHARED FEASTS???
print("max number of sharings:", max(sums))
print(num_of_sources)

completely shared feasts: []
max number of sharings: 93.0
131


From what is above his it seems that there are not shared feasts among all sources.  
Most shared feast is in 99 sources out of 640.
(After filtering of sources - we preserve once with more than 100 chants, it is max 94 sharings out of 131.)

Piece of data, that may reveal possible future "troubles":  
feast_0229,Comm. plurimorum Martyrum in vigilia,Eve of several Martyrs,,,,12003010,  
feast_0230,Comm. plurimorum Martyrum in vigilia,Eve of several Martyrs,,,,12003010,  
feast_0231,Comm. plurimorum Martyrum in vigilia,Eve of several Martyrs,,,,12003010,

In [11]:
# Histogram
# Percnetage of feasts where source is present

# Create dict for creating df
feast_histo_dict = {}
for i in range(len(feasts)):
    feast_histo_dict[feasts['id'][i]] = [feasts['name'][i], round((sums[i] / num_of_sources) * 100)]

feast_histo = pd.DataFrame.from_dict(feast_histo_dict, orient='index', columns=['feast_name', 'percentage'])
feast_histo.to_csv('feast_histo.csv')

In [12]:
# Histogram
# Percnetage of feasts where source is present - more important (over 10 percent) only

# Create dict for creating df
feast_histo_dict = {}
for i in range(len(feasts)):
    if (sums[i] / num_of_sources) * 100 > 10:
        feast_histo_dict[feasts['id'][i]] = [feasts['name'][i], (sums[i] / num_of_sources) * 100]

feast_histo = pd.DataFrame.from_dict(feast_histo_dict, orient='index', columns=['feast_name', 'percentage'])
feast_histo.to_csv('more_feast_histo.csv')

In [13]:
# Export chart as csv for visualisation
df = pd.DataFrame(complete_chart)
df.to_csv('raw_chart.csv')