# Statistical tools for BiblicalIntertextuality

This Jupyter notebook contains functions that allow you to compute statistics from the data corpus (both the journals and the Bible) and from the results file.

In [189]:
import pandas as pd
import json
import os
import biblical_intertextuality_package as bip
from collections import defaultdict
import numpy as np
import joblib
from nltk import word_tokenize

from biblical_intertextuality_package import split_verse

In [190]:
# Every path below is built from the current working directory.
ROOT_PATH = os.getcwd()

# Locations directly under ROOT_PATH (presumably directories; BIBLES_PATH is listed with os.listdir by analise_the_bible()).
BIBLES_PATH = os.path.join(ROOT_PATH, 'Bible_files')
DATASETS_PATH = os.path.join(ROOT_PATH, 'datasets')
DICTS_PATH = os.path.join(ROOT_PATH, 'dictionaries')
CORPUS_PATH = os.path.join(ROOT_PATH, 'corpuses')
ALL_JSONS_PATH = os.path.join(ROOT_PATH, 'query_jsons')

# joblib file that analyse_journals() loads.
JOURNAL_FULLDATA_PATH = os.path.join(ROOT_PATH, 'journals_fulldata.joblib')

# Results directory: 'PUBLIC_RESULTS' is active, the 'results' variant is kept commented out as an alternative.
# RESULTS_PATH = os.path.join(ROOT_PATH, 'results')
RESULTS_PATH = os.path.join(ROOT_PATH, 'PUBLIC_RESULTS')
BATCHES_FILE_PATH = os.path.join(ROOT_PATH, 'batches.csv')
BATCH_RESULTS_FILE_PATH = os.path.join(RESULTS_PATH, 'batch_results.csv')

# Plain-text files located directly in ROOT_PATH.
STOP_WORDS_PATH = os.path.join(ROOT_PATH, 'stop_words.txt')
STOP_SUBVERSES_PATH = os.path.join(ROOT_PATH, 'stop_subverses_21.txt')
EXCLUSIVES_PATH = os.path.join(ROOT_PATH, 'exclusives.txt')

EVALUATION_STOP_SUBVERSES_PATH = os.path.join(ROOT_PATH, 'evaluation_stop_subverses_21.txt')
FULL_HIT_NEEDED_SUBS_PATH = os.path.join(ROOT_PATH, '100_hit_needed_subs_21.txt')

## The Bible statistics
The following functions return statistics about the Bible files stored in the Bible_files directory. Be aware that we, as the developers, have an additional translation at our disposal ('Bible Svatováclavská'), so your statistics will probably be different.

- number of translations at our disposal: 5
    - one is both Old and New Testament ("Bible Kralická")
    - translation of Jan Hejčl: Old Testament + deuterocanonical books
    - "Bible of Saint Wenceslas" (Bible Svatováclavská): a mix of Old and New Testament books, not complete (even some individual verses are missing)
    - translation of Ladislav Sýkora: New Testament
    - translation of František Žilka: New Testament

In [191]:
# Translation abbreviation -> full name of the translation.
translation_names = dict(
    BKR='Bible Kralická',
    BSV='Bible Svatováclavská',
    HEJCL='Jan Hejčl',
    SYK='Ladislav Sýkora',
    ZP='František Žilka',
)

In [4]:
def get_book_filename_data(bible_file_name:str):
    """
    Return (translation, book_name) parsed from the name of a Bible file.

    The last four characters of the name are cut off (presumably a three-letter
    extension with its dot) and the remainder is split on underscores: the
    second part is the translation, the third part is the book name.
    """
    name_without_suffix = bible_file_name[:-4]
    parts = name_without_suffix.split('_')
    return parts[1], parts[2]


def analise_the_bible():
    """
    This function analyses all data in the Bible_files directory. You can change split_verse settings if you have created your dataset with different settings.

    It prints following statistics:
    - number of translations and their abbreviated names
    - number of books (in aggregation and for each translation separately)
    - number of verses (by verse_id, all in aggregation, and for each translation separately)
    - number of subverses (in aggregation and for each translation separately)
    - number of words (in aggregation and for each translation separately)

    And it saves csv file with statistics of books, verses, subverses and words for each translation (into RESULTS_PATH (by default now PUBLIC_RESULTS directory) --> statistics --> bible_stats.csv).

    Translations are reported in alphabetical order, so both the printed output and the csv rows are the same on every run.
    """
    # Distinct values are collected in sets; the aggregated counters count every occurrence.
    translations = set()
    books_names = set()
    verse_ids = set()

    books_aggregated = 0
    books_per_trsl = defaultdict(int)

    verses_aggregated = 0
    verses_per_trsl = defaultdict(int)

    subverses_aggregated = 0
    subverses_per_trsl = defaultdict(int)

    words_aggregated = 0
    words_per_trsl = defaultdict(int)

    for bible_file in os.listdir(BIBLES_PATH):
        translation, book_name = get_book_filename_data(bible_file)

        translations.add(translation)
        books_names.add(book_name)
        books_aggregated += 1
        books_per_trsl[translation] += 1

        with open(os.path.join(BIBLES_PATH, bible_file), 'r', encoding='utf-8') as b_file:
            data = b_file.read()

        # SECURITY NOTE(review): eval() executes whatever the file contains. Only run this on
        # Bible files you trust; if the files are plain dict literals, ast.literal_eval would
        # be the safe replacement -- confirm the file format before switching.
        verses = eval(data)

        for verse_id in verses:
            verse_text = verses[verse_id]

            verse_ids.add(verse_id)
            verses_aggregated += 1
            verses_per_trsl[translation] += 1

            word_count = len(bip.word_tokenize_no_punctuation(verse_text))
            words_per_trsl[translation] += word_count
            words_aggregated += word_count

            subverse_count = len(split_verse(verse_text))
            subverses_aggregated += subverse_count
            subverses_per_trsl[translation] += subverse_count

    # Sorting removes the dependence on set ordering and on the order os.listdir returns files in.
    translations = sorted(translations)

    print(translations)
    print('Numebr of translations:', len(translations))

    print()

    print('Number of available books:', len(books_names))
    print('Number of books accross translations:', books_aggregated)
    for trsl in translations:
        print('Number of books in', trsl, 'is', books_per_trsl[trsl])

    print()

    print('Number of available verses:', len(verse_ids))
    print('Number of verses accross translations:', verses_aggregated)
    for trsl in translations:
        print('Number of verses in', trsl, 'is', verses_per_trsl[trsl])

    print()

    print('Total number of subverses:', subverses_aggregated)
    for trsl in translations:
        print('Number of subverses in', trsl, 'is', subverses_per_trsl[trsl])

    print('Total number of words:', words_aggregated)
    for trsl in translations:
        print('Number of words in', trsl, 'is', words_per_trsl[trsl])

    df_dict = {
        'translation': list(translations),
        'books': [books_per_trsl[trsl] for trsl in translations],
        'verses': [verses_per_trsl[trsl] for trsl in translations],
        'subverses': [subverses_per_trsl[trsl] for trsl in translations],
        'words': [words_per_trsl[trsl] for trsl in translations],
    }

    bible_stats_df = pd.DataFrame(df_dict)
    bible_stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'bible_stats.csv'))


In [90]:
analise_the_bible()

['BKR', 'BSV', 'SYK', 'ZP', 'HEJCL']
Numebr of translations: 5

Number of available books: 74
Number of books accross translations: 202
Number of books in BKR is 66
Number of books in BSV is 37
Number of books in HEJCL is 47
Number of books in SYK is 26
Number of books in ZP is 26

Number of available verses: 36745
Number of verses accross translations: 89485
Number of verses in BKR is 31172
Number of verses in BSV is 15326
Number of verses in HEJCL is 27896
Number of verses in SYK is 7541
Number of verses in ZP is 7550

Total number of subverses: 329867
Number of subverses in BKR is 121283
Number of subverses in BSV is 55660
Number of subverses in HEJCL is 103523
Number of subverses in SYK is 24786
Number of subverses in ZP is 24615
Total number of words: 1620772
Number of words in BKR is 594928
Number of words in BSV is 267069
Number of words in HEJCL is 505218
Number of words in SYK is 128436
Number of words in ZP is 125121


## Journals data statistics

The following section contains functions that facilitate journal statistics. Since the journal data are not available within the GitHub repository, these functions are of little use to you unless you wish to analyse your own datasets.

- NOTE: this process also reveals some mistakes in the dataset. For example, Čech no. 26 from 1935 has no date marked in our dataset and [online](https://kramerius5.nkp.cz/view/uuid:334149b0-877c-11e6-8aeb-5ef3fc9ae867) it is marked as issued in 1927. In general, such mistakes are ignored because they are statistically insignificant, but researchers should be aware of these problems (and may repair such mistakes in their datasets).

In [5]:
def analyse_journals():
    """
    This function analyses the available journal files (based on the 'journals_fulldata.joblib' file)

    It prints following data:
    - number of analysed journals
    - number of years that each journal covers
    - number of issues per journal
    - number of pages per journal
    - number of characters per journal
    - number of words per journal (and in total)
    - average characters per page per journal
    - number of files outside of the considered years and number of files without a date

    It also saves these per-journal statistics (plus "standard pages", i.e. characters / 1800) into RESULTS_PATH --> statistics --> journals_stats.csv.

    Every entry of the loaded data is counted as one page; entries with an empty 'issue_date' are skipped and only counted as files without date.
    """
    # NOTE: because we are following only years 1925-1939, we are ignoring those that are outside of this scope... '1937-1938' is added because it fits the profile, too...
    years_to_consider = ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939', '1937-1938']
    # Pages dated with this double-year label are in scope and are counted,
    # but the label itself is not listed among the years of a journal.
    double_year_label = '1937-1938'

    journals_fulldata_dict = joblib.load(JOURNAL_FULLDATA_PATH)

    journals = []
    years_per_yournal = defaultdict(list)
    issues_per_journal = defaultdict(set)  # distinct issue uuids per journal
    pages_per_journal = defaultdict(int)
    characters_per_journal = defaultdict(int)

    words_aggregated = 0
    words_per_journal = defaultdict(int)

    uuids_out_of_the_scope = []
    years_out_of_scope = []
    uuids_without_date = []

    for uuid_file in journals_fulldata_dict:
        uuid_file_data = journals_fulldata_dict[uuid_file]

        journal = uuid_file_data['journal']
        issue_date = uuid_file_data['issue_date']

        if issue_date == '':
            uuids_without_date.append(uuid_file)
            continue

        # The year is whatever follows the last dot of the date string.
        issue_year = issue_date.split('.')[-1]

        if issue_year not in years_to_consider:
            uuids_out_of_the_scope.append(uuid_file)
            years_out_of_scope.append(issue_year)
            continue

        # Previously a `continue` here dropped every '1937-1938' page from all counters;
        # now only the year label is left out and the page is counted as any other.
        if issue_year != double_year_label and issue_year not in years_per_yournal[journal]:
            years_per_yournal[journal].append(issue_year)

        if journal not in journals:
            journals.append(journal)

        issues_per_journal[journal].add(uuid_file_data['issue_uuid'])

        pages_per_journal[journal] += 1

        page_text = uuid_file_data['text']
        characters_per_journal[journal] += len(page_text)

        words_in_page = len(bip.word_tokenize_no_punctuation(page_text))
        words_per_journal[journal] += words_in_page
        words_aggregated += words_in_page

    print('Number of analysed journals:', len(journals))

    print()

    for journal in journals:
        years_per_yournal[journal].sort()
        print(journal)
        print(f'\tYears in journal ({len(years_per_yournal[journal])}):', years_per_yournal[journal])
        print('\tIssues in journal:', len(issues_per_journal[journal]))
        print('\tPages in journal:', pages_per_journal[journal])
        print('\tCharacter in journal:', characters_per_journal[journal])
        print('\tWords in journal:', words_per_journal[journal])
        # A journal is only listed after at least one of its pages was counted, so the divisor is never zero.
        print('\tAverage characters per page in journal:', characters_per_journal[journal]/pages_per_journal[journal])

    print()

    print('Total number of words in analysed journals:', words_aggregated)

    print()

    print('Number of files that do not fit the time range:', len(uuids_out_of_the_scope), set(years_out_of_scope))
    print('Number of files that do not have date in the metadata:', len(uuids_without_date))

    df_dict = {
        'journal': list(journals),
        'years': [len(years_per_yournal[journal]) for journal in journals],
        'issues': [len(issues_per_journal[journal]) for journal in journals],
        'pages': [pages_per_journal[journal] for journal in journals],
        'standard pages': [characters_per_journal[journal]/1800 for journal in journals],
        'words': [words_per_journal[journal] for journal in journals],
        'characters': [characters_per_journal[journal] for journal in journals],
    }

    journals_stats_df = pd.DataFrame(df_dict)
    journals_stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'journals_stats.csv'))

In [92]:
analyse_journals()

Number of analysed journals: 11

Čech
	Years in journal (10): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934']
	Issues in journal: 2368
	Pages in journal: 15678
	Character in journal: 145508697
	Words in journal: 22978292
	Average characters per page in journal: 9281.075200918485
Československý zemědělec
	Years in journal (15): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939']
	Issues in journal: 753
	Pages in journal: 9278
	Character in journal: 91147459
	Words in journal: 14081975
	Average characters per page in journal: 9824.04171157577
Český učitel
	Years in journal (15): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939']
	Issues in journal: 534
	Pages in journal: 11580
	Character in journal: 57198594
	Words in journal: 8837564
	Average characters per page in journal: 4939.429533678756
Moravský hospodář
	Years i

## Runtime statistics
Statistics of runtimes, based on the batches.csv file (the runtimes are always averages per batch, not per single document!)

In [6]:
def analyse_runtimes():
    """
    This function analyses runtimes from the batches.csv file.

    For every journal it builds a histogram of the 'runtime' column rounded to one
    decimal place (rows with runtime >= 15 are ignored), prints the histogram and
    saves a table with one row per journal and one column per runtime bucket
    (0.0, 0.1, ..., 9.9) into RESULTS_PATH --> statistics --> runtimes_stats.csv.
    """
    df = pd.read_csv(os.path.join(ROOT_PATH, 'batches.csv'), quotechar='"', delimiter=',', encoding='utf-8')

    # The two lists are parallel: the n-th full name belongs to the n-th value of the 'journal' column.
    journals_full_names = ['Moravský hospodář', 'Polední list', 'Moravský večerník', 'Věstník katolického duchovenstva', 'Přítomnost', 'Venkov', 'Studentský časopis', 'Československý zemědělec', 'Český učitel', 'Čech', 'Posel záhrobní']
    journals_filenames = ['moravsky_hospodar', 'poledni_list', 'moravsky_vecernik', 'vestnik_katolickeho_duchovenstva', 'pritomnost', 'venkov', 'studentsky_casopis', 'ceskoslovensky_zemedelec', 'cesky_ucitel', 'cech', 'posel_zahrobni']

    results = {}

    for journal, full_name in zip(journals_filenames, journals_full_names):
        journal_runtimes = defaultdict(int)
        for runtime in df.loc[df['journal'] == journal, 'runtime'].tolist():
            if runtime >= 15:
                # ignoring weirdly long runtimes (probably due to computer in sleep mode...)
                continue
            journal_runtimes[round(runtime, 1)] += 1

        print(full_name, journal_runtimes)
        results[full_name] = journal_runtimes

    # NOTE(review): runtimes in [10, 15) pass the filter above and are printed, but they have
    # no column here and therefore never reach the csv -- confirm whether that is intended.
    all_times_full = [round(i, 1) for i in np.arange(0, 10, 0.1)]

    output_dict = {}
    for out_idx, (full_name, journal_runtimes) in enumerate(results.items()):
        journal_data = {'journal': full_name}
        for timestamp in all_times_full:
            # .get() gives 0 for an empty bucket without inserting the key into the histogram.
            journal_data[timestamp] = journal_runtimes.get(timestamp, 0)
        output_dict[out_idx] = journal_data

    output_df = pd.DataFrame.from_dict(output_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'runtimes_stats.csv'), quotechar='"', sep=',', encoding='utf-8')


In [38]:
analyse_runtimes()

Moravský hospodář defaultdict(<class 'int'>, {0.8: 1880, 0.7: 1880, 0.9: 520, 0.6: 1200, 0.5: 160, 1.1: 40, 1.8: 3})
Polední list defaultdict(<class 'int'>, {2.4: 3480, 2.5: 1960, 2.6: 2360, 2.2: 640, 2.3: 1600, 2.7: 360, 2.8: 280, 2.9: 40, 2.1: 240, 1.0: 19})
Moravský večerník defaultdict(<class 'int'>, {1.8: 5480, 2.1: 3600, 2.0: 4480, 1.9: 4280, 1.6: 2160, 1.7: 3000, 2.2: 1960, 2.3: 720, 1.4: 160, 1.5: 1000, 2.4: 673, 1.3: 80, 2.6: 120, 2.7: 40, 2.5: 160})
Věstník katolického duchovenstva defaultdict(<class 'int'>, {1.9: 40, 1.0: 400, 1.3: 120, 1.2: 480, 1.1: 680, 0.9: 200, 1.4: 40, 0.0: 2})
Přítomnost defaultdict(<class 'int'>, {0.9: 40, 1.5: 4080, 1.3: 1800, 1.4: 4396, 1.6: 2600, 1.2: 880, 1.7: 640, 1.8: 320, 2.1: 40, 1.1: 80, 2.8: 40, 1.9: 40, 2.0: 40})
Venkov defaultdict(<class 'int'>, {2.4: 760, 3.6: 920, 3.2: 7480, 3.4: 2880, 2.9: 7760, 2.8: 7240, 3.5: 1440, 3.1: 8960, 3.3: 4480, 3.8: 320, 3.0: 10120, 2.7: 3560, 2.6: 2080, 2.5: 800, 2.2: 400, 2.3: 240, 3.7: 280, 4.3: 120, 3.9:

## Results statistics

The following section contains functions that facilitate results statistics.

### Overall statistics for journals

In [192]:
# Journal names as they are matched against the 'journal' column of the results files.
all_journals = [
    'Moravský hospodář',
    'Polední list',
    'Moravský večerník',
    'Věstník katolického duchovenstva',
    'Přítomnost',
    'Venkov',
    'Studentský časopis',
    'Československý zemědělec',
    'Český učitel',
    'Čech',
    'Posel záhrobní',
]

def ciation_counts_per_journal(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per journal.

    The csv file is read from RESULTS_PATH (first column used as index). The total
    number of rows and the count for every journal in all_journals are printed;
    the counts are returned as a dict {journal: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    print('Total number of citations:', len(results_df))

    per_journal = {}
    for journal_name in all_journals:
        hits = len(results_df[results_df['journal'] == journal_name])
        print(journal_name, hits)
        per_journal[journal_name] = hits

    return per_journal


def ciation_counts_per_journal_not_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per journal, keeping only rows whose 'drop?' value is False.

    The csv file is read from RESULTS_PATH (first column used as index). The number
    of kept rows and the count for every journal in all_journals are printed;
    the counts are returned as a dict {journal: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]

    print('Total number of citations:', len(kept_df))

    per_journal = {}
    for journal_name in all_journals:
        hits = len(kept_df[kept_df['journal'] == journal_name])
        print(journal_name, hits)
        per_journal[journal_name] = hits

    return per_journal


def sure_ciations_counts_per_journal(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file per journal.

    Only rows with 'drop?' equal to False and 'CITATION' equal to True are kept.
    The number of kept rows and the count for every journal in all_journals are
    printed; the counts are returned as a dict {journal: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    not_dropped = results_df['drop?'] == False
    marked_as_citation = results_df['CITATION'] == True
    sure_df = results_df.loc[not_dropped & marked_as_citation]

    print('Total number of citations:', len(sure_df))

    per_journal = {}
    for journal_name in all_journals:
        hits = len(sure_df[sure_df['journal'] == journal_name])
        print(journal_name, hits)
        per_journal[journal_name] = hits

    return per_journal


def get_overall_result_stats():
    """
    Print the per-journal citation counts of the individual filtering stages and
    save them side by side into RESULTS_PATH --> statistics --> journals_results_stats.csv.
    """
    final_results_file = 'MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv'

    print('Filtered citations:')
    initial_counts = ciation_counts_per_journal('FILTERED_UNFILTERED_batch_results.csv')
    print('\nStop subs:')
    stop_subs_counts = ciation_counts_per_journal('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\nMultiple attrs:')
    multiple_attrs_counts = ciation_counts_per_journal_not_drop(final_results_file)
    print('\n"Sure" citations:')
    sure_counts = sure_ciations_counts_per_journal(final_results_file)

    # One row per journal, one column per filtering stage.
    table = {
        'journal': list(all_journals),
        'initial results': [initial_counts[name] for name in all_journals],
        'filtered stop-subverses': [stop_subs_counts[name] for name in all_journals],
        'filtered multiple attributions': [multiple_attrs_counts[name] for name in all_journals],
        '"sure" citations': [sure_counts[name] for name in all_journals],
    }

    target_file = os.path.join(RESULTS_PATH, 'statistics', 'journals_results_stats.csv')
    pd.DataFrame(table).to_csv(target_file)


In [251]:
get_overall_result_stats()

Filtered citations:
Total number of citations: 38039
Moravský hospodář 285
Polední list 2334
Moravský večerník 4406
Věstník katolického duchovenstva 2008
Přítomnost 2205
Venkov 14309
Studentský časopis 506
Československý zemědělec 1154
Český učitel 1110
Čech 8277
Posel záhrobní 1445

Stop subs:
Total number of citations: 12137
Moravský hospodář 4
Polední list 209
Moravský večerník 599
Věstník katolického duchovenstva 1529
Přítomnost 559
Venkov 2212
Studentský časopis 159
Československý zemědělec 206
Český učitel 386
Čech 5237
Posel záhrobní 1037

Multiple attrs:
Total number of citations: 7103
Moravský hospodář 2
Polední list 147
Moravský večerník 381
Věstník katolického duchovenstva 772
Přítomnost 342
Venkov 1528
Studentský časopis 102
Československý zemědělec 127
Český učitel 225
Čech 3022
Posel záhrobní 455

"Sure" citations:
Total number of citations: 1239
Moravský hospodář 0
Polední list 17
Moravský večerník 35
Věstník katolického duchovenstva 219
Přítomnost 53
Venkov 180
Students

### Citations in years

In [193]:
# Years 1925-1939 as strings, in ascending order.
years_to_plot = [str(year) for year in range(1925, 1940)]

def ciation_counts_per_year(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per year.

    The year of a row is the part of its 'date' value after the last dot.
    Returns a defaultdict(int) {year (str): count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    per_year = defaultdict(int)
    for issue_date in results_df['date']:
        per_year[issue_date.rsplit('.', 1)[-1]] += 1

    return per_year


def ciation_counts_per_year_not_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per year, keeping only rows whose 'drop?' value is False.

    The year of a row is the part of its 'date' value after the last dot.
    Returns a defaultdict(int) {year (str): count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]

    per_year = defaultdict(int)
    for issue_date in kept_df['date']:
        per_year[issue_date.rsplit('.', 1)[-1]] += 1

    return per_year


def sure_ciations_counts_per_year(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file per year.

    Only rows with 'drop?' equal to False and 'CITATION' equal to True are kept.
    The year of a row is the part of its 'date' value after the last dot.
    Returns a defaultdict(int) {year (str): count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    not_dropped = results_df['drop?'] == False
    marked_as_citation = results_df['CITATION'] == True
    sure_df = results_df.loc[not_dropped & marked_as_citation]

    per_year = defaultdict(int)
    for issue_date in sure_df['date']:
        per_year[issue_date.rsplit('.', 1)[-1]] += 1

    return per_year


def results_in_years():
    """
    Print the per-year citation counts of the individual filtering stages and save
    the counts for the years in years_to_plot into
    RESULTS_PATH --> statistics --> years_results_stats.csv.
    """
    final_results_file = 'MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv'

    print('Filtered citations:')
    initial_counts = ciation_counts_per_year('FILTERED_UNFILTERED_batch_results.csv')
    print(initial_counts)

    print('\nStop subs:')
    stop_subs_counts = ciation_counts_per_year('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print(stop_subs_counts)

    print('\nMultiple attrs:')
    multiple_attrs_counts = ciation_counts_per_year_not_drop(final_results_file)
    print(multiple_attrs_counts)

    print('\n"Sure" citations:')
    sure_counts = sure_ciations_counts_per_year(final_results_file)
    print(sure_counts)

    # The counters are defaultdicts, so a year without any citation simply yields 0.
    table = {
        'year': list(years_to_plot),
        'initial results': [initial_counts[year] for year in years_to_plot],
        'filtered stop-subverses': [stop_subs_counts[year] for year in years_to_plot],
        'filtered multiple attributions': [multiple_attrs_counts[year] for year in years_to_plot],
        '"sure" citations': [sure_counts[year] for year in years_to_plot],
    }

    target_file = os.path.join(RESULTS_PATH, 'statistics', 'years_results_stats.csv')
    pd.DataFrame(table).to_csv(target_file)

In [250]:
results_in_years()

Filtered citations:
defaultdict(<class 'int'>, {'1927': 3276, '1926': 2687, '1929': 2613, '1928': 2554, '1930': 2899, '1925': 3103, '1934': 2192, '1931': 3127, '1933': 3059, '1932': 2510, '1935': 2027, '1938': 1946, '1937': 2085, '1936': 2067, '1939': 1894})

Stop subs:
defaultdict(<class 'int'>, {'1927': 1336, '1926': 941, '1928': 828, '1930': 893, '1925': 1380, '1934': 827, '1933': 717, '1932': 699, '1929': 874, '1931': 1144, '1935': 577, '1938': 444, '1936': 481, '1939': 479, '1937': 517})

Multiple attrs:
defaultdict(<class 'int'>, {'1927': 758, '1926': 588, '1928': 535, '1930': 520, '1925': 803, '1934': 445, '1933': 450, '1932': 429, '1929': 537, '1931': 608, '1935': 321, '1938': 275, '1936': 273, '1939': 270, '1937': 291})

"Sure" citations:
defaultdict(<class 'int'>, {'1934': 121, '1926': 68, '1932': 76, '1928': 76, '1927': 130, '1933': 85, '1931': 92, '1925': 133, '1929': 81, '1930': 81, '1935': 85, '1938': 45, '1936': 59, '1939': 44, '1937': 63})


### Top citations - full

In [194]:
def ciation_counts_per_verses(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per 'verse_id'.

    Returns a defaultdict(int) {verse_id: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    per_verse = defaultdict(int)
    for verse in results_df['verse_id']:
        per_verse[verse] += 1

    return per_verse

def ciation_counts_per_verses_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per 'verse_id', keeping only rows whose 'drop?' value is False.

    Returns a defaultdict(int) {verse_id: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]

    per_verse = defaultdict(int)
    for verse in kept_df['verse_id']:
        per_verse[verse] += 1

    return per_verse

def ciation_counts_per_verses_sure(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file per 'verse_id'.

    Only rows with 'drop?' equal to False and 'CITATION' equal to True are kept.
    Returns a defaultdict(int) {verse_id: count}.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    not_dropped = results_df['drop?'] == False
    marked_as_citation = results_df['CITATION'] == True
    sure_df = results_df.loc[not_dropped & marked_as_citation]

    per_verse = defaultdict(int)
    for verse in sure_df['verse_id']:
        per_verse[verse] += 1

    return per_verse

def analyse_verses():
    """
    Save the per-verse citation counts of the individual filtering stages into
    RESULTS_PATH --> statistics --> verse_results_stats.csv.

    The table has one row for every verse_id present in the initial (filtered)
    results; a verse that disappeared in a later stage gets 0 in that column.
    Only the stage headings are printed.
    """
    print('Filtered citations:')
    filtered_citations = ciation_counts_per_verses('FILTERED_UNFILTERED_batch_results.csv')
    print('\nStop subs:')
    after_stop_subs = ciation_counts_per_verses('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\nMultiple attrs:')
    multiple_attrs = ciation_counts_per_verses_drop('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\n"Sure" citations:')
    sure_cites = ciation_counts_per_verses_sure('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    out_df_dict = {'verse': [], 'initial results': [], 'filtered stop-subverses': [], 'filtered multiple attributions': [], '"sure" citations': []}

    # The bare `except:` fallbacks were removed: the lookups could never raise, so the
    # handlers were dead code that would only have hidden unrelated errors.
    # .get(..., 0) states the "missing verse counts as 0" rule explicitly.
    for verse_id, initial_count in filtered_citations.items():
        out_df_dict['verse'].append(verse_id)
        out_df_dict['initial results'].append(initial_count)
        out_df_dict['filtered stop-subverses'].append(after_stop_subs.get(verse_id, 0))
        out_df_dict['filtered multiple attributions'].append(multiple_attrs.get(verse_id, 0))
        out_df_dict['"sure" citations'].append(sure_cites.get(verse_id, 0))

    stats_df = pd.DataFrame(out_df_dict)
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'verse_results_stats.csv'))

In [249]:
analyse_verses()

Filtered citations:

Stop subs:

Multiple attrs:

"Sure" citations:


In [195]:
def ciation_counts_per_verses_for_top(results_filename:str, csv_delimiter=';'):
    """
    Count the non-dropped rows of a results file per 'verse_id' and collect the verse texts.

    Only rows whose 'drop?' value is False are used.

    Returns a tuple of:
    - defaultdict(int) {verse_id: count}
    - defaultdict(list) {verse_id: distinct 'verse_text' values in order of first appearance}
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]

    per_verse = defaultdict(int)
    texts_per_verse = defaultdict(list)

    for verse, text in zip(kept_df['verse_id'], kept_df['verse_text']):
        per_verse[verse] += 1
        known_texts = texts_per_verse[verse]
        if text not in known_texts:
            known_texts.append(text)

    return per_verse, texts_per_verse


def analyse_verses_top():
    """
    Here we count only with filtered stop-subs and resolved multiple attributions.

    Every cited verse is written into RESULTS_PATH --> statistics --> top_verse_results_stats.csv
    together with its citation count, its verse texts and a 'filter row' label:
    'top 10', 'top 20', 'top 30' or 'low'.

    The label is decided by the citation count: a verse is 'top 10' when its count
    is among the ten highest counts, 'top 20' when it is among the next ten, and so
    on. Verses with equal counts share a label, so a tier can hold more than ten
    verses.
    """
    citations, verse_texts = ciation_counts_per_verses_for_top('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    citation_counts = sorted(citations.values(), reverse=True)

    # The tiers are built once instead of slicing the sorted list for every verse.
    # Order matters: a count present in two neighbouring slices belongs to the higher tier.
    tiers = (
        ('top 10', set(citation_counts[:10])),
        ('top 20', set(citation_counts[10:20])),
        ('top 30', set(citation_counts[20:30])),
    )

    out_df_dict = {'verse': [], 'citation count': [], 'verse texts': [], 'filter row': []}

    for verse_id, count in citations.items():
        label = next((tier_label for tier_label, tier_counts in tiers if count in tier_counts), 'low')

        out_df_dict['verse'].append(verse_id)
        out_df_dict['citation count'].append(count)
        out_df_dict['verse texts'].append(verse_texts[verse_id])
        out_df_dict['filter row'].append(label)

    stats_df = pd.DataFrame(out_df_dict)
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'top_verse_results_stats.csv'))

In [206]:
analyse_verses_top()

### Top citations - to plot in years

In [196]:
def return_top_x_verse_ids(x:int, results_filename:str):
    """ Return the IDs of the most cited verses in a results file.

    A verse is selected when its citation count occurs among the x highest counts,
    so verses tied with one of those counts are all included and the result can be longer than x.

    :param x: how many of the highest citation counts to consider
    :param results_filename: results file passed on to ciation_counts_per_verses_for_top
    """
    citations, _verse_texts = ciation_counts_per_verses_for_top(results_filename)

    leading_counts = sorted(citations.values(), reverse=True)[:x]

    return [verse_id for verse_id, count in citations.items() if count in leading_counts]

In [197]:
def ciation_counts_per_verses_in_years(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """ Count citations of the selected verses per year, using every row of the results file.

    The year is the part of the 'date' value after its last '.'.

    :param results_filename: name of the results CSV in RESULTS_PATH
    :param verese_ids_to_plot: verse IDs to count; rows with other verse IDs are ignored
    :param csv_delimiter: delimiter of the results CSV
    :return: dictionary {year: {verse_id: number of citations}}
    """
    results_path = os.path.join(RESULTS_PATH, results_filename)
    citations_dataframe = pd.read_csv(results_path, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations = defaultdict(dict)

    for row_id in citations_dataframe.index:
        row = citations_dataframe.loc[row_id]
        year = row['date'].split('.')[-1]
        verse_id = row['verse_id']
        if verse_id not in verese_ids_to_plot:
            continue
        counts_in_year = citations[year]
        counts_in_year[verse_id] = counts_in_year.get(verse_id, 0) + 1

    return citations


def ciation_counts_per_verses_in_years_drop(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """ Count citations of the selected verses per year, ignoring rows marked to be dropped.

    Only rows whose 'drop?' value equals False are counted. The year is the part of the 'date'
    value after its last '.'.

    :param results_filename: name of the results CSV in RESULTS_PATH
    :param verese_ids_to_plot: verse IDs to count; rows with other verse IDs are ignored
    :param csv_delimiter: delimiter of the results CSV
    :return: dictionary {year: {verse_id: number of citations}}
    """
    all_rows = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    not_dropped = all_rows['drop?'] == False
    kept_rows = all_rows.loc[not_dropped]

    citations = defaultdict(dict)

    for row_id in kept_rows.index:
        row = kept_rows.loc[row_id]
        year = row['date'].split('.')[-1]
        verse_id = row['verse_id']
        if verse_id in verese_ids_to_plot:
            counts_in_year = citations[year]
            if verse_id in counts_in_year:
                counts_in_year[verse_id] += 1
            else:
                counts_in_year[verse_id] = 1

    return citations


def ciation_counts_per_verses_in_years_sure(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """ Count citations of the selected verses per year, using only confirmed citations.

    Only rows whose 'drop?' value equals False and whose 'CITATION' value equals True are counted.
    The year is the part of the 'date' value after its last '.'.

    :param results_filename: name of the results CSV in RESULTS_PATH
    :param verese_ids_to_plot: verse IDs to count; rows with other verse IDs are ignored
    :param csv_delimiter: delimiter of the results CSV
    :return: dictionary {year: {verse_id: number of citations}}
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]
    citations_dataframe = citations_dataframe[citations_dataframe['CITATION'] == True]

    citations = defaultdict(dict)

    for row_id in citations_dataframe.index:
        row_dict = citations_dataframe.loc[row_id].to_dict()
        year = row_dict['date'].split('.')[-1]
        verse_id = row_dict['verse_id']
        if verse_id in verese_ids_to_plot:
            counts_in_year = citations[year]
            counts_in_year[verse_id] = counts_in_year.get(verse_id, 0) + 1

    # The result was previously computed but never returned, so callers always received None.
    return citations


def analyse_verse_ids_counts_in_years(top_x:int):
    """ Write a CSV with the yearly citation counts of the most cited verses.

    The verses are selected with return_top_x_verse_ids and counted with
    ciation_counts_per_verses_in_years_drop (rows marked to be dropped are ignored).
    The output has one row per year and one column per selected verse; a verse that is not cited
    in a given year gets 0. It is saved as statistics/verses_in_years_results_stats.csv under RESULTS_PATH.

    :param top_x: how many of the highest citation counts to consider when selecting the verses
    """
    verese_ids_to_plot = return_top_x_verse_ids(x=top_x, results_filename='MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    citations = ciation_counts_per_verses_in_years_drop('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv', verese_ids_to_plot)

    out_df_dict = defaultdict(list)

    for year, counts_in_year in citations.items():
        out_df_dict['year'].append(year)
        for verse_id in verese_ids_to_plot:
            # Explicit default instead of a bare except, which would also hide unrelated errors.
            out_df_dict[verse_id].append(counts_in_year.get(verse_id, 0))

    stats_df = pd.DataFrame(out_df_dict)
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'verses_in_years_results_stats.csv'))

In [207]:
# Yearly citation counts of the verses whose citation count is among the 10 highest.
analyse_verse_ids_counts_in_years(10)

### Subset statistics


In [198]:
""" Defining subsets books"""
old_testament_books = [
    '1Kr', '1Pa', '1S', '2Kr', '2Pa', '2S', 'Abd', 'Abk', 'Ag', 'Am',
    'Da', 'Dt', 'Est', 'Ex', 'Ez', 'Ezd', 'Gn', 'Iz', 'Jb', 'Jl',
    'Jon', 'Joz', 'Jr', 'Kaz', 'Lv', 'Mal', 'Mi', 'Na', 'Neh', 'Nu',
    'Oz', 'Pis', 'Pl', 'Pr', 'Rt', 'Sd', 'Sf', 'Z', 'Za',
]
deuterocanoncal_books = ['1Ma', '2Ma', 'Bar', 'Jud', 'Mou', 'Sir', 'Tob', 'Zuz']
gospels = ['Mk', 'Mt', 'L', 'J']
new_testament_books_no_gospels = [
    '1J', '1K', '1P', '1Te', '1Tm', '2J', '2K', '2P', '2Te', '2Tm',
    '3J', 'Ef', 'Fm', 'Fp', 'Ga', 'Jk', 'Ju', 'Ko', 'R', 'Sk',
    'Tit', 'Zd', 'Zj',
]
# Whole New Testament: the non-gospel books followed by the four gospels.
new_testament_books = new_testament_books_no_gospels + gospels

# Subsets of verses
# Ten Commandments: Ex 20:2-17 followed by Dt 5:6-21.
ten_commandments_verses = [f'Ex 20:{verse}' for verse in range(2, 18)] + [f'Dt 5:{verse}' for verse in range(6, 22)]

In [237]:
def get_book_name(verse_id:str):
    """ Return the book abbreviation of a verse ID, i.e. everything before its first space. """
    book_name, _separator, _reference = verse_id.partition(' ')
    return book_name


def return_top_x_verse_ids_in_subset_books(x:int, results_filename:str, subset_books:list):
    """ Return the IDs of the most cited verses that belong to the given books.

    A verse ID shared by several verses (parts joined with '/') belongs to the subset when at least
    one of its parts comes from a book in subset_books. A verse is selected when its citation count
    occurs among the x highest counts within the subset, so tied verses are all included and the
    result can be longer than x.

    :param x: how many of the highest citation counts to consider
    :param results_filename: results file passed on to ciation_counts_per_verses_for_top
    :param subset_books: book abbreviations that define the subset
    """
    citations, _verse_texts = ciation_counts_per_verses_for_top(results_filename)

    verses_in_book = [
        verse_id
        for verse_id in citations
        if any(get_book_name(part) in subset_books for part in verse_id.split('/'))
    ]

    leading_counts = sorted((citations[verse_id] for verse_id in verses_in_book), reverse=True)[:x]

    return [verse_id for verse_id in verses_in_book if citations[verse_id] in leading_counts]


def ciation_counts_per_verses_in_years_subset_verses(results_filename:str, verse_ids_to_plot:list, csv_delimiter=';', plot_shared=False):
    """ Count citations of the selected verses per year and report which verse IDs were found.

    Only rows whose 'drop?' value equals False are counted. The year is the part of the 'date'
    value after its last '.'. The list of requested verse IDs is printed at the start.

    :param results_filename: name of the results CSV in RESULTS_PATH
    :param verse_ids_to_plot: verse IDs to count
    :param csv_delimiter: delimiter of the results CSV
    :param plot_shared: if true, a shared verse ID (parts joined with '/') is also counted, under
        its full shared ID, when at least one of its parts is in verse_ids_to_plot
    :return: tuple (citations, plotted_verses) - {year: {verse_id: number of citations}} and the
        counted verse IDs in order of first appearance
    """
    print(verse_ids_to_plot)
    results_path = os.path.join(RESULTS_PATH, results_filename)
    all_rows = pd.read_csv(results_path, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_rows = all_rows[all_rows['drop?'] == False]

    citations = defaultdict(dict)
    plotted_verses = []

    for row_id in kept_rows.index:
        row = kept_rows.loc[row_id]
        year = row['date'].split('.')[-1]
        verse_id = row['verse_id']

        selected = verse_id in verse_ids_to_plot
        if not selected and plot_shared and '/' in verse_id:
            selected = any(part in verse_ids_to_plot for part in verse_id.split('/'))
        if not selected:
            continue

        if verse_id not in plotted_verses:
            plotted_verses.append(verse_id)
        counts_in_year = citations[year]
        counts_in_year[verse_id] = counts_in_year.get(verse_id, 0) + 1

    return citations, plotted_verses


def subset_statistics_books(top_x:int, subset_books:list, resulst_prefix_name:str):
    """ Write a CSV with the yearly citation counts of the most cited verses from a subset of books.

    The verses are selected with return_top_x_verse_ids_in_subset_books and counted with
    ciation_counts_per_verses_in_years_subset_verses. The output has one row per year (sorted by
    year) and one column per verse that was found; a verse not cited in a given year gets 0.
    It is saved as statistics/<resulst_prefix_name>_books_subset_results_stats.csv under RESULTS_PATH.

    :param top_x: how many of the highest citation counts to consider when selecting the verses
    :param subset_books: book abbreviations that define the subset
    :param resulst_prefix_name: prefix of the output file name
    """
    verese_ids_to_plot = return_top_x_verse_ids_in_subset_books(x=top_x, results_filename='MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv', subset_books=subset_books)

    citations, plotted_verses = ciation_counts_per_verses_in_years_subset_verses('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv', verese_ids_to_plot)

    out_df_dict = defaultdict(list)

    for year, counts_in_year in citations.items():
        out_df_dict['year'].append(year)
        for verse_id in plotted_verses:
            # Explicit default instead of a bare except, which would also hide unrelated errors.
            out_df_dict[verse_id].append(counts_in_year.get(verse_id, 0))

    stats_df = pd.DataFrame(out_df_dict)
    stats_df = stats_df.sort_values(by='year')
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', f'{resulst_prefix_name}_books_subset_results_stats.csv'))


def subset_statistics_verses(verese_ids_to_plot:list, resulst_prefix_name:str):
    """ Write a CSV with the yearly citation counts of the given verses.

    The verses are counted with ciation_counts_per_verses_in_years_subset_verses with
    plot_shared=True, so shared verse IDs containing one of the given verses are counted too.
    The verse IDs that were found are printed. The output has one row per year (sorted by year)
    and one column per found verse ID; a verse not cited in a given year gets 0.
    It is saved as statistics/<resulst_prefix_name>_verses_subset_results_stats.csv under RESULTS_PATH.

    :param verese_ids_to_plot: verse IDs to count
    :param resulst_prefix_name: prefix of the output file name
    """
    citations, plotted_verses = ciation_counts_per_verses_in_years_subset_verses('MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv', verese_ids_to_plot, plot_shared=True)

    print(plotted_verses)

    out_df_dict = defaultdict(list)

    for year, counts_in_year in citations.items():
        out_df_dict['year'].append(year)
        for verse_id in plotted_verses:
            # Explicit default instead of a bare except, which would also hide unrelated errors.
            out_df_dict[verse_id].append(counts_in_year.get(verse_id, 0))

    stats_df = pd.DataFrame(out_df_dict)
    stats_df = stats_df.sort_values(by='year')
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', f'{resulst_prefix_name}_verses_subset_results_stats.csv'))

In [238]:
# Yearly citation counts of the Ten Commandments verses; the two lists printed below are the requested and the found verse IDs.
subset_statistics_verses(verese_ids_to_plot=ten_commandments_verses, resulst_prefix_name='ten_commandments')

['Ex 20:2', 'Ex 20:3', 'Ex 20:4', 'Ex 20:5', 'Ex 20:6', 'Ex 20:7', 'Ex 20:8', 'Ex 20:9', 'Ex 20:10', 'Ex 20:11', 'Ex 20:12', 'Ex 20:13', 'Ex 20:14', 'Ex 20:15', 'Ex 20:16', 'Ex 20:17', 'Dt 5:6', 'Dt 5:7', 'Dt 5:8', 'Dt 5:9', 'Dt 5:10', 'Dt 5:11', 'Dt 5:12', 'Dt 5:13', 'Dt 5:14', 'Dt 5:15', 'Dt 5:16', 'Dt 5:17', 'Dt 5:18', 'Dt 5:19', 'Dt 5:20', 'Dt 5:21']
['Ex 20:13/Dt 5:17', 'Ex 20:2/Dt 5:6', 'Ex 20:5', 'Ex 20:15/Dt 5:19', 'Ex 20:12/Dt 5:16/Ef 6:2', 'Ex 20:3/Dt 5:7', 'Ex 20:14/Dt 5:18', 'Dt 5:21', 'Ex 20:16/Dt 5:20', 'Dt 5:9', 'Ex 20:8']


In [229]:
# One CSV per subset: the most cited verses (top 10 citation counts) of each group of books,
# followed by the Ten Commandments verses. Each call prints the list of verse IDs it looks for.
subset_statistics_books(top_x=10, subset_books=old_testament_books, resulst_prefix_name='old_testament')
subset_statistics_books(top_x=10, subset_books=new_testament_books, resulst_prefix_name='new_testament')
subset_statistics_books(top_x=10, subset_books=new_testament_books_no_gospels, resulst_prefix_name='new_testament_no_gospels')
subset_statistics_books(top_x=10, subset_books=gospels, resulst_prefix_name='gospels')
subset_statistics_books(top_x=10, subset_books=deuterocanoncal_books, resulst_prefix_name='deuterocanonical')
subset_statistics_verses(verese_ids_to_plot=ten_commandments_verses, resulst_prefix_name='ten_commandments')


['Mt 3:3/J 1:23/Mk 1:3/Iz 40:3', 'Ex 20:13/Dt 5:17', 'Ex 20:15/Dt 5:19', 'Z 127:1', 'Ex 20:12/Dt 5:16/Ef 6:2', 'Gn 3:19', 'Pr 26:27', '2S 14:5', 'Ex 20:14/Dt 5:18', 'Iz 45:8', 'Pr 31:10', 'Ez 24:19']
['Mt 5:38', 'L 3:4', 'Mt 3:3/J 1:23/Mk 1:3/Iz 40:3', 'L 2:14', 'Mt 11:28', 'Mt 6:11/L 11:3', 'Ex 20:12/Dt 5:16/Ef 6:2', 'Mt 16:18', 'L 20:25', 'Mt 20:16']
['1J 5:6', 'R 13:11', 'Sk 5:29', '2P 1:17', 'Ex 20:12/Dt 5:16/Ef 6:2', '2K 1:2/Fp 1:2/2Te 1:2/1K 1:3/Ef 1:2/Ga 1:3', '1K 15:20', '2Tm 4:7', 'Zd 5:9', 'Sk 2:17', '2K 2:6', '2K 7:10']
['Mt 5:38', 'L 3:4', 'Mt 3:3/J 1:23/Mk 1:3/Iz 40:3', 'L 2:14', 'Mt 11:28', 'Mt 6:11/L 11:3', 'Mt 27:25', 'Mt 16:18', 'L 18:42', 'L 20:25', 'Mt 20:16']
['2Ma 14:25', 'Tob 5:11', 'Tob 8:10', 'Sir 51:24', 'Sir 10:27', 'Tob 2:13', 'Mou 16:24', '2Ma 14:26', 'Sir 36:12', 'Jud 13:24', 'Jud 13:15', 'Sir 13:8', '2Ma 2:6', 'Sir 19:22', 'Zuz 1:46', 'Sir 35:21', '2Ma 4:18', 'Sir 33:15', 'Mou 19:4']
['Ex 20:2', 'Ex 20:3', 'Ex 20:4', 'Ex 20:5', 'Ex 20:6', 'Ex 20:7', 'Ex 20

### Map of meanings
The following functions create the data for a network chart that shows how individual journals are connected to one another through citations of the same verses.

In [16]:
def get_verse_group(verse_id:str):
    """ Return the name of the group of Biblical texts to which the verse belongs.

    A verse whose ID (or the first part of a shared ID joined with '/') is one of the
    ten_commandments_verses is put into the group 'Desatero'. Otherwise the group is decided by
    the book of the verse ID; 'neklasifikováno' is returned for a book that is in no book list.
    """
    first_shared_id = verse_id.split('/')[0]
    if first_shared_id in ten_commandments_verses or verse_id in ten_commandments_verses:
        return 'Desatero'

    book_id = get_book_name(verse_id)
    groups_by_books = (
        (old_testament_books, 'Starý zákon'),
        (gospels, 'Evangelia'),
        (new_testament_books_no_gospels, 'Nový zákon'),
        (deuterocanoncal_books, 'Deuterokanonické knihy'),
    )
    for books, group_name in groups_by_books:
        if book_id in books:
            return group_name

    return 'neklasifikováno'
    

def network_graph_creation(results_filename='MUTUAL_DROP_SURE_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv', value_treshold=3, csv_delimiter=';', out_filename='map_of_meanings.csv'):
    """ Create the data files for the network graph (the map of meanings).

    Two CSV files are written into the statistics directory under RESULTS_PATH:
    out_filename with the links (one row per journal-verse pair with the columns Source, Target
    and Value = number of citations) and <out_filename without extension>_points.csv with the
    metadata of the points (size, Group, Verse text). Every journal from all_journals gets a point,
    verses only if at least one of their links is kept.

    :param results_filename: name of the results CSV in RESULTS_PATH; only rows whose 'drop?' value equals False are used
    :param value_treshold: how many citations a journal-verse link must have to be included in the dataset
    :param csv_delimiter: delimiter of the results CSV
    :param out_filename: file name of the links file
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]

    # Journals are registered first, so a verse ID equal to a journal name never creates a verse point.
    points_dict = {journal: {'size': 5, 'Group': journal, 'Verse text': journal} for journal in all_journals}

    # Count the citations of every journal-verse pair in a single pass.
    values_dict = {}
    for row_id in citations_dataframe.index:
        row_dict = citations_dataframe.loc[row_id].to_dict()
        verse_id = row_dict['verse_id']
        journal = row_dict['journal']

        if verse_id not in points_dict:
            # The text of the first citation found is used as the text of the verse point.
            points_dict[verse_id] = {'size': 1, 'Group': get_verse_group(verse_id), 'Verse text': row_dict['verse_text']}

        row_name = f'{journal} {verse_id}'
        if row_name in values_dict:
            values_dict[row_name]['Value'] += 1
        else:
            values_dict[row_name] = {'Source': journal, 'Target': verse_id, 'Value': 1}

    # Keep only the links that reach the threshold, together with the verse points they lead to.
    out_df_dict = {}
    out_points_dict = {}
    for link in values_dict.values():
        if link['Value'] >= value_treshold:
            out_df_dict[len(out_df_dict)] = link
            if link['Target'] not in out_points_dict:
                out_points_dict[link['Target']] = points_dict[link['Target']]

    for journal in all_journals:
        out_points_dict[journal] = {'size': 5, 'Group': f'Periodikum: {journal}', 'Verse text': journal}

    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')

    output_df = pd.DataFrame.from_dict(out_df_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(statistics_dir, out_filename), quotechar='"', sep=',', encoding='utf-8')

    output_df_points = pd.DataFrame.from_dict(out_points_dict)
    output_df_points = output_df_points.transpose()

    # splitext instead of [:-4], so the name is also correct when out_filename has no '.csv' suffix
    points_filename = f'{os.path.splitext(out_filename)[0]}_points.csv'
    output_df_points.to_csv(os.path.join(statistics_dir, points_filename), quotechar='"', sep=',', encoding='utf-8')

In [176]:
# Default settings: links with at least 3 citations, written to map_of_meanings.csv and map_of_meanings_points.csv.
network_graph_creation()

#### Other versions of the map of meanings
##### 1) show only those points that connect two or more journals

In [17]:
def select_verses_shared_by_journals(map_of_meanings_data='map_of_meanings.csv', points_data='map_of_meanings_points.csv', csv_delimiter=','):
    """ Prepare the map of meanings that shows only the shared citations.

    A verse is shared when it is the Target of at least two rows of the links file. Links to other
    verses are left out and the points of those verses are removed from the points dataset.
    The results are saved into the statistics directory under RESULTS_PATH as
    shared_<map_of_meanings_data> and shared_<map_of_meanings_data without extension>_points.csv.

    :param map_of_meanings_data: file name of the links CSV (columns Source, Target, Value)
    :param points_data: file name of the points CSV (indexed by verse ID / journal name)
    :param csv_delimiter: delimiter of both input files
    :raises KeyError: if a verse that is not shared has no row in the points dataset
    """
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')

    map_of_meanings_dataset = pd.read_csv(os.path.join(statistics_dir, map_of_meanings_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    points_dataset = pd.read_csv(os.path.join(statistics_dir, points_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    verse_counts = map_of_meanings_dataset.Target.value_counts()

    out_df_dict = {}
    unshared_verses = []

    for row_id in map_of_meanings_dataset.index:
        row_data = map_of_meanings_dataset.loc[row_id].to_dict()
        verse_id = row_data['Target']
        if verse_counts[verse_id] >= 2:
            # The kept links keep their original row IDs.
            out_df_dict[row_id] = row_data
        else:
            unshared_verses.append(verse_id)

    # Remove all unshared verse points at once instead of copying the dataframe for every row.
    points_dataset = points_dataset.drop(unshared_verses)

    output_df = pd.DataFrame.from_dict(out_df_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(statistics_dir, f'shared_{map_of_meanings_data}'), quotechar='"', sep=',', encoding='utf-8')

    # splitext instead of [:-4], so the name is also correct when the input name has no '.csv' suffix
    points_filename = f'shared_{os.path.splitext(map_of_meanings_data)[0]}_points.csv'
    points_dataset.to_csv(os.path.join(statistics_dir, points_filename), quotechar='"', sep=',', encoding='utf-8')


In [177]:
# Build the map with every link (threshold 1), then keep only the verses cited by two or more journals.
network_graph_creation(out_filename='map_of_meanings_treshold_1.csv', value_treshold=1)
# The parameter is called map_of_meanings_data; the former keyword map_of_meanings_input raised a TypeError.
select_verses_shared_by_journals(map_of_meanings_data='map_of_meanings_treshold_1.csv', points_data='map_of_meanings_treshold_1_points.csv')

##### 2) filtering based on how many times a verse is cited: 3-5x, 6-10x, 11-15x, 16-20x, 21+

In [40]:
def map_of_meanings_by_citation_count(map_of_meanings_data='map_of_meanings_treshold_1.csv', points_data='map_of_meanings_treshold_1_points.csv', csv_delimiter=','):
    """ Prepare points for a map of meanings that can be filtered by how often a verse is cited.

    For every verse point (size 1) the Value of all links leading to the verse is summed up and the
    Group of the point is replaced by the citation-count group '3-5', '6-10', '11-15', '16-20' or
    '21+'. Verses with at most 2 citations are left out; all other points are kept unchanged.
    The result is saved as statistics/by_value_map_of_meanings_points.csv under RESULTS_PATH.

    :param map_of_meanings_data: file name of the links CSV (columns Source, Target, Value)
    :param points_data: file name of the points CSV (indexed by verse ID / journal name)
    :param csv_delimiter: delimiter of both input files
    """
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')

    map_of_meanings_dataset = pd.read_csv(os.path.join(statistics_dir, map_of_meanings_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    points_dataset = pd.read_csv(os.path.join(statistics_dir, points_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    # (highest citation count of the group, group label); counts above the last bound are '21+'
    group_bounds = ((5, '3-5'), (10, '6-10'), (15, '11-15'), (20, '16-20'))

    out_points_dict = {}

    for row_id in points_dataset.index:
        row_data = points_dataset.loc[row_id].to_dict()
        # NOTE: verse points have size 1 in the points dataset, and then row_id = verse_id
        if row_data['size'] == 1:
            links_to_verse = map_of_meanings_dataset.loc[map_of_meanings_dataset['Target'] == row_id]
            verse_id_citation_count = sum(links_to_verse['Value'].tolist())

            if verse_id_citation_count <= 2:
                continue

            row_data['Group'] = next((label for bound, label in group_bounds if verse_id_citation_count <= bound), '21+')

        out_points_dict[row_id] = row_data

    output_df = pd.DataFrame.from_dict(out_points_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(statistics_dir, 'by_value_map_of_meanings_points.csv'), quotechar='"', sep=',', encoding='utf-8')

In [41]:
# Uses the default inputs map_of_meanings_treshold_1.csv and map_of_meanings_treshold_1_points.csv.
map_of_meanings_by_citation_count()

##### 3) connect journals based on how many of the same verses they cite

In [43]:
def map_of_meanings_just_journals(map_of_meanings_data='map_of_meanings.csv', points_data='map_of_meanings_points.csv', csv_delimiter=','):
    """ Prepare a map of meanings that connects the journals directly, based on the verses they both cite.

    For every verse point (size 1) each pair of journals linked to the verse gets its 'Shared'
    count increased by one. Every pair of journals is stored only once, in the direction in which
    it was first found. The size of a journal point is the number of its links in the links file.
    The 'Value' of a link is Shared / (size of source + size of target - Shared).
    The results are saved into the statistics directory under RESULTS_PATH as
    journals_map_of_meanings.csv and journals_map_of_meanings_points.csv.

    :param map_of_meanings_data: file name of the links CSV (columns Source, Target, Value)
    :param points_data: file name of the points CSV (indexed by verse ID / journal name)
    :param csv_delimiter: delimiter of both input files
    """
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')

    map_of_meanings_dataset = pd.read_csv(os.path.join(statistics_dir, map_of_meanings_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    points_dataset = pd.read_csv(os.path.join(statistics_dir, points_data), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    out_points_dict = {}
    out_links_dict = {}

    for row_id in points_dataset.index:
        row_data = points_dataset.loc[row_id].to_dict()
        # NOTE: verse points have size 1 in the points dataset
        if row_data['size'] == 1:
            # NOTE: row_id = verse_id
            links_to_verse = map_of_meanings_dataset.loc[map_of_meanings_dataset['Target'] == row_id]
            journals_for_this_verse = links_to_verse['Source'].tolist()

            # Visit every unordered pair of journals once. The former check against an always
            # empty list never skipped anything, so each pair was stored twice (A-B and B-A).
            for position, journal_from in enumerate(journals_for_this_verse):
                for journal_to in journals_for_this_verse[position + 1:]:
                    if journal_from == journal_to:
                        continue
                    link_id = f'{journal_from}-{journal_to}'
                    if link_id not in out_links_dict:
                        reverse_link_id = f'{journal_to}-{journal_from}'
                        if reverse_link_id in out_links_dict:
                            # The pair was first found in the opposite order for another verse.
                            link_id = reverse_link_id
                        else:
                            out_links_dict[link_id] = {'Source': journal_from, 'Target': journal_to, 'Shared': 0}
                    out_links_dict[link_id]['Shared'] += 1

        else:
            # NOTE: if the size is not 1 then row_id = journal name
            # The size of a journal point is the number of its links (each link counts once, regardless of its Value).
            links_from_journal = map_of_meanings_dataset.loc[map_of_meanings_dataset['Source'] == row_id]
            row_data['size'] = len(links_from_journal)
            out_points_dict[row_id] = row_data

    for link in out_links_dict.values():
        shared = link['Shared']
        source_full = out_points_dict[link['Source']]['size']
        target_full = out_points_dict[link['Target']]['size']

        # Shared links divided by all links of both journals, with the shared ones counted once.
        link['Value'] = shared / (source_full + target_full - shared)

    output_df = pd.DataFrame.from_dict(out_links_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(statistics_dir, 'journals_map_of_meanings.csv'), quotechar='"', sep=',', encoding='utf-8')

    output_points_df = pd.DataFrame.from_dict(out_points_dict)
    output_points_df = output_points_df.transpose()

    output_points_df.to_csv(os.path.join(statistics_dir, 'journals_map_of_meanings_points.csv'), quotechar='"', sep=',', encoding='utf-8')

In [44]:
# Connect the journals using the links and points files created with threshold 1.
map_of_meanings_just_journals(map_of_meanings_data='map_of_meanings_treshold_1.csv', points_data='map_of_meanings_treshold_1_points.csv')

### Change verse IDs from Czech format to SBL markings.
Because we are working with Czech translations of the Bible, we use the Czech abbreviations of the books, and therefore also of the verses. With the following functions, you can convert the verse IDs in any result CSV file into SBL-like designations.

In [23]:
""" Define concordances """

# Czech book abbreviation (as used in the verse IDs) -> SBL-like book abbreviation
cze_to_sbl_book_ids = {
    'Gn': 'Gen', 'Ex': 'Exod', 'Lv': 'Lev', 'Nu': 'Num', 'Dt': 'Deut',
    'Joz': 'Josh', 'Sd': 'Judg', 'Rt': 'Ruth', '1S': '1Sam', '2S': '2Sam',
    '1Kr': '1Kgs', '2Kr': '2Kgs', '1Pa': '1Chr', '2Pa': '2Chr', 'Ezd': 'Ezra',
    'Neh': 'Neh', 'Est': 'Esth', 'Jb': 'Job', 'Z': 'Ps', 'Pr': 'Prov',
    'Kaz': 'Eccl', 'Pis': 'Song', 'Iz': 'Isa', 'Jr': 'Jer', 'Pl': 'Lam',
    'Ez': 'Ezek', 'Da': 'Dan', 'Oz': 'Hos', 'Jl': 'Joel', 'Am': 'Amos',
    'Abd': 'Obad', 'Jon': 'Jonah', 'Mi': 'Mic', 'Na': 'Nah', 'Abk': 'Hab',
    'Sf': 'Zeph', 'Ag': 'Hag', 'Za': 'Zech', 'Mal': 'Mal',
    'Tob': 'Tob', 'Jud': 'Jdt', 'Mou': 'Wis', 'Sir': 'Sir', 'Bar': 'Bar',
    '1Ma': '1Macc', '2Ma': '2Macc',
    'Mt': 'Matt', 'Mk': 'Mark', 'L': 'Luke', 'J': 'John', 'Sk': 'Acts',
    'R': 'Rom', '1K': '1Cor', '2K': '2Cor', 'Ga': 'Gal', 'Ef': 'Eph',
    'Fp': 'Phil', 'Ko': 'Col', '1Te': '1Thess', '2Te': '2Thess', '1Tm': '1Tim',
    '2Tm': '2Tim', 'Tit': 'Titus', 'Fm': 'Phlm', 'Zd': 'Heb', 'Jk': 'Jas',
    '1P': '1Pet', '2P': '2Pet', '1J': '1John', '2J': '2John', '3J': '3John',
    'Ju': 'Jude', 'Zj': 'Rev',
    'Zuz': 'Sus',
}

In [20]:
def change_verse_ids_in_csv_columns(path_to_csv_file:str, csv_delimiter=',', verse_column_name='verse_id'):
    """ Change the verse IDs in a CSV file from the Czech format to the SBL-like one.

    The verse IDs are taken either from one column or from the index of the dataframe. Only values
    containing ':' are treated as verse IDs; shared IDs (parts joined with '/') are converted part
    by part. If anything has changed, the result is saved next to the input file with the prefix
    SBL_; otherwise no file is written. In both cases a message is printed.

    :param path_to_csv_file: path to the input CSV file; its first column is read as the index
    :param csv_delimiter: delimiter of the input file (the output is always written with ',')
    :param verse_column_name: name of the column with the verse IDs; if the verse IDs are the index of the dataframe, set 0
    :raises KeyError: if the book abbreviation of a verse ID is missing in cze_to_sbl_book_ids
    """
    def to_sbl(verse_ids):
        """ Convert one (possibly shared) verse ID to the SBL-like format. """
        converted = []
        for verse_id in verse_ids.split('/'):
            book_name = verse_id.split(' ')[0]
            converted.append(verse_id.replace(book_name, cze_to_sbl_book_ids[book_name]))
        return '/'.join(converted)

    input_df = pd.read_csv(path_to_csv_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)
    original_df = input_df.copy()

    if verse_column_name == 0:
        # Collect all new index labels first and rename once, instead of copying the dataframe for every row.
        new_index_labels = {row_id: to_sbl(row_id) for row_id in input_df.index if ':' in row_id}
        input_df = input_df.rename(index=new_index_labels)
    else:
        # The column values are materialised before the loop, because the column is modified inside it.
        for row_id, verse_ids in zip(input_df.index, input_df[verse_column_name].tolist()):
            if ':' in verse_ids:
                input_df.at[row_id, verse_column_name] = to_sbl(verse_ids)

    # Check if there have been any changes and if so, save the new df
    original_filename = os.path.basename(path_to_csv_file)
    if original_df.equals(input_df):
        print(original_filename, 'has not changed')
        return

    dir_csv = os.path.dirname(path_to_csv_file)
    input_df.to_csv(os.path.join(dir_csv, f'SBL_{original_filename}'), quotechar='"', sep=',', encoding='utf-8')
    print(f'SBL_{original_filename}', 'has been created')


def change_verse_ids_in_csv_header(path_to_csv_file:str, csv_delimiter=','):
    """ Change verse IDs used as column headers of a CSV file from the Czech format to the SBL-like one.

    Only column names containing ':' are treated as verse IDs; shared IDs (parts joined with '/')
    are converted part by part. If anything has changed, the result is saved next to the input
    file with the prefix SBL_; otherwise no file is written. In both cases a message is printed.

    :param path_to_csv_file: path to the input CSV file; its first column is read as the index
    :param csv_delimiter: delimiter of the input file (the output is always written with ',')
    """
    input_df = pd.read_csv(path_to_csv_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)
    original_df = input_df.copy()

    for column in list(input_df.columns):
        # a column name without ":" is not a verse ID but some other data
        if ':' not in column:
            continue

        sbl_parts = []
        for verse_id in column.split('/'):
            book_name = verse_id.split(' ')[0]
            sbl_parts.append(verse_id.replace(book_name, cze_to_sbl_book_ids[book_name]))

        input_df = input_df.rename({column: '/'.join(sbl_parts)}, axis=1)

    # Save the new df only if something has changed
    original_filename = os.path.basename(path_to_csv_file)
    if original_df.equals(input_df):
        print(original_filename, 'has not changed')
        return

    sbl_path = os.path.join(os.path.dirname(path_to_csv_file), f'SBL_{original_filename}')
    input_df.to_csv(sbl_path, quotechar='"', sep=',', encoding='utf-8')
    print(f'SBL_{original_filename}', 'has been created')


def change_all_statistics_to_SBL(input_directory=os.path.join(RESULTS_PATH, 'statistics')):
    """ This function changes all statistical results created with the functions in this notebook from the Czech format of verse IDs to the SBL-like format.

    Every CSV file in ``input_directory`` is dispatched to one of the two converters:

    - files with "SBL" in their name are skipped (they are treated as already converted),
    - files with "points" in their name go to ``change_verse_ids_in_csv_columns`` with
      ``verse_column_name=0``,
    - files that have a column called 'verse', 'verse_id' or 'Target' go to
      ``change_verse_ids_in_csv_columns`` with the first such column,
    - all remaining files go to ``change_verse_ids_in_csv_header``.

    Directory entries that are not regular ``.csv`` files are ignored.

    :param input_directory: directory with the statistics CSV files.
    """
    verse_id_column_names = ('verse', 'verse_id', 'Target')

    for stats_csv in os.listdir(input_directory):
        csv_path = os.path.join(input_directory, stats_csv)

        if 'SBL' in stats_csv:
            continue
        # Sub-directories and stray non-CSV files would only make pandas fail.
        if not stats_csv.lower().endswith('.csv') or not os.path.isfile(csv_path):
            continue

        if 'points' in stats_csv:
            change_verse_ids_in_csv_columns(csv_path, verse_column_name=0)
            continue

        # Only the header is needed to choose the converter, so no data rows are parsed here.
        header_df = pd.read_csv(csv_path, quotechar='"', delimiter=',', encoding='utf-8', index_col=0, nrows=0)
        verse_column = next((column for column in header_df.columns if column in verse_id_column_names), None)

        if verse_column is not None:
            change_verse_ids_in_csv_columns(path_to_csv_file=csv_path, verse_column_name=verse_column)
        else:
            change_verse_ids_in_csv_header(path_to_csv_file=csv_path)


In [239]:
# Convert every statistics CSV in the default statistics directory to SBL-like verse IDs;
# one status line is printed per processed file.
change_all_statistics_to_SBL()

bible_stats.csv has not changed
SBL_by_value_map_of_meanings_points.csv has been created
SBL_deuterocanonical_books_subset_results_stats.csv has been created
SBL_gospels_books_subset_results_stats.csv has been created
journals_map_of_meanings.csv has not changed
journals_map_of_meanings_points.csv has not changed
journals_results_stats.csv has not changed
journals_stats.csv has not changed
SBL_map_of_meanings.csv has been created
SBL_map_of_meanings_points.csv has been created
SBL_map_of_meanings_treshold_1.csv has been created
SBL_map_of_meanings_treshold_1_points.csv has been created
SBL_new_testament_books_subset_results_stats.csv has been created
SBL_new_testament_no_gospels_books_subset_results_stats.csv has been created
SBL_old_testament_books_subset_results_stats.csv has been created
runtimes_stats.csv has not changed
SBL_shared_map_of_meanings_treshold_1.csv has been created
SBL_shared_map_of_meanings_treshold_1_points.csv has been created
SBL_ten_commandments_verses_subset_res

# FULL results stats creation

In [248]:
# Regenerate the complete set of result statistics in one run.
# NOTE(review): the helpers called here are defined elsewhere in this notebook; the grouping
# comments below are inferred from their names and arguments - confirm against the definitions.

# Overall, per-year and per-verse statistics.
get_overall_result_stats()
results_in_years()
analyse_verses()
analyse_verses_top()
analyse_verse_ids_counts_in_years(10)

# Statistics for subsets of the Bible: groups of books (top_x=10 each) and the Ten Commandments verses.
subset_statistics_books(top_x=10, subset_books=old_testament_books, resulst_prefix_name='old_testament')
subset_statistics_books(top_x=10, subset_books=new_testament_books, resulst_prefix_name='new_testament')
subset_statistics_books(top_x=10, subset_books=new_testament_books_no_gospels, resulst_prefix_name='new_testament_no_gospels')
subset_statistics_books(top_x=10, subset_books=gospels, resulst_prefix_name='gospels')
subset_statistics_books(top_x=10, subset_books=deuterocanoncal_books, resulst_prefix_name='deuterocanonical')
subset_statistics_verses(verese_ids_to_plot=ten_commandments_verses, resulst_prefix_name='ten_commandments')

# "Map of meanings" data. The calls that take 'map_of_meanings_treshold_1.csv' as input
# presumably read the file produced by the second network_graph_creation() call, so the order matters.
network_graph_creation()
network_graph_creation(out_filename='map_of_meanings_treshold_1.csv', value_treshold=1)
select_verses_shared_by_journals(map_of_meanings_data='map_of_meanings_treshold_1.csv', points_data='map_of_meanings_treshold_1_points.csv')
map_of_meanings_by_citation_count()
map_of_meanings_just_journals(map_of_meanings_data='map_of_meanings_treshold_1.csv', points_data='map_of_meanings_treshold_1_points.csv')

# Convert the verse IDs in the statistics directory to the SBL-like format. Kept last so that it
# runs after the statistics above have been (presumably) written to that directory.
change_all_statistics_to_SBL()

Filtered citations:
Total number of citations: 38039
Moravský hospodář 285
Polední list 2334
Moravský večerník 4406
Věstník katolického duchovenstva 2008
Přítomnost 2205
Venkov 14309
Studentský časopis 506
Československý zemědělec 1154
Český učitel 1110
Čech 8277
Posel záhrobní 1445

Stop subs:
Total number of citations: 12137
Moravský hospodář 4
Polední list 209
Moravský večerník 599
Věstník katolického duchovenstva 1529
Přítomnost 559
Venkov 2212
Studentský časopis 159
Československý zemědělec 206
Český učitel 386
Čech 5237
Posel záhrobní 1037

Multiple attrs:
Total number of citations: 7103
Moravský hospodář 2
Polední list 147
Moravský večerník 381
Věstník katolického duchovenstva 772
Přítomnost 342
Venkov 1528
Studentský časopis 102
Československý zemědělec 127
Český učitel 225
Čech 3022
Posel záhrobní 455

"Sure" citations:
Total number of citations: 1239
Moravský hospodář 0
Polední list 17
Moravský večerník 35
Věstník katolického duchovenstva 219
Přítomnost 53
Venkov 180
Students

### Transfer data for chart.js

In [119]:
def _js_single_quoted(value):
    """ Return ``value`` as a single-quoted JavaScript string literal (backslashes, single quotes and line breaks are escaped). """
    escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
    escaped = escaped.replace('\r', '\\r').replace('\n', '\\n')
    return f"'{escaped}'"


def transfer_flourish_to_chartjs(input_chart_csv:str, color_palette_rgb:list, input_metadata_csv='SBL_map_of_meanings_treshold_1_points.csv'):
    """ This function transfers input statistics into a form suitable for javascript chart plotting with chart.js

    Both CSV files are read from the 'statistics' directory inside RESULTS_PATH, with their first
    column used as the index. The chart CSV must contain a 'year' column; every other column
    becomes one printed object holding its values ordered by year. The result is printed to
    standard output as a JavaScript array literal; nothing is returned.

    :param input_chart_csv: file name of the CSV with the 'year' column and one column per verse ID.
    :param color_palette_rgb: sequence of colours, each a sequence of colour channels. Colours are
        assigned to the rows in order and reused from the beginning when there are more rows than colours.
    :param input_metadata_csv: file name of the CSV that is indexed by verse ID and has a 'Verse text' column.
    :raises KeyError: if the 'year' column is missing or a verse ID is not found in the metadata CSV.
    :raises IndexError: if there are rows to print but ``color_palette_rgb`` is empty.
    """
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')
    input_df = pd.read_csv(os.path.join(statistics_dir, input_chart_csv), quotechar='"', delimiter=',', encoding='utf-8', index_col=0)
    metadata_df = pd.read_csv(os.path.join(statistics_dir, input_metadata_csv), quotechar='"', delimiter=',', encoding='utf-8', index_col=0)

    # The years become the column labels *before* transposing. Taking them from the first row of the
    # transposed frame would require 'year' to be the first column and would turn the years into
    # floats whenever another column holds floats - and then no column would pass the year check below.
    input_df = input_df.sort_values(by='year').set_index('year').transpose()

    row_ids = list(input_df.index)
    if not row_ids:
        # Nothing to plot: print an empty (but still valid) array.
        print('[];')
        return
    if len(color_palette_rgb) == 0:
        raise IndexError('color_palette_rgb must contain at least one colour')

    print('[{')
    for i, row_id in enumerate(row_ids):
        row_dict = input_df.loc[row_id].to_dict()
        # Only integer column labels are years; anything else is skipped.
        citations_in_years = [
            count for year, count in row_dict.items()
            if isinstance(year, (int, np.integer)) and not isinstance(year, bool)
        ]

        # Reuse the palette from its beginning instead of failing when the rows outnumber the colours.
        colour = color_palette_rgb[i % len(color_palette_rgb)]
        verse_color = ', '.join(str(channel) for channel in colour)

        verse_text = metadata_df.loc[row_id]['Verse text']

        print(f"\t\tverseId: {_js_single_quoted(row_id)},")
        print(f"\t\tcitations: {citations_in_years},")
        print(f"\t\tbackgroundColor: 'rgb({verse_color}, 0.6)',")
        print(f"\t\tborderColor: 'rgb({verse_color}, 0.9)',")
        print(f"\t\thoverBackgroundColor: 'rgb({verse_color}, 1)',")
        print(f"\t\tverseText: {_js_single_quoted(verse_text)}")
        if i + 1 == len(row_ids):
            print('\t}')
        else:
            print('\t},{')

    print('];')

##### colour palettes
const rgb_colours_blues = [(204, 232, 238), (102, 185, 203), (0, 139, 168), (0, 83, 101), (0, 28, 34)]
const rgb_colours_green = [(179, 229, 212), (77, 194, 156), (0, 168, 113), (0, 118, 79), (0, 50, 34)]
const rgb_colours_orange = [(247, 222, 191), (235, 177, 107), (227, 144, 43), (159, 101, 30), (68, 43, 13)]
const rgb_colours_red = [(247, 194, 191), (235, 113, 107), (227, 52, 43), (159, 36, 30), (68, 16, 13)]

const rgb_extra_colour_pt1 = [(33, 58, 63), (73, 99, 104), (0, 137, 112), (127, 209, 174), (184, 206, 170), (204, 232, 238), (233, 223, 247)]
const rgb_extra_colour_pt2 = [(178, 168, 191), (131, 79, 98), (251, 179, 205), (252, 221, 203), (191, 90, 0), (131, 40, 0), (173, 137, 115)]
const rgb_extra_colour_pt3 = [(250, 188, 42), (255, 202, 177), (243, 141, 104), (238, 108, 77), (247, 111, 142), (242, 186, 201), (127, 216, 190), (161, 252, 223), (59, 82, 73), (81, 152, 114)]

In [131]:
# Four shades per colour family, written from light to dark. The five-shade variant of each
# family additionally ends with a darkest shade ([0, 28, 34], [0, 50, 34], [68, 43, 13],
# [68, 16, 13]); that fifth shade is not used here.
rgb_colours_blues = [[204, 232, 238], [102, 185, 203], [0, 139, 168], [0, 83, 101]]
rgb_colours_green = [[179, 229, 212], [77, 194, 156], [0, 168, 113], [0, 118, 79]]
rgb_colours_orange = [[247, 222, 191], [235, 177, 107], [227, 144, 43], [159, 101, 30]]
rgb_colours_red = [[247, 194, 191], [235, 113, 107], [227, 52, 43], [159, 36, 30]]

# Flip every family in place so that it runs from dark to light.
for colour_family in (rgb_colours_blues, rgb_colours_green, rgb_colours_orange, rgb_colours_red):
    colour_family.reverse()

rgb_extra_colour_pt1 = [[33, 58, 63], [73, 99, 104], [0, 137, 112], [127, 209, 174], [184, 206, 170], [204, 232, 238], [233, 223, 247]]
rgb_extra_colour_pt2 = [[178, 168, 191], [131, 79, 98], [251, 179, 205], [252, 221, 203], [191, 90, 0], [131, 40, 0], [173, 137, 115]]
rgb_extra_colour_pt3 = [[250, 188, 42], [255, 202, 177], [243, 141, 104], [238, 108, 77], [247, 111, 142], [242, 186, 201], [127, 216, 190], [161, 252, 223], [59, 82, 73], [81, 152, 114]]

# Palette 1: blue, green, orange, red of the same shade position, then the next shade, and so on.
color_palette_1 = [
    colour
    for shade_group in zip(rgb_colours_blues, rgb_colours_green, rgb_colours_orange, rgb_colours_red)
    for colour in shade_group
]

# Palettes 2, 3 and "large" interleave the three extra lists (pt1, pt2, pt3, pt1, ...):
# palette 2 takes positions 0-2, palette 3 positions 3-6 and the large one positions 0-6.
color_palette_2 = [
    colour
    for colour_group in zip(rgb_extra_colour_pt1[:3], rgb_extra_colour_pt2[:3], rgb_extra_colour_pt3[:3])
    for colour in colour_group
]

color_palette_3 = [
    colour
    for colour_group in zip(rgb_extra_colour_pt1[3:7], rgb_extra_colour_pt2[3:7], rgb_extra_colour_pt3[3:7])
    for colour in colour_group
]

color_palette_large = [
    colour
    for colour_group in zip(rgb_extra_colour_pt1[:7], rgb_extra_colour_pt2[:7], rgb_extra_colour_pt3[:7])
    for colour in colour_group
]

In [115]:
# Quick look at the second entry of the interleaved palette, i.e. the first green shade.
# NOTE(review): this cell has a lower execution counter than the palette cell above it, so the
# recorded output may come from an earlier version of the palette - re-run to confirm.
print(color_palette_1[1])

[179, 229, 212]


In [252]:
# Print one JavaScript constant per chart; all charts share color_palette_1.
# Each entry pairs the declaration to print with the statistics CSV that supplies the data.
# Every declaration except the first starts with '\n' so the printed blocks are separated by an empty line.
chart_exports = (
    ('const dataForPlottingTop = ', 'SBL_verses_in_years_results_stats.csv'),
    ('\nconst dataForPlottingTopEvangelia = ', 'SBL_gospels_books_subset_results_stats.csv'),
    ('\nconst dataForPlottingTopNZ = ', 'SBL_new_testament_no_gospels_books_subset_results_stats.csv'),
    ('\nconst dataForPlottingTopSZ = ', 'SBL_old_testament_books_subset_results_stats.csv'),
    ('\nconst dataForPlottingTopDesatero = ', 'SBL_ten_commandments_verses_subset_results_stats.csv'),
)

for js_declaration, stats_csv_name in chart_exports:
    print(js_declaration)
    transfer_flourish_to_chartjs(input_chart_csv=stats_csv_name, color_palette_rgb=color_palette_1)

const dataForPlottingTop = 
[{
		verseId: 'Matt 5:38',
		citations: [2, 7, 5, 5, 8, 7, 7, 7, 13, 2, 5, 1, 4, 5, 2],
		backgroundColor: 'rgb(0, 83, 101, 0.6)',
		borderColor: 'rgb(0, 83, 101, 0.9)',
		hoverBackgroundColor: 'rgb(0, 83, 101, 1)',
		verseText: 'Slyšeli jste, že bylo řečeno: „Oko za oko, zub za zub.“'
	},{
		verseId: 'Matt 3:3/John 1:23/Mark 1:3/Luke 3:4/Isa 40:3',
		citations: [4, 9, 11, 6, 5, 6, 9, 1, 4, 0, 3, 1, 1, 0, 5],
		backgroundColor: 'rgb(0, 118, 79, 0.6)',
		borderColor: 'rgb(0, 118, 79, 0.9)',
		hoverBackgroundColor: 'rgb(0, 118, 79, 1)',
		verseText: 'jakož psáno jest v knize řečí proroka Isaiáše: „Hlas volajícího na poušti: Připravte cestu Páně, přímé čiňte stezky jeho;'
	},{
		verseId: 'Luke 2:14',
		citations: [7, 10, 11, 8, 4, 12, 8, 9, 11, 4, 2, 1, 5, 2, 3],
		backgroundColor: 'rgb(159, 101, 30, 0.6)',
		borderColor: 'rgb(159, 101, 30, 0.9)',
		hoverBackgroundColor: 'rgb(159, 101, 30, 1)',
		verseText: 'Sláva na výsostech Bohu, a na zemi pokoj, lidem dobrá