# Statistical tools for BiblicalIntertextuality

This Jupyter notebook contains functions that allow you to compute statistics from the data corpus (both the journals and the Bible) and from the results file.

In [17]:
import pandas as pd
import json
import os
import biblical_intertextuality_package as bip
from collections import defaultdict
import numpy as np
import joblib

from biblical_intertextuality_package import split_verse

In [5]:
# Every path below is resolved relative to the current working directory.
ROOT_PATH = os.getcwd()

# Data directories.
BIBLES_PATH, DATASETS_PATH, DICTS_PATH, CORPUS_PATH, ALL_JSONS_PATH = (
    os.path.join(ROOT_PATH, dir_name)
    for dir_name in ('Bible_files', 'datasets', 'dictionaries', 'corpuses', 'query_jsons')
)

# Journal data file loaded with joblib in analyse_journals().
JOURNAL_FULLDATA_PATH = os.path.join(ROOT_PATH, 'journals_fulldata.joblib')

# Results directory and batch files.
# Alternative (disabled): RESULTS_PATH = os.path.join(ROOT_PATH, 'results')
RESULTS_PATH = os.path.join(ROOT_PATH, 'PUBLIC_RESULTS')
BATCHES_FILE_PATH = os.path.join(ROOT_PATH, 'batches.csv')
BATCH_RESULTS_FILE_PATH = os.path.join(RESULTS_PATH, 'batch_results.csv')

# Plain-text lists stored next to the notebook.
(
    STOP_WORDS_PATH,
    STOP_SUBVERSES_PATH,
    EXCLUSIVES_PATH,
    EVALUATION_STOP_SUBVERSES_PATH,
    FULL_HIT_NEEDED_SUBS_PATH,
) = (
    os.path.join(ROOT_PATH, file_name)
    for file_name in (
        'stop_words.txt',
        'stop_subverses_21.txt',
        'exclusives.txt',
        'evaluation_stop_subverses_21.txt',
        '100_hit_needed_subs_21.txt',
    )
)

## The Bible statistics
The following functions return statistics about the Bible files stored in the Bible_files directory. Be aware that we, as the developers, have an additional translation at our disposal ('Bible Svatováclavská'), so your statistics will probably be different.

- number of translations at our disposal: 5
    - one is both Old and New Testament ("Bible Kralická")
    - translation of Jan Hejčl: Old Testament + deuterocanonical books
    - "Bible of the Saint Venceslas": mix of Old and New Testament, not complete (even some individual verses are missing)
    - translation of Ladislav Sýkora: New Testament
    - translation of František Žilka: New Testament

In [4]:
# Translation abbreviation -> full display name.
translation_names = dict(
    BKR='Bible Kralická',
    BSV='Bible Svatováclavská',
    HEJCL='Jan Hejčl',
    SYK='Ladislav Sýkora',
    ZP='František Žilka',
)

In [15]:
def get_book_filename_data(bible_file_name:str):
    """
    Extract the translation and the book name from a Bible file name.

    The extension is removed and the remaining stem is split on '_'; the second part is returned as the translation and the third part as the book name.

    :param bible_file_name: file name whose stem has at least three '_'-separated parts.
    :return: tuple (translation, book_name).
    :raises IndexError: if the stem has fewer than three '_'-separated parts.
    """
    # splitext handles an extension of any length (or none), unlike cutting a fixed four characters.
    stem, _extension = os.path.splitext(bible_file_name)
    filename_parts = stem.split('_')

    translation = filename_parts[1]
    book_name = filename_parts[2]

    return translation, book_name


def analise_the_bible():
    """
    Analyse all files in the Bible_files directory (BIBLES_PATH) and report corpus statistics.

    Every file name is decoded with get_book_filename_data(); the file content is evaluated into an object keyed by verse_id whose values are passed to split_verse(). You can change the split_verse settings if you have created your dataset with different settings.

    It prints the following statistics:
    - the translations (abbreviated names, sorted) and their number
    - number of books (distinct, aggregated, and for each translation separately)
    - number of verses (distinct verse_ids, aggregated, and for each translation separately)
    - number of subverses (aggregated and for each translation separately)

    It also saves a CSV file with the numbers of books, verses and subverses for each translation to RESULTS_PATH/statistics/bible_stats.csv; the directory is created if it does not exist.
    """
    translations = set()
    books_names = set()
    verse_ids = set()

    books_aggregated = 0
    books_per_trsl = defaultdict(int)

    verses_aggregated = 0
    verses_per_trsl = defaultdict(int)

    subverses_aggregated = 0
    subverses_per_trsl = defaultdict(int)

    for bible_file in os.listdir(BIBLES_PATH):
        translation, book_name = get_book_filename_data(bible_file)

        translations.add(translation)
        books_names.add(book_name)
        books_aggregated += 1
        books_per_trsl[translation] += 1

        with open(os.path.join(BIBLES_PATH, bible_file), 'r', encoding='utf-8') as b_file:
            data = b_file.read()

        # SECURITY NOTE(review): eval() executes whatever the file contains. Only run this on
        # Bible files you trust; if the files are plain literals, ast.literal_eval would be a
        # safer replacement -- confirm the file format before switching.
        verses = eval(data)

        for verse_id in verses:
            verse_ids.add(verse_id)
            verses_aggregated += 1
            verses_per_trsl[translation] += 1

            subverses = split_verse(verses[verse_id])

            subverses_aggregated += len(subverses)
            subverses_per_trsl[translation] += len(subverses)

    # Sorted so that the printed list and the CSV rows come out in the same order on every run
    # (set iteration order of strings changes between interpreter sessions).
    translations = sorted(translations)

    print(translations)
    print('Numebr of translations:', len(translations))

    print()

    print('Number of available books:', len(books_names))
    print('Number of books accross translations:', books_aggregated)
    for trsl in books_per_trsl:
        print('Number of books in', trsl, 'is', books_per_trsl[trsl])

    print()

    print('Number of available verses:', len(verse_ids))
    print('Number of verses accross translations:', verses_aggregated)
    for trsl in verses_per_trsl:
        print('Number of verses in', trsl, 'is', verses_per_trsl[trsl])

    print()

    print('Total number of subverses:', subverses_aggregated)
    for trsl in subverses_per_trsl:
        print('Number of subverses in', trsl, 'is', subverses_per_trsl[trsl])

    df_dict = {
        'translation': translations,
        'books': [books_per_trsl[trsl] for trsl in translations],
        'verses': [verses_per_trsl[trsl] for trsl in translations],
        'subverses': [subverses_per_trsl[trsl] for trsl in translations],
    }

    # to_csv does not create missing directories.
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')
    os.makedirs(statistics_dir, exist_ok=True)

    bible_stats_df = pd.DataFrame(df_dict)
    bible_stats_df.to_csv(os.path.join(statistics_dir, 'bible_stats.csv'))


In [16]:
analise_the_bible()

['BKR', 'SYK', 'HEJCL', 'BSV', 'ZP']
Numebr of translations: 5

Number of available books: 75
Number of books accross translations: 201
Number of books in BKR is 65
Number of books in BSV is 37
Number of books in HEJCL is 47
Number of books in SYK is 26
Number of books in ZP is 26

Number of available verses: 38190
Number of verses accross translations: 89080
Number of verses in BKR is 30767
Number of verses in BSV is 15326
Number of verses in HEJCL is 27896
Number of verses in SYK is 7541
Number of verses in ZP is 7550

Total number of subverses: 327972
Number of subverses in BKR is 119388
Number of subverses in BSV is 55660
Number of subverses in HEJCL is 103523
Number of subverses in SYK is 24786
Number of subverses in ZP is 24615


## Journals data statistics

The following section contains functions that facilitate journal statistics. Since the journal data are not available within the GitHub repository, these functions are pretty much useless for you, unless you wish to analyse your own datasets.

- NOTE: this process also reveals some mistakes in the dataset. For example, Čech no. 26 from 1935 has no date marked in our dataset and [online](https://kramerius5.nkp.cz/view/uuid:334149b0-877c-11e6-8aeb-5ef3fc9ae867) it is marked as issued in 1927. In general, such mistakes are ignored because they are statistically insignificant, but researchers should be aware of these problems (and may repair such mistakes in their datasets).

In [51]:
def analyse_journals():
    """
    Analyse the available journal files (based on the 'journals_fulldata.joblib' file).

    Every entry of the loaded data counts as one page. Entries with an empty 'issue_date' are only counted as "without date"; entries whose year (the part after the last '.' of 'issue_date') is outside the considered years are only counted as "out of scope".

    It prints the following data:
    - number of analysed journals
    - the years that each journal covers
    - number of issues per journal (distinct 'issue_uuid' values)
    - number of pages per journal
    - number of characters per journal
    - average characters per page per journal
    - number of entries outside the time range (with the offending years) and without a date

    It also saves the per-journal numbers to RESULTS_PATH/statistics/journals_stats.csv; the directory is created if it does not exist. The 'standard pages' column is the character count divided by 1800.
    """
    # NOTE: because we are following only years 1925-1939, we are ignoring those that are outside of this scope... '1937-1938' is added because it fits the profile, too...
    double_year = '1937-1938'
    years_to_consider = {str(year) for year in range(1925, 1940)} | {double_year}

    journals_fulldata_dict = joblib.load(JOURNAL_FULLDATA_PATH)

    journals = []
    years_per_yournal = defaultdict(set)
    issues_per_journal = defaultdict(set)
    pages_per_journal = defaultdict(int)
    characters_per_journal = defaultdict(int)

    uuids_out_of_the_scope = []
    years_out_of_scope = []
    uuids_without_date = []

    for uuid_file in journals_fulldata_dict:
        uuid_file_data = journals_fulldata_dict[uuid_file]

        journal = uuid_file_data['journal']
        issue_date = uuid_file_data['issue_date']

        if issue_date == '':
            uuids_without_date.append(uuid_file)
            continue

        issue_year = issue_date.split('.')[-1]

        if issue_year not in years_to_consider:
            uuids_out_of_the_scope.append(uuid_file)
            years_out_of_scope.append(issue_year)
            continue

        if journal not in journals:
            journals.append(journal)

        # The double-year label is in scope, so its pages, issues and characters are counted,
        # but it is not a calendar year and is therefore left out of the list of covered years.
        if issue_year != double_year:
            years_per_yournal[journal].add(issue_year)

        issues_per_journal[journal].add(uuid_file_data['issue_uuid'])
        pages_per_journal[journal] += 1
        characters_per_journal[journal] += len(uuid_file_data['text'])

    print('Number of analysed journals:', len(journals))

    print()

    for journal in journals:
        journal_years = sorted(years_per_yournal[journal])
        print(journal)
        print(f'\tYears in journal ({len(journal_years)}):', journal_years)
        print('\tIssues in journal:', len(issues_per_journal[journal]))
        print('\tPages in journal:', pages_per_journal[journal])
        print('\tCharacter in journal:', characters_per_journal[journal])
        # Every journal in the list has at least one counted page, so the division is safe.
        print('\tAverage characters per page in journal:', characters_per_journal[journal]/pages_per_journal[journal])

    print()

    print('Number of files that do not fit the time range:', len(uuids_out_of_the_scope), set(years_out_of_scope))
    print('Number of files that do not have date in the metadata:', len(uuids_without_date))

    df_dict = {
        'journal': list(journals),
        'years': [len(years_per_yournal[journal]) for journal in journals],
        'issues': [len(issues_per_journal[journal]) for journal in journals],
        'pages': [pages_per_journal[journal] for journal in journals],
        'standard pages': [characters_per_journal[journal]/1800 for journal in journals],
        'characters': [characters_per_journal[journal] for journal in journals],
    }

    # to_csv does not create missing directories.
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')
    os.makedirs(statistics_dir, exist_ok=True)

    journals_stats_df = pd.DataFrame(df_dict)
    journals_stats_df.to_csv(os.path.join(statistics_dir, 'journals_stats.csv'))

In [52]:
analyse_journals()

Number of analysed journals: 11

Čech
	Years in journal (10): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934']
	Issues in journal: 2368
	Pages in journal: 15678
	Character in journal: 145508697
	Average characters per page in journal: 9281.075200918485
Československý zemědělec
	Years in journal (15): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939']
	Issues in journal: 753
	Pages in journal: 9278
	Character in journal: 91147459
	Average characters per page in journal: 9824.04171157577
Český učitel
	Years in journal (15): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1933', '1934', '1935', '1936', '1937', '1938', '1939']
	Issues in journal: 534
	Pages in journal: 11580
	Character in journal: 57198594
	Average characters per page in journal: 4939.429533678756
Moravský hospodář
	Years in journal (15): ['1925', '1926', '1927', '1928', '1929', '1930', '1931', '1932', '1

## Runtime statistics
Statistics of runtimes, based on the batches.csv file (the runtimes are always averages per batch, not per single document!)

In [56]:
def analyse_runtimes():
    """
    Analyse runtimes from the batches.csv file (BATCHES_FILE_PATH).

    For every journal, the 'runtime' values of its rows are rounded to one decimal place and counted; runtimes of 15 or more are ignored. The per-journal counts are printed and a table with one row per journal and one column per 0.1 step from 0.0 to 9.9 is saved to RESULTS_PATH/statistics/runtimes_stats.csv (the directory is created if it does not exist).

    NOTE(review): rounded runtimes from 10.0 up to (but excluding) 15 are counted and printed but have no column in the CSV -- confirm whether the column range should be extended.
    """
    df = pd.read_csv(BATCHES_FILE_PATH, quotechar='"', delimiter=',', encoding='utf-8')

    journals_full_names = ['Moravský hospodář', 'Polední list', 'Moravský večerník', 'Věstník katolického duchovenstva', 'Přítomnost', 'Venkov', 'Studentský časopis', 'Československý zemědělec', 'Český učitel', 'Čech', 'Posel záhrobní']
    journals_filenames = ['moravsky_hospodar', 'poledni_list', 'moravsky_vecernik', 'vestnik_katolickeho_duchovenstva', 'pritomnost', 'venkov', 'studentsky_casopis', 'ceskoslovensky_zemedelec', 'cesky_ucitel', 'cech', 'posel_zahrobni']

    results = {}

    for full_name, journal in zip(journals_full_names, journals_filenames):
        journal_runtimes = defaultdict(int)
        for runtime in df.loc[df['journal'] == journal, 'runtime'].to_numpy():
            if runtime >= 15:
                # ignoring weirdly long runtimes (probably due to the computer being in sleep mode...)
                continue
            journal_runtimes[round(runtime, 1)] += 1

        print(full_name, journal_runtimes)
        results[full_name] = journal_runtimes

    all_times_full = [round(i, 1) for i in np.arange(0, 10, 0.1)]

    records = []
    for journal_name, runtime_counts in results.items():
        journal_data = {'journal': journal_name}
        for timestamp in all_times_full:
            # .get() reads a missing bin as 0 without inserting it into the defaultdict.
            journal_data[timestamp] = runtime_counts.get(timestamp, 0)
        records.append(journal_data)

    output_df = pd.DataFrame(records)

    # to_csv does not create missing directories.
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')
    os.makedirs(statistics_dir, exist_ok=True)

    output_df.to_csv(os.path.join(statistics_dir, 'runtimes_stats.csv'), quotechar='"', sep=',', encoding='utf-8')


In [57]:
analyse_runtimes()

Moravský hospodář defaultdict(<class 'int'>, {0.8: 1880, 0.7: 1880, 0.9: 520, 0.6: 1200, 0.5: 160, 1.1: 40, 1.8: 3})
Polední list defaultdict(<class 'int'>, {2.4: 3480, 2.5: 1960, 2.6: 2360, 2.2: 640, 2.3: 1600, 2.7: 360, 2.8: 280, 2.9: 40, 2.1: 240, 1.0: 19})
Moravský večerník defaultdict(<class 'int'>, {1.8: 5480, 2.1: 3600, 2.0: 4480, 1.9: 4280, 1.6: 2160, 1.7: 3000, 2.2: 1960, 2.3: 720, 1.4: 160, 1.5: 1000, 2.4: 673, 1.3: 80, 2.6: 120, 2.7: 40, 2.5: 160})
Věstník katolického duchovenstva defaultdict(<class 'int'>, {1.9: 40, 1.0: 400, 1.3: 120, 1.2: 480, 1.1: 680, 0.9: 200, 1.4: 40, 0.0: 2})
Přítomnost defaultdict(<class 'int'>, {0.9: 40, 1.5: 4080, 1.3: 1800, 1.4: 4396, 1.6: 2600, 1.2: 880, 1.7: 640, 1.8: 320, 2.1: 40, 1.1: 80, 2.8: 40, 1.9: 40, 2.0: 40})
Venkov defaultdict(<class 'int'>, {2.4: 760, 3.6: 920, 3.2: 7480, 3.4: 2880, 2.9: 7760, 2.8: 7240, 3.5: 1440, 3.1: 8960, 3.3: 4480, 3.8: 320, 3.0: 10120, 2.7: 3560, 2.6: 2080, 2.5: 800, 2.2: 400, 2.3: 240, 3.7: 280, 4.3: 120, 3.9:

## Results statistics

The following section contains functions that facilitate results statistics.

### Overall statistics for journals

In [63]:
# Full journal names as they appear in the 'journal' column of the results files.
all_journals = [
    'Moravský hospodář',
    'Polední list',
    'Moravský večerník',
    'Věstník katolického duchovenstva',
    'Přítomnost',
    'Venkov',
    'Studentský časopis',
    'Československý zemědělec',
    'Český učitel',
    'Čech',
    'Posel záhrobní',
]

def ciation_counts_per_journal(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file for every journal in all_journals.

    Prints the total number of rows and one line per journal.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: dict mapping journal name -> number of rows of that journal.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    print('Total number of citations:', len(results_df))

    citations = {}
    for journal in all_journals:
        journal_count = int((results_df['journal'] == journal).sum())
        print(journal, journal_count)
        citations[journal] = journal_count

    return citations


def ciation_counts_per_journal_not_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file for every journal in all_journals, keeping only rows whose 'drop?' value equals False.

    Prints the total number of kept rows and one line per journal.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: dict mapping journal name -> number of kept rows of that journal.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]

    print('Total number of citations:', len(kept_df))

    citations = {}
    for journal in all_journals:
        journal_count = int((kept_df['journal'] == journal).sum())
        print(journal, journal_count)
        citations[journal] = journal_count

    return citations


def sure_ciations_counts_per_journal(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file for every journal in all_journals.

    Only rows whose 'drop?' value equals False and whose 'CITATION' value equals True are counted. Prints the total number of such rows and one line per journal.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: dict mapping journal name -> number of counted rows of that journal.
    """
    results_file = os.path.join(RESULTS_PATH, results_filename)
    results_df = pd.read_csv(results_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    kept_df = results_df.loc[results_df['drop?'] == False]
    sure_df = kept_df.loc[kept_df['CITATION'] == True]

    print('Total number of citations:', len(sure_df))

    citations = {}
    for journal in all_journals:
        journal_count = int((sure_df['journal'] == journal).sum())
        print(journal, journal_count)
        citations[journal] = journal_count

    return citations


def get_overall_result_stats():
    """
    Print the per-journal citation counts of the four result files (one per filtering stage) and save them side by side to RESULTS_PATH/statistics/journals_results_stats.csv.
    """
    print('Filtered citations:')
    filtered_citations = ciation_counts_per_journal('FILTERED_UNFILTERED_batch_results.csv')
    print('\nStop subs:')
    after_stop_subs = ciation_counts_per_journal('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\nMultiple attrs:')
    multiple_attrs = ciation_counts_per_journal_not_drop('MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\n"Sure" citations:')
    sure_cites = sure_ciations_counts_per_journal('FINAL_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    # Output column name -> counts of that stage, in the order of the CSV columns.
    stage_columns = (
        ('initial results', filtered_citations),
        ('filtered stop-subverses', after_stop_subs),
        ('filtered multiple attributions', multiple_attrs),
        ('"sure" citations', sure_cites),
    )

    out_df_dict = {'journal': list(all_journals)}
    for column_name, stage_counts in stage_columns:
        out_df_dict[column_name] = [stage_counts[journal] for journal in all_journals]

    stats_file = os.path.join(RESULTS_PATH, 'statistics', 'journals_results_stats.csv')
    pd.DataFrame(out_df_dict).to_csv(stats_file)


In [71]:
get_overall_result_stats()

Filtered citations:
Total number of citations: 38004
Moravský hospodář 285
Polední list 2333
Moravský večerník 4406
Věstník katolického duchovenstva 2008
Přítomnost 2204
Venkov 14299
Studentský časopis 505
Československý zemědělec 1151
Český učitel 1109
Čech 8264
Posel záhrobní 1440

Stop subs:
Total number of citations: 12858
Moravský hospodář 11
Polední list 262
Moravský večerník 691
Věstník katolického duchovenstva 1536
Přítomnost 621
Venkov 2577
Studentský časopis 184
Československý zemědělec 219
Český učitel 405
Čech 5298
Posel záhrobní 1054

Multiple attrs:
Total number of citations: 8121
Moravský hospodář 10
Polední list 199
Moravský večerník 497
Věstník katolického duchovenstva 790
Přítomnost 434
Venkov 1975
Studentský časopis 144
Československý zemědělec 151
Český učitel 259
Čech 3175
Posel záhrobní 487

"Sure" citations:
Total number of citations: 1935
Moravský hospodář 4
Polední list 57
Moravský večerník 118
Věstník katolického duchovenstva 220
Přítomnost 136
Venkov 494
Stud

### Citations in years

In [68]:
# Years 1925-1939 as strings, in ascending order.
years_to_plot = [str(year) for year in range(1925, 1940)]

def ciation_counts_per_year(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per year.

    The year is the part of the 'date' value after the last '.'.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping year (str) -> number of rows; years without rows read as 0.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for date in citations_dataframe['date']:
        citations[date.split('.')[-1]] += 1

    return citations


def ciation_counts_per_year_not_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per year, keeping only rows whose 'drop?' value equals False.

    The year is the part of the 'date' value after the last '.'.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping year (str) -> number of kept rows; years without rows read as 0.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for date in citations_dataframe['date']:
        citations[date.split('.')[-1]] += 1

    return citations


def sure_ciations_counts_per_year(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file per year.

    Only rows whose 'drop?' value equals False and whose 'CITATION' value equals True are counted. The year is the part of the 'date' value after the last '.'.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping year (str) -> number of counted rows; years without rows read as 0.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]
    citations_dataframe = citations_dataframe[citations_dataframe['CITATION'] == True]

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for date in citations_dataframe['date']:
        citations[date.split('.')[-1]] += 1

    return citations


def results_in_years():
    """
    Print the per-year citation counts of the four result files (one per filtering stage) and save the counts for the years in years_to_plot to RESULTS_PATH/statistics/years_results_stats.csv.
    """
    print('Filtered citations:')
    filtered_citations = ciation_counts_per_year('FILTERED_UNFILTERED_batch_results.csv')
    print(filtered_citations)

    print('\nStop subs:')
    after_stop_subs = ciation_counts_per_year('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print(after_stop_subs)

    print('\nMultiple attrs:')
    multiple_attrs = ciation_counts_per_year_not_drop('MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print(multiple_attrs)

    print('\n"Sure" citations:')
    sure_cites = sure_ciations_counts_per_year('FINAL_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print(sure_cites)

    # Output column name -> counts of that stage, in the order of the CSV columns.
    stage_columns = (
        ('initial results', filtered_citations),
        ('filtered stop-subverses', after_stop_subs),
        ('filtered multiple attributions', multiple_attrs),
        ('"sure" citations', sure_cites),
    )

    out_df_dict = {'year': list(years_to_plot)}
    for column_name, stage_counts in stage_columns:
        out_df_dict[column_name] = [stage_counts[year] for year in years_to_plot]

    stats_file = os.path.join(RESULTS_PATH, 'statistics', 'years_results_stats.csv')
    pd.DataFrame(out_df_dict).to_csv(stats_file)

In [70]:
results_in_years()

Filtered citations:
defaultdict(<class 'int'>, {'1927': 3272, '1926': 2684, '1929': 2609, '1928': 2551, '1930': 2899, '1925': 3102, '1934': 2188, '1931': 3126, '1933': 3054, '1932': 2505, '1935': 2023, '1938': 1945, '1937': 2085, '1936': 2067, '1939': 1894})

Stop subs:
defaultdict(<class 'int'>, {'1927': 1376, '1926': 988, '1928': 881, '1930': 935, '1925': 1422, '1934': 886, '1933': 801, '1932': 738, '1929': 915, '1931': 1198, '1935': 613, '1938': 504, '1936': 526, '1937': 553, '1939': 522})

Multiple attrs:
defaultdict(<class 'int'>, {'1927': 836, '1926': 659, '1928': 607, '1930': 584, '1925': 882, '1934': 523, '1933': 552, '1932': 480, '1929': 609, '1931': 679, '1935': 374, '1938': 344, '1936': 325, '1937': 342, '1939': 325})

"Sure" citations:
defaultdict(<class 'int'>, {'1934': 161, '1926': 118, '1932': 110, '1925': 179, '1927': 178, '1933': 158, '1929': 139, '1928': 122, '1931': 140, '1930': 123, '1935': 125, '1938': 98, '1937': 92, '1936': 103, '1939': 89})


### Top citations - full

In [78]:
def ciation_counts_per_verses(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per 'verse_id'.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping verse_id -> number of rows.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for verse_id in citations_dataframe['verse_id']:
        citations[verse_id] += 1

    return citations

def ciation_counts_per_verses_drop(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per 'verse_id', keeping only rows whose 'drop?' value equals False.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping verse_id -> number of kept rows.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for verse_id in citations_dataframe['verse_id']:
        citations[verse_id] += 1

    return citations

def ciation_counts_per_verses_sure(results_filename:str, csv_delimiter=';'):
    """
    Count the "sure" citations of a results file per 'verse_id'.

    Only rows whose 'drop?' value equals False and whose 'CITATION' value equals True are counted.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(int) mapping verse_id -> number of counted rows.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]
    citations_dataframe = citations_dataframe[citations_dataframe['CITATION'] == True]

    citations = defaultdict(int)

    # Iterate the column itself: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for verse_id in citations_dataframe['verse_id']:
        citations[verse_id] += 1

    return citations

def analyse_verses():
    """
    Collect the per-verse citation counts of the four result files (one per filtering stage) and save them to RESULTS_PATH/statistics/verse_results_stats.csv.

    The table has one row per verse_id present in the initial (filtered) results; a verse that is missing from a later stage gets 0 in that column. Only the stage headings are printed. The output directory is created if it does not exist.
    """
    print('Filtered citations:')
    filtered_citations = ciation_counts_per_verses('FILTERED_UNFILTERED_batch_results.csv')
    print('\nStop subs:')
    after_stop_subs = ciation_counts_per_verses('ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\nMultiple attrs:')
    multiple_attrs = ciation_counts_per_verses_drop('MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')
    print('\n"Sure" citations:')
    sure_cites = ciation_counts_per_verses_sure('FINAL_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    verse_ids = list(filtered_citations)

    # .get(..., 0) states the "missing means zero" rule explicitly and does not insert
    # the missing verse into the stage dictionaries.
    out_df_dict = {
        'verse': verse_ids,
        'initial results': [filtered_citations.get(verse_id, 0) for verse_id in verse_ids],
        'filtered stop-subverses': [after_stop_subs.get(verse_id, 0) for verse_id in verse_ids],
        'filtered multiple attributions': [multiple_attrs.get(verse_id, 0) for verse_id in verse_ids],
        '"sure" citations': [sure_cites.get(verse_id, 0) for verse_id in verse_ids],
    }

    # to_csv does not create missing directories.
    statistics_dir = os.path.join(RESULTS_PATH, 'statistics')
    os.makedirs(statistics_dir, exist_ok=True)

    stats_df = pd.DataFrame(out_df_dict)
    stats_df.to_csv(os.path.join(statistics_dir, 'verse_results_stats.csv'))

In [79]:
analyse_verses()

Filtered citations:

Stop subs:

Multiple attrs:

"Sure" citations:


In [87]:
def ciation_counts_per_verses_for_top(results_filename:str, csv_delimiter=';'):
    """
    Count the rows of a results file per 'verse_id' and collect the distinct 'verse_text' values of every verse.

    Only rows whose 'drop?' value equals False are used.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: tuple (citations, verse_texts): a defaultdict(int) mapping verse_id -> number of rows, and a defaultdict(list) mapping verse_id -> distinct verse texts in order of first appearance.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]

    citations = defaultdict(int)
    verse_texts = defaultdict(list)

    # Iterate the columns themselves: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for verse_id, verse_text in zip(citations_dataframe['verse_id'], citations_dataframe['verse_text']):
        citations[verse_id] += 1
        known_texts = verse_texts[verse_id]
        if verse_text not in known_texts:
            known_texts.append(verse_text)

    return citations, verse_texts


def analyse_verses_top():
    """
    Label every verse by the rank of its citation count and save the table to RESULTS_PATH/statistics/top_verse_results_stats.csv.

    Here we count only with filtered stop-subs and resolved multiple attributions. A verse is labelled 'top 10', 'top 20' or 'top 30' when its count equals one of the 1st-10th, 11th-20th or 21st-30th highest counts respectively (the first matching label wins), and 'low' otherwise.
    """
    citations, verse_texts = ciation_counts_per_verses_for_top('FINAL_MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv')

    ranked_counts = sorted((citations[verse_id] for verse_id in citations), reverse=True)

    # (counts belonging to the tier, label), checked in this order.
    tiers = (
        (ranked_counts[:10], 'top 10'),
        (ranked_counts[10:20], 'top 20'),
        (ranked_counts[20:30], 'top 30'),
    )

    out_df_dict = {'verse': [], 'citation count': [], 'verse texts': [], 'filter row': []}

    for verse_id in citations:
        verse_count = citations[verse_id]

        label = 'low'
        for tier_counts, tier_label in tiers:
            if verse_count in tier_counts:
                label = tier_label
                break

        out_df_dict['verse'].append(verse_id)
        out_df_dict['citation count'].append(verse_count)
        out_df_dict['verse texts'].append(verse_texts[verse_id])
        out_df_dict['filter row'].append(label)

    stats_file = os.path.join(RESULTS_PATH, 'statistics', 'top_verse_results_stats.csv')
    pd.DataFrame(out_df_dict).to_csv(stats_file)

In [86]:
analyse_verses_top()

### Top citations - to plot in years

In [100]:
def return_top_x_verse_ids(x:int, results_filename:str):
    """
    Return the verse_ids whose citation count is among the x highest counts of a results file.

    Membership is decided by count value, so every verse tied with one of the x highest counts is included (the result may hold more than x verse_ids).

    :param x: number of highest counts to consider.
    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :return: list of verse_ids.
    """
    citations, _ = ciation_counts_per_verses_for_top(results_filename)

    highest_counts = sorted((citations[verse_id] for verse_id in citations), reverse=True)[:x]

    return [verse_id for verse_id in citations if citations[verse_id] in highest_counts]

In [103]:
def ciation_counts_per_verses_in_years(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """
    Count the rows of a results file per year and verse, for the selected verses only.

    The year is the part of the 'date' value after the last '.'.

    :param results_filename: name of a CSV file inside RESULTS_PATH.
    :param verese_ids_to_plot: verse_ids to count; rows of other verses are ignored.
    :param csv_delimiter: column delimiter of the CSV file.
    :return: defaultdict(dict) mapping year (str) -> {verse_id: number of rows}.
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations = defaultdict(dict)

    # Iterate the columns themselves: a label lookup per row is slow and yields a DataFrame
    # instead of a single row whenever an index label occurs more than once.
    for date, verse_id in zip(citations_dataframe['date'], citations_dataframe['verse_id']):
        year = date.split('.')[-1]
        if verse_id in verese_ids_to_plot:
            year_counts = citations[year]
            year_counts[verse_id] = year_counts.get(verse_id, 0) + 1

    return citations


def ciation_counts_per_verses_in_years_drop(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """ Count the non-dropped rows of a results file per year for the selected verses.

    Only rows whose 'drop?' column equals False are counted.

    :param results_filename: name of the results CSV file inside RESULTS_PATH
    :param verese_ids_to_plot: verse ids to count; rows with any other verse id are skipped
    :param csv_delimiter: column delimiter of the results CSV file
    :return: defaultdict shaped {year: {verse_id: number of rows}}; a year appears only if at least one row was counted for it
    """
    source_file = os.path.join(RESULTS_PATH, results_filename)
    all_rows = pd.read_csv(source_file, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    # Discard everything that was marked to be dropped.
    not_dropped = all_rows['drop?'] == False
    kept_rows = all_rows[not_dropped]

    citations = defaultdict(dict)

    for label in kept_rows.index:
        record = kept_rows.loc[label].to_dict()
        # The year is the text after the last dot of the date.
        year = record['date'].split('.')[-1]
        verse_id = record['verse_id']
        if verse_id in verese_ids_to_plot:
            citations[year][verse_id] = citations[year].get(verse_id, 0) + 1

    return citations


def ciation_counts_per_verses_in_years_sure(results_filename:str, verese_ids_to_plot:list, csv_delimiter=';'):
    """ Count the confirmed citations of a results file per year for the selected verses.

    Only rows whose 'drop?' column equals False and whose 'CITATION' column equals True are counted.

    :param results_filename: name of the results CSV file inside RESULTS_PATH
    :param verese_ids_to_plot: verse ids to count; rows with any other verse id are skipped
    :param csv_delimiter: column delimiter of the results CSV file
    :return: defaultdict shaped {year: {verse_id: number of rows}}; a year appears only if at least one row was counted for it
    """
    citations_dataframe = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    citations_dataframe = citations_dataframe[citations_dataframe['drop?'] == False]
    citations_dataframe = citations_dataframe[citations_dataframe['CITATION'] == True]

    citations = defaultdict(dict)

    for row_id in citations_dataframe.index:
        row_dict = citations_dataframe.loc[row_id].to_dict()
        # The year is the text after the last dot of the date.
        year = row_dict['date'].split('.')[-1]
        verse_id = row_dict['verse_id']
        if verse_id in verese_ids_to_plot:
            citations[year][verse_id] = citations[year].get(verse_id, 0) + 1

    # The counts have to be handed back to the caller; without this the function yields None.
    return citations


def analyse_verse_ids_counts_in_years(top_x:int, results_filename:str='MA_DUPS_ST_SUBS_FILTERED_UNFILTERED_batch_results.csv'):
    """ Export the yearly citation counts of the most cited verses to a CSV file.

    The verses are selected with return_top_x_verse_ids and their non-dropped hits are counted
    per year with ciation_counts_per_verses_in_years_drop. The output has one row per year
    (in the order in which the years were first encountered) and one column per selected verse;
    it is written to 'statistics/verses_in_years_results_stats.csv' inside RESULTS_PATH.

    :param top_x: how many of the highest citation counts are used for selecting the verses
    :param results_filename: name of the results CSV file inside RESULTS_PATH
    """
    verese_ids_to_plot = return_top_x_verse_ids(x=top_x, results_filename=results_filename)

    citations = ciation_counts_per_verses_in_years_drop(results_filename, verese_ids_to_plot)

    out_df_dict = defaultdict(list)

    for year, counts_in_year in citations.items():
        out_df_dict['year'].append(year)
        for verse_id in verese_ids_to_plot:
            # A verse without any hit in this year gets an explicit 0 so that all columns keep the same length.
            out_df_dict[verse_id].append(counts_in_year.get(verse_id, 0))

    stats_df = pd.DataFrame(out_df_dict)
    stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'verses_in_years_results_stats.csv'))

In [102]:
# Export the per-year citation counts for the verses selected with top_x=10.
analyse_verse_ids_counts_in_years(10)

### Subset statistics


In [108]:
""" Defining subsets books"""
# NOTE(review): this list has 26 entries and no abbreviation for Revelation seems to be present - confirm this is intended.
new_testament_books = [
    '1J', '1K', '1P', '1Te', '1Tm',
    '2J', '2K', '2P', '2Te', '2Tm',
    '3J', 'Ef', 'Fm', 'Fp', 'Ga',
    'J', 'Jk', 'Ju', 'Ko', 'L',
    'Mk', 'Mt', 'R', 'Sk', 'Tit', 'Zd',
]
old_testament_books = [
    '1Kr', '1Pa', '1S', '2Kr', '2Pa', '2S',
    'Abd', 'Abk', 'Ag', 'Am', 'Da', 'Dt',
    'Est', 'Ex', 'Ez', 'Ezd', 'Gn', 'Iz',
    'Jb', 'Jl', 'Jon', 'Joz', 'Jr', 'Kaz',
    'Lv', 'Mal', 'Mi', 'Na', 'Neh', 'Nu',
    'Oz', 'Pis', 'Pl', 'Pr', 'Rt', 'Sd',
    'Sf', 'Z', 'Za',
]
deuterocanoncal_books = ['1Ma', '2Ma', 'Bar', 'Jud', 'Mou', 'Sir', 'Tob', 'Zuz']
gospels = ['Mk', 'Mt', 'L', 'J']

# Defining subsets verses
# The ten commandments: Ex 20:2-17 followed by Dt 5:6-21.
ten_commandments_verses = (
    [f'Ex 20:{verse_number}' for verse_number in range(2, 18)]
    + [f'Dt 5:{verse_number}' for verse_number in range(6, 22)]
)

In [None]:
def get_book_name(verse_id:str):
    """ Return the part of the verse id that precedes its first space (the whole id if it holds no space). """
    book_name, _, _ = verse_id.partition(' ')
    return book_name

def ciation_counts_per_verses_in_years_subset_books(results_filename:str, books_to_plot:list, csv_delimiter=';'):
    """ Count the non-dropped rows of a results file per year for verses of the selected books.

    Only rows whose 'drop?' column equals False are counted. The book of a row is
    determined from its verse id with get_book_name.

    :param results_filename: name of the results CSV file inside RESULTS_PATH
    :param books_to_plot: book names to count; rows of any other book are skipped
    :param csv_delimiter: column delimiter of the results CSV file
    :return: defaultdict shaped {year: {verse_id: number of rows}}; a year appears only if at least one row was counted for it
    """
    csv_path = os.path.join(RESULTS_PATH, results_filename)
    results = pd.read_csv(csv_path, quotechar='"', delimiter=csv_delimiter, encoding='utf-8', index_col=0)

    # Discard everything that was marked to be dropped.
    results = results[results['drop?'] == False]

    citations = defaultdict(dict)

    for label in results.index:
        entry = results.loc[label].to_dict()
        # The year is the text after the last dot of the date.
        year = entry['date'].split('.')[-1]
        verse_id = entry['verse_id']

        if get_book_name(verse_id) not in books_to_plot:
            continue

        counts_in_year = citations[year]
        counts_in_year[verse_id] = counts_in_year.get(verse_id, 0) + 1

    return citations


def ciation_counts_per_verses_in_years_subset_verses(results_filename:str, verse_ids_to_plot:list, csv_delimiter=';'):
    """ Count the non-dropped rows of a results file per year for a subset of verses.

    Only rows whose 'drop?' column equals False are counted.

    :param results_filename: name of the results CSV file inside RESULTS_PATH
    :param verse_ids_to_plot: verse ids to count; rows with any other verse id are skipped
    :param csv_delimiter: column delimiter of the results CSV file
    :return: defaultdict shaped {year: {verse_id: number of rows}}; a year appears only if at least one row was counted for it
    """
    loaded = pd.read_csv(
        os.path.join(RESULTS_PATH, results_filename),
        quotechar='"',
        delimiter=csv_delimiter,
        encoding='utf-8',
        index_col=0,
    )

    # Discard everything that was marked to be dropped.
    usable = loaded[loaded['drop?'] == False]

    citations = defaultdict(dict)

    for label in usable.index:
        fields = usable.loc[label].to_dict()
        # The year is the text after the last dot of the date.
        year = fields['date'].split('.')[-1]
        verse_id = fields['verse_id']

        if verse_id in verse_ids_to_plot:
            year_counts = citations[year]
            if verse_id in year_counts:
                year_counts[verse_id] += 1
            else:
                year_counts[verse_id] = 1

    return citations