# STatistical tools for BiblicalIntertextuality

This jupyter notebook contains functions that allows you to process statistics out of the data corpus (both journals and the Bible) and out of the results file.

In [17]:
import pandas as pd
import json
import os
import biblical_intertextuality_package as bip
from collections import defaultdict
import numpy as np
import joblib

from biblical_intertextuality_package import split_verse

In [5]:
ROOT_PATH = os.getcwd()

BIBLES_PATH = os.path.join(ROOT_PATH, 'Bible_files')
DATASETS_PATH = os.path.join(ROOT_PATH, 'datasets')
DICTS_PATH = os.path.join(ROOT_PATH, 'dictionaries')
CORPUS_PATH = os.path.join(ROOT_PATH, 'corpuses')
ALL_JSONS_PATH = os.path.join(ROOT_PATH, 'query_jsons')

JOURNAL_FULLDATA_PATH = os.path.join(ROOT_PATH, 'journals_fulldata.joblib')

# RESULTS_PATH = os.path.join(ROOT_PATH, 'results')
RESULTS_PATH = os.path.join(ROOT_PATH, 'PUBLIC_RESULTS')
BATCHES_FILE_PATH = os.path.join(ROOT_PATH, 'batches.csv')
BATCH_RESULTS_FILE_PATH = os.path.join(RESULTS_PATH, 'batch_results.csv')

STOP_WORDS_PATH = os.path.join(ROOT_PATH, 'stop_words.txt')
STOP_SUBVERSES_PATH = os.path.join(ROOT_PATH, 'stop_subverses_21.txt')
EXCLUSIVES_PATH = os.path.join(ROOT_PATH, 'exclusives.txt')

EVALUATION_STOP_SUBVERSES_PATH = os.path.join(ROOT_PATH, 'evaluation_stop_subverses_21.txt')
FULL_HIT_NEEDED_SUBS_PATH = os.path.join(ROOT_PATH, '100_hit_needed_subs_21.txt')

## The Bible statistics
Following functions return statistics about Bible files as stored in Bible_files directory. Be aware that we as the developers have an additional translation at our disposal ('Bible Svatováclavská') so your statistics will be probably different.

- number of translations at our disposal: 5
    - one is both Old and New Testament ("Bible Kralická")
    - translation of Jan Hejčl: Old Testament + deuterocanonical books
    - "Bible of the Saint Venceslas": mix of Old and New Testament, not complete (even some individual verses are missing)
    - translation of Ladislav Sýkora: New Testament
    - translation of František Žilka: New Testament

In [4]:
translation_names = {'BKR': 'Bible Kralická', 'BSV': 'Bible Svatováclavská', 'HEJCL': 'Jan Hejčl', 'SYK': 'Ladislav Sýkora', 'ZP': 'František Žilka'}

In [15]:
def get_book_filename_data(bible_file_name:str):
    bible_file_name = bible_file_name[:-4]
    filename_parts = bible_file_name.split('_')

    translation = filename_parts[1]
    book_name = filename_parts[2]

    return translation, book_name


def analise_the_bible():
    """
    This function analyses all data in the Bible_files directory. You can change split_verse settings if you have created your dataset with different settings.
    
    It prints following statistics:
    - number of translations and their abbreviated names
    - number of books (in aggregation and for each translation separately)
    - number of verses (by verse_id, all in aggregation, and for each translation separately)
    - number of subverses (in aggregation and for each translation separately)

    And it saves csv file with statistics of books, verses and subverses for each translation (into RESULTS_PATH (by defaut now PUBLIC_RESULTS directory) --> statistics).
    """
    all_bible_files = os.listdir(BIBLES_PATH)

    translations = []
    
    books_names = []
    books_aggregated = 0
    books_per_trsl = defaultdict(int)

    verse_ids = []
    verses_aggregated = 0
    verses_per_trsl = defaultdict(int)

    subverses_aggregated = 0
    subverses_per_trsl = defaultdict(int)

    for bible_file in all_bible_files:
        translation, book_name = get_book_filename_data(bible_file)
        
        translations.append(translation)

        books_names.append(book_name)
        books_aggregated += 1
        books_per_trsl[translation] += 1

        with open(os.path.join(BIBLES_PATH, bible_file), 'r', encoding='utf-8') as b_file:
            data = b_file.read()
            verses = eval(data)

            for verse_id in verses:
                verse_ids.append(verse_id)
                verses_aggregated += 1
                verses_per_trsl[translation] += 1

                subverses = split_verse(verses[verse_id])

                subverses_aggregated += len(subverses)
                subverses_per_trsl[translation] += len(subverses)

    translations = list(set(translations))

    print(translations)
    print('Numebr of translations:', len(translations))

    print()

    books_names = list(set(books_names))
    print('Number of available books:', len(books_names))
    print('Number of books accross translations:', books_aggregated)
    for trsl in books_per_trsl:
        print('Number of books in', trsl, 'is', books_per_trsl[trsl])

    print()

    verse_ids = list(set(verse_ids))
    print('Number of available verses:', len(verse_ids))
    print('Number of verses accross translations:', verses_aggregated)
    for trsl in verses_per_trsl:
        print('Number of verses in', trsl, 'is', verses_per_trsl[trsl])

    print()
    
    print('Total number of subverses:', subverses_aggregated)
    for trsl in subverses_per_trsl:
        print('Number of subverses in', trsl, 'is', subverses_per_trsl[trsl])

    df_dict = {'translation': [], 'books': [], 'verses': [], 'subverses': []}
    for trsl in translations:
        df_dict['translation'].append(trsl)
        df_dict['books'].append(books_per_trsl[trsl])
        df_dict['verses'].append(verses_per_trsl[trsl])
        df_dict['subverses'].append(subverses_per_trsl[trsl])

    
    bible_stats_df = pd.DataFrame(df_dict)
    bible_stats_df.to_csv(os.path.join(RESULTS_PATH, 'statistics', 'bible_stats.csv'))


In [16]:
analise_the_bible()

['BKR', 'SYK', 'HEJCL', 'BSV', 'ZP']
Numebr of translations: 5

Number of available books: 75
Number of books accross translations: 201
Number of books in BKR is 65
Number of books in BSV is 37
Number of books in HEJCL is 47
Number of books in SYK is 26
Number of books in ZP is 26

Number of available verses: 38190
Number of verses accross translations: 89080
Number of verses in BKR is 30767
Number of verses in BSV is 15326
Number of verses in HEJCL is 27896
Number of verses in SYK is 7541
Number of verses in ZP is 7550

Total number of subverses: 327972
Number of subverses in BKR is 119388
Number of subverses in BSV is 55660
Number of subverses in HEJCL is 103523
Number of subverses in SYK is 24786
Number of subverses in ZP is 24615


## Journals data statistics

The following section contains functions that facilitace journals statistics. Since the sournals data are not availabe within the GitHub repository, these functions are pretty much useless for you, unles you wish to analyse your own datasets.

- NOTE: this process also revelas some mistakes in the dataset. For example Čech no. 26 from 1935 has no marked date in our dataset and [online](https://kramerius5.nkp.cz/view/uuid:334149b0-877c-11e6-8aeb-5ef3fc9ae867) is marek as issued in 1927. In general, such mistakes are ignored because statistically insignificant, but the researches should be avere of these problems (and may repair such mistakes in their datasets).

In [27]:
def analyse_journals():
    """
    This function analyses the available journal files (based on the 'journals_fulldata.joblib' file)

    It prints following data:
    - number of analysed journals
    - number of years that each journal covers
    - number of issues per journal
    - number of pages per journal
    - number of charaters per journal
    - average characters per page per journal
    """

    journals_fulldata_dict = joblib.load(JOURNAL_FULLDATA_PATH)

    journals = []
    years_per_yournal = defaultdict(list)
    issues_per_journal = defaultdict(list)
    pages_per_journal = defaultdict(int)
    characters_per_journal = defaultdict(int)

    for uuid_file in journals_fulldata_dict:
        uuid_file_data = journals_fulldata_dict[uuid_file]
        
        journal = uuid_file_data['journal']
        if journal not in journals:
            journals.append(journal)

        issue_date = uuid_file_data['issue_date']
        try:
            issue_year = issue_date.split('.')[2]
        except IndexError:
            print(journal, issue_date)
            issue_year = issue_date


        if issue_year not in years_per_yournal[journal]:
            years_per_yournal[journal].append(issue_year)

        uuid = uuid_file_data['issue_uuid']
        if uuid not in issues_per_journal[journal]:
            issues_per_journal[journal].append(uuid)

        pages_per_journal[journal] += 1

        characters_per_journal[journal] += len(uuid_file_data['text'])

    print('Number of analysed journals:', len(journals))

    print()

    for journal in journals:
        print(journal)
        print(f'\tYears in journal ({len(years_per_yournal[journal])}):', years_per_yournal[journal])
        print('\tIssues in journal:', len(issues_per_journal[journal]))
        print('\tPages in journal:', pages_per_journal[journal])
        print('\tCharacter in journal:', characters_per_journal[journal])
        print('\tAverage characters per page in journal:', characters_per_journal[journal]/pages_per_journal[journal])

In [28]:
analyse_journals()

Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1925
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1934
Československý zemědělec 1937
Československý zemědělec 1937
Československý zemědělec 1937
Československý zemědělec 1937
Československý zemědělec 1937
Československý zemědělec 1937
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1930
Československý zemědělec 1932
Českoslove

## Results statistics

The following section contains functions that facilitate results stiatistics

### Runtime statistics
Statistics of runtimes - based on batches.csv file (the runtimes are always averages per batch, not per single documents!)