# Evaluation of Results

In [1]:
import run_biblical_intertextuality as bip

import pandas as pd
import os
import Levenshtein
import numpy as np

from collections import defaultdict
from nltk import word_tokenize, sent_tokenize

In [2]:
""" Defining paths. """
# Every location below is resolved relative to the current working directory.
ROOT_PATH = os.getcwd()

# Data directories located directly under ROOT_PATH.
BIBLES_PATH = os.path.join(ROOT_PATH, 'Bible_files')
QUERY_DOC_PATH = os.path.join(ROOT_PATH, 'query_documents')
DATASETS_PATH = os.path.join(ROOT_PATH, 'datasets')
DICTS_PATH = os.path.join(ROOT_PATH, 'dictionaries')
CORPUS_PATH = os.path.join(ROOT_PATH, 'corpuses')
RESULTS_PATH = os.path.join(ROOT_PATH, 'results')
ALL_JSONS_PATH = os.path.join(ROOT_PATH, 'extracted_query_jsons')

# CSV with the list of batches (read by get_path_to_json_file()) and the CSV with the preliminary search results.
BATCHES_FILE_PATH = os.path.join(ROOT_PATH, 'batches.csv')
BATCH_RESULTS_FILE_PATH = os.path.join(RESULTS_PATH, 'batch_results.csv')

# Plain-text word lists; EXCLUSIVES_PATH is read further below to build the table of mutually exclusive words.
STOP_WORDS_PATH = os.path.join(ROOT_PATH, 'stop_words.txt')
STOP_SUBVERSES_PATH = os.path.join(ROOT_PATH, 'stop_subverses_21.txt')
EXCLUSIVES_PATH = os.path.join(ROOT_PATH, 'exclusives.txt')

In [4]:
# Quick check: fetch the text of Mt 18:11 from the 'BKR' translation and split it into subverses.
bip.split_verse(bip.get_verse_text('BKR', 'Mt 18:11'))

['Přišel zajisté Syn člověka', 'aby spasil to což bylo zahynulo.']

## Working with results.

In [6]:
def load_results(results_filename='batch_results.csv') -> pd.core.frame.DataFrame:
    """ Read one results CSV from the results folder and return it as a pandas dataframe.

    :param results_filename: name of the file inside RESULTS_PATH; 'batch_results.csv' is the default, as this is the default filename of results from the search functions.
    :return: dataframe with the content of the CSV file.
    """
    results_path = os.path.join(RESULTS_PATH, results_filename)
    csv_options = {'quotechar': '"', 'delimiter': ',', 'encoding': 'utf-8'}

    return pd.read_csv(results_path, **csv_options)

In [7]:
# Names of the columns of the preliminary results dataframe, in the same order in which get_row_data() returns their values.
columns = ['verse_id', 'query_file', 'index_query_part', 'batch_id', 'ngram_size','query_window_len', 'query_overlap', 'ngram_tolerance', 'edit_distance_tolerance']

In [10]:
def get_row_data(dataframe:pd.core.frame.DataFrame, row_id:int):
    """ Return the search properties stored in one row of the results dataframe.

    :param dataframe: results dataframe containing the search-property columns.
    :param row_id: index label of the requested row.
    :return: tuple (verse_id, query_file, index_query_part, batch_id, ngram_size, query_window_len, query_overlap, ngram_tolerance, edit_distance_tolerance).
    """
    wanted_columns = (
        'verse_id',
        'query_file',
        'index_query_part',
        'batch_id',
        'ngram_size',
        'query_window_len',
        'query_overlap',
        'ngram_tolerance',
        'edit_distance_tolerance',
    )

    # Fetch the row once and read every value from it.
    row = dataframe.loc[row_id]

    return tuple(row[column_name] for column_name in wanted_columns)

In [11]:
def get_verse_et_idx(dataframe:pd.core.frame.DataFrame, row_id:int):
    """ Return the verse ID and the index of the query part stored in one row of the results dataframe.

    :param dataframe: results dataframe with the 'verse_id' and 'index_query_part' columns.
    :param row_id: index label of the requested row.
    :return: tuple (verse_id, index_query_part).
    """
    return (
        dataframe.loc[row_id, 'verse_id'],
        dataframe.loc[row_id, 'index_query_part'],
    )

In [12]:
def join_overlap(list_of_parts:list, query_index:int) -> str:
    """ Join two consecutive parts of a query into one string (used when the citation has been discovered in two consecutive parts of the query document).

    The sentences of the first part are kept up to the first sentence that also occurs in the second part; the whole second part is then appended.

    :param list_of_parts: parts of the query document.
    :param query_index: index of the first of the two parts to join.
    :return: the joined text; when there is no part after query_index, the text of that single part.
    """
    sentences_in_1 = sent_tokenize(list_of_parts[query_index])

    # Only the lookup of the following part may legitimately fail (query_index is the last part).
    try:
        next_part = list_of_parts[query_index+1]
    except IndexError:
        next_part = None

    # Without a following part there is nothing to merge, so the first part is returned on its own.
    sentences_in_2 = sent_tokenize(next_part) if next_part is not None else []
    overlapping_sentences = set(sentences_in_2)

    leading_sentences = []
    for sent_1 in sentences_in_1:
        if sent_1 in overlapping_sentences:
            # From here on the text is already covered by the second part.
            break
        leading_sentences.append(sent_1)

    return ' '.join(leading_sentences + sentences_in_2).strip()

In [1]:
def fuzzy_string_matching_for_implementation_with_text(subverse_string:str, query_string:str, tolerance=0.85):
    """ 
    Typo-tolerant search for a biblical subverse in a query string.

    Contrary to fuzzy_string_matching_for_implementation(), this function also returns the matched part of the query string and the edit distance of the compared strings. The function is duplicated so as not to slow down the function in the broad search. However, the speed difference has not been tested yet.

    Both strings are normalized with bip.normalize_string() before they are compared.

    :param subverse_string: string of the biblical subverse we are searching for.
    :param query_string: string in which we are searching for the subverse_string.
    :param tolerance: how large proportion of the subverse_string must be present in query_string to consider it a match.
    :return: tuple (match found, matched part of the normalized query string, edit distance); (False, '', 0) when nothing matches.
    """
    no_match = (False, '', 0)

    normalized_subverse = bip.normalize_string(subverse_string)
    normalized_query = bip.normalize_string(query_string)
    subverse_len = len(normalized_subverse)
    query_len = len(normalized_query)

    # Highest edit distance that still counts as a match.
    max_distance = subverse_len * (1-tolerance)

    # A query string that is too short cannot contain the subverse.
    if query_len < subverse_len-max_distance:
        return no_match

    # Strings of roughly the same length are compared as a whole.
    if query_len <= subverse_len+max_distance:
        distance = Levenshtein.distance(normalized_subverse, normalized_query)
        if distance <= max_distance:
            return True, normalized_query, distance
        return no_match

    # Otherwise slide a window with as many words as the subverse over the query string.
    # Windows always start at a word boundary, which is quicker but may miss some matches.
    window_size = len(word_tokenize(normalized_subverse))
    query_words = word_tokenize(normalized_query)
    window_count = len(query_words) - window_size + 1

    for start in range(window_count):
        candidate = ' '.join(query_words[start:start+window_size])
        distance = Levenshtein.distance(normalized_subverse, candidate)
        if distance <= max_distance:
            return True, candidate, distance

    return no_match

In [14]:
""" Define mutually exclusive words in exclusives.txt """
# Build the lookup tables used by exclusiveness_test() from the file at EXCLUSIVES_PATH.
with open(EXCLUSIVES_PATH, 'r', encoding='utf-8') as exclusives_file:
    data = exclusives_file.read()
    # Each line of the file is one group of mutually exclusive words.
    words_lines = data.split('\n')

    # exclusives_dict: normalized word -> list of normalized words that exclude it.
    exclusives_dict = defaultdict(list)
    # list_of_exclusives: every normalized word read from the file (a word is appended once per occurrence).
    list_of_exclusives = []

    for line in words_lines:
        # Words of one group are separated by ', '.
        word_list = line.split(', ')
        for word_fro in word_list:
            for word_to in word_list:
                # A word is never exclusive with itself.
                if word_fro == word_to:
                    continue
                # 'je' and 'jest' are not registered as exclusive with each other, even when they share a line.
                if word_fro == 'je' and word_to == 'jest':
                    continue
                if word_fro == 'jest' and word_to == 'je':
                    continue
                else:
                    exclusives_dict[bip.normalize_string(word_fro)].append(bip.normalize_string(word_to))
            list_of_exclusives.append(bip.normalize_string(word_fro))

def exclusiveness_test(subverse_string:str, query_string:str) -> bool:
    """
    Check that a detected string is not a false positive caused by mutually exclusive words, e.g. naše vs. vaše, je vs. není etc.

    Both strings are normalized and tokenized, and the words are compared position by position: the test fails when a word of the subverse is listed in list_of_exclusives and the query word at the same position is one of the words registered for it in exclusives_dict.

    :param subverse_string: text of the biblical subverse.
    :param query_string: matched text from the query document.
    :return: False if a pair of mutually exclusive words is found at the same position, True otherwise.
    """
    subverse_words = bip.word_tokenize_no_punctuation(bip.normalize_string(subverse_string))
    query_words = bip.word_tokenize_no_punctuation(bip.normalize_string(query_string))

    # zip() stops at the shorter word list, so subverse words without a counterpart in the query are simply
    # not compared; no exception handling is needed and unrelated errors are no longer silenced.
    for subverse_word, query_word in zip(subverse_words, query_words):
        if subverse_word not in list_of_exclusives:
            continue
        if query_word in exclusives_dict[subverse_word]:
            return False

    return True

In [15]:
def is_next_result_continuation(results_dataframe, row_id:int) -> bool:
    """ Check if the next result is just the continuation of the given one, so that both can be checked together.

    The row labelled row_id+1 is a continuation when it has the same query file and verse ID and its query part index is exactly one higher.

    :param results_dataframe: results dataframe with the 'query_file', 'verse_id' and 'index_query_part' columns.
    :param row_id: index label of the current row.
    :return: True if the row labelled row_id+1 continues the current row; False otherwise, including when that row does not exist or an index cannot be converted to int.
    """
    # NOTE(review): row_id+1 is an index label, so a gap in the index (e.g. after drop_duplicates()) makes this
    # return False even if the following remaining row continues the match - confirm this is acceptable.
    try:
        next_row_id = row_id+1

        query_filename = results_dataframe.loc[row_id, 'query_file']
        verse_id = results_dataframe.loc[row_id, 'verse_id']
        query_idx = int(results_dataframe.loc[row_id, 'index_query_part'])

        next_query_filename = results_dataframe.loc[next_row_id, 'query_file']
        next_verse_id = results_dataframe.loc[next_row_id, 'verse_id']
        next_query_idx = int(results_dataframe.loc[next_row_id, 'index_query_part'])

        return bool(query_filename == next_query_filename and verse_id == next_verse_id and query_idx == next_query_idx-1)
    except (KeyError, ValueError, TypeError):
        # KeyError: one of the rows is missing; ValueError/TypeError: a value cannot be used as an integer index.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return False

In [16]:
def has_multiple_attribution(results_dataframe, row_id:int):
    """ Collect all verse IDs that are associated with the same part of a query as the given row.

    :param results_dataframe: results dataframe with the 'query_file', 'verse_id' and 'index_query_part' columns.
    :param row_id: index label of the inspected row.
    :return: tuple (verse IDs of all rows sharing the query file and query part index with the given row, list of (query_file, verse_id, index_query_part) tuples to skip).
    """
    query_filename = results_dataframe.loc[row_id, 'query_file']
    verse_id = results_dataframe.loc[row_id, 'verse_id']
    query_idx = results_dataframe.loc[row_id, 'index_query_part']

    same_file = results_dataframe['query_file'] == query_filename
    same_part = results_dataframe['index_query_part'] == query_idx
    same_query_part_df = results_dataframe[same_file & same_part]

    other_associated_verses = [
        same_query_part_df.loc[other_row_id, 'verse_id']
        for other_row_id in same_query_part_df.index
    ]

    # NOTE(review): every entry is the tuple of the *given* row (its own verse_id), repeated once per
    # matching row, not the tuple of the matching row - confirm that this is intended.
    rows_to_skip = [(query_filename, verse_id, query_idx)] * len(other_associated_verses)

    return other_associated_verses, rows_to_skip

In [17]:
def get_match_probability(subverses:list, fuzzy_matched_subs_num:int, exclusive_matched_subs_num: int, matched_subs_chars:int, matched_subs_edit_distance:int) -> str:
    """ Return a label that expresses the match probability. Implemented in check_results() function.

    The label is composed of three parts: the outcome of the exclusiveness tests, the share of matched characters and the share of matched subverses.

    :param subverses: all subverses of the verse.
    :param fuzzy_matched_subs_num: number of subverses found in the query string.
    :param exclusive_matched_subs_num: number of found subverses that passed the exclusiveness test.
    :param matched_subs_chars: summed length of the found subverses.
    :param matched_subs_edit_distance: summed edit distance of the found subverses.
    :return: label such as 'VERSE MATCH' or 'EXCLUSIVES ERROR, POSSIBLE SUBVERSE MATCH'.
    """
    matched_characters = (matched_subs_chars-matched_subs_edit_distance)/matched_subs_chars
    matched_subverses_score = fuzzy_matched_subs_num/len(subverses)
    exclusives_match = exclusive_matched_subs_num/fuzzy_matched_subs_num

    # All found subverses passed the exclusiveness test -> no prefix; none passed -> ERROR; some passed -> PROBLEM.
    if exclusives_match == 1:
        exclusives_part = ''
    elif exclusives_match == 0:
        exclusives_part = 'EXCLUSIVES ERROR, '
    else:
        exclusives_part = 'EXCLUSIVES PROBLEM, '

    # Less than 90 % of matched characters makes the match only a possible one.
    certainty_part = '' if matched_characters >= 0.9 else 'POSSIBLE '

    # The whole verse is matched only if every subverse has been found.
    extent_part = 'VERSE MATCH' if matched_subverses_score == 1 else 'SUBVERSE MATCH'

    return f'{exclusives_part}{certainty_part}{extent_part}'

In [18]:
def check_for_verse_in_translations(verse_id:str, string_to_check:str, journal:str, issue_date:str, issue_page_num:str, issue_uuid:str, kramerius_url:str) -> list:
    """ Perform the inner check for a verse in all available translations. It is implemented in the check_results() function.

    :param verse_id: ID of the verse to look for.
    :param string_to_check: part of the query document in which the verse is searched.
    :param journal: journal name, copied to every result.
    :param issue_date: date of the issue, copied to every result.
    :param issue_page_num: page number, copied to every result.
    :param issue_uuid: UUID of the issue, copied to every result.
    :param kramerius_url: URL of the issue, copied to every result.
    :return: list with one result dictionary per translation in which at least one subverse has been found.
    """
    possible_citations = []

    for trsl in bip.all_translations:
        verse_text = bip.get_verse_text(trsl, verse_id, print_exceptions=False)
        if not verse_text:
            continue

        subverses = bip.split_verse(verse_text, tole_len=21)

        found_subverses = []
        summed_edit_distance = 0
        summed_chars = 0
        passed_exclusiveness = 0

        for subverse in subverses:
            # Look for every subverse within the edit distance tolerance.
            is_found, found_text, distance = fuzzy_string_matching_for_implementation_with_text(subverse, query_string=string_to_check, tolerance=0.85)
            if not is_found:
                continue

            found_subverses.append(subverse)
            summed_edit_distance += distance
            summed_chars += len(subverse)

            # Run the exclusiveness test on the found text.
            if exclusiveness_test(subverse, found_text):
                passed_exclusiveness += 1

        found_count = len(found_subverses)
        if found_count == 0:
            continue

        match_probability = get_match_probability(subverses, found_count, passed_exclusiveness, summed_chars, summed_edit_distance)

        possible_citations.append({
            'verse_id': verse_id,
            'translation': trsl,
            'verse_text': verse_text,
            'matched_subvesrses': found_subverses,
            'query_string': string_to_check,
            'matched_characters': (summed_chars-summed_edit_distance)/summed_chars,
            'matched_subverses_score': found_count/len(subverses),
            'exclusives_match': passed_exclusiveness/found_count,
            'multiple_attribution': 'No',
            'match_probability': match_probability,
            'journal': journal,
            'date': issue_date,
            'page_num': issue_page_num,
            'uuid': issue_uuid,
            'kramerius_url': kramerius_url,
        })

    return possible_citations

In [19]:
def select_best_citations(possible_citations:list) -> list:
    """ Select the best match from the given list based on its score. If more verse IDs/translations score the same, all are counted in.

    The score of a candidate is the sum of its character score (max. 50), subverse score (max. 50), exclusiveness score (max. 20) and the value assigned to its match probability label.

    :param possible_citations: result dictionaries with the 'matched_characters', 'matched_subverses_score', 'exclusives_match' and 'match_probability' keys.
    :return: list of the top-scoring candidates without duplicates; an empty list if there are no candidates. When more than one candidate is returned, their 'multiple_attribution' value is set to 'Yes' (the input dictionaries are modified in place).
    """
    # TODO: Some citations are discarded unnecessarily - a single query sequence may contain several citations!
    # It would probably be better to discard only those with a very low score or an exclusives error... or to find out
    # whether the citations are located in different parts of the query document, but that may be needlessly
    # complicated ... a final manual check will probably be the best option!

    # Nothing to rank: max() below would fail on an empty list.
    if not possible_citations:
        return []

    # different match probabilities are differently valued
    match_probability_scores ={'VERSE MATCH': 100,
                               'POSSIBLE VERSE MATCH': 90,
                               'SUBVERSE MATCH': 90,                          
                               'POSSIBLE SUBVERSE MATCH': 80,
                               'EXCLUSIVES PROBLEM, VERSE MATCH': 85, 
                               'EXCLUSIVES PROBLEM, SUBVERSE MATCH': 70,
                               'EXCLUSIVES PROBLEM, POSSIBLE VERSE MATCH': 70,
                               'EXCLUSIVES PROBLEM, POSSIBLE SUBVERSE MATCH': 60,
                               'EXCLUSIVES ERROR, VERSE MATCH': 60,
                               'EXCLUSIVES ERROR, SUBVERSE MATCH': 30,
                               'EXCLUSIVES ERROR, POSSIBLE VERSE MATCH': 10,
                               'EXCLUSIVES ERROR, POSSIBLE SUBVERSE MATCH': 0}

    scores = [
        pos_cit['matched_characters']*50
        + pos_cit['matched_subverses_score']*50
        + pos_cit['exclusives_match']*20
        + match_probability_scores[pos_cit['match_probability']]
        for pos_cit in possible_citations
    ]
    top_score = max(scores)

    # Keep every candidate that reaches the top score, but each distinct result only once.
    best_citations = []
    for pos_cit, score in zip(possible_citations, scores):
        if score == top_score and pos_cit not in best_citations:
            best_citations.append(pos_cit)

    if len(best_citations) > 1:
        for bc in best_citations:
            bc['multiple_attribution'] = 'Yes'

    return best_citations

In [20]:
def load_metadata_from_json(json_path:str):
    """ Load all needed data from the selected JSON file.

    :param json_path: path to the JSON file of a query document.
    :return: tuple (journal, issue date, page number, issue UUID, Kramerius URL of the issue, full text of the document).
    """
    data = bip.load_json_data(json_path)

    needed_keys = ('journal', 'date', 'page_num', 'issue_uuid', 'text')
    journal, issue_date, issue_page, issue_uuid, full_query_string = (data[key] for key in needed_keys)

    # The URL is derived from the UUID of the issue.
    kramerius_url = f'https://kramerius5.nkp.cz/view/uuid:{issue_uuid}'

    return journal, issue_date, issue_page, issue_uuid, kramerius_url, full_query_string

In [21]:
def get_path_to_json_file(filename:str):
    """ Return the path to a specific JSON file based on its filename. It uses the batches.csv file for this.

    :param filename: name of the JSON file as listed in the 'json_file' column of batches.csv.
    :return: path composed of ALL_JSONS_PATH, the journal of the first matching row and the filename.
    """
    batches_df = pd.read_csv(BATCHES_FILE_PATH, quotechar='"', delimiter=',', encoding='utf-8')

    # The journal of the first row listing this file gives the name of the subfolder.
    rows_for_file = batches_df[batches_df['json_file'] == filename]
    journal_folder = rows_for_file.iloc[0]['journal']

    return os.path.join(ALL_JSONS_PATH, journal_folder, filename)



In [22]:
# TODO: DROP THIS? IT WILL NOT BE USED

# Columns of the dataframe built by check_results(); they correspond to the keys of the result dictionaries created in check_for_verse_in_translations().
# NOTE: the same name is assigned again (with a different set of columns) in later cells.
final_dataframe_column_names = ['verse_id', 'translation', 'verse_text', 'matched_subvesrses', 'query_string', 'matched_characters', 'matched_subverses_score', 'exclusives_match',  'multiple_attribution', 'match_probability', 'journal', 'date', 'page_num', 'uuid', 'kramerius_url']

def check_results(results_filename='batch_results.csv'):
    """ Apply further checks on the preliminary results and save the filtered results.

    Every row of the preliminary results is re-checked against all translations of all verses attributed to the same query part; only the best-scoring citations are kept. The output is written to the results folder as 'FILTERED_<results_filename>'.

    :param results_filename: filename of the preliminary results in the results folder.
    """
    # Load results:
    results_dataframe = load_results(results_filename)

    # Remove duplicate rows from the result dataframe
    print('Original size of the results dataframe:', len(results_dataframe))
    results_dataframe.drop_duplicates(subset=['verse_id', 'query_file', 'index_query_part'], keep='first', inplace=True)
    print('Size of the results dataframe after droping duplicates:', len(results_dataframe))

    # Result rows are collected in a list and converted to a dataframe once at the end
    # (DataFrame.append() copied the whole frame on every call and no longer exists in pandas 2.x).
    collected_rows = []

    # A set keeps the "already handled" test cheap even for large result files.
    rows_to_skip = set()

    # Metadata of the most recently loaded query file, so that consecutive rows of one file do not reload it.
    previous_file = None
    previous_metadata = None

    for row_id in results_dataframe.index:
        verse_id, query_filename, query_idx, _, _, query_window_len, query_overlap, _, _ = get_row_data(dataframe=results_dataframe, row_id=row_id)

        if (query_filename, verse_id, query_idx) in rows_to_skip:
            continue

        # If the file is different than the previous file, we need to load the data
        if query_filename != previous_file:
            json_path = get_path_to_json_file(query_filename)
            previous_metadata = load_metadata_from_json(json_path=json_path)
            previous_file = query_filename

        journal, issue_date, issue_page, issue_uuid, kramerius_url, full_query_string = previous_metadata
        query_parts = bip.split_query(full_query_string, window_len=query_window_len, overlap=query_overlap)

        if is_next_result_continuation(results_dataframe, row_id):
            # Check if the next result is just a continuation of the current one and if so, join them to one.
            string_to_check = join_overlap(query_parts, query_idx)
            rows_to_skip.add((query_filename, verse_id, query_idx+1))
        else:
            string_to_check = query_parts[query_idx]

        associated_verses, new_rows_to_skip = has_multiple_attribution(results_dataframe, row_id)
        rows_to_skip.update(new_rows_to_skip)

        possible_citations = []
        for asoc_verse_id in associated_verses:
            asoc_possible_citations = check_for_verse_in_translations(asoc_verse_id, string_to_check, journal, issue_date, issue_page, issue_uuid, kramerius_url)
            possible_citations.extend(asoc_possible_citations)

        # Nothing matched in any translation; there is nothing to rank or to store for this row.
        if not possible_citations:
            continue

        collected_rows.extend(select_best_citations(possible_citations))

    if collected_rows:
        final_results = pd.DataFrame(collected_rows)
        # Expected columns first; any additional key found in the results is kept after them.
        extra_columns = [col for col in final_results.columns if col not in final_dataframe_column_names]
        final_results = final_results.reindex(columns=list(final_dataframe_column_names) + extra_columns)
    else:
        final_results = pd.DataFrame(columns=final_dataframe_column_names)

    # Filter duplicates in df (same citations in one page):
    final_results = final_results.drop_duplicates(subset=['verse_id', 'translation', 'matched_characters', 'matched_subverses_score', 'page_num', 'uuid'])

    # Save filtered_results:
    output_filename = f'FILTERED_{results_filename}'
    final_results.to_csv(os.path.join(RESULTS_PATH, output_filename), sep=';', quotechar='"', encoding='utf-8')

In [23]:
def select_attributions_to_json(dataframe:pd.core.frame.DataFrame, query_file:str):
    """ Select all attributions to a given JSON file.

    :param dataframe: results dataframe with the 'query_file', 'verse_id' and 'index_query_part' columns.
    :param query_file: filename of the JSON file (query document).
    :return: tuple (mapping verse_id -> list of query part indexes attributed to it, row IDs to skip). For a file with a single result the second item is an empty list; otherwise it is the index of all rows of that file.
    """
    subset_dataframe = dataframe[dataframe['query_file'] == query_file]
    matching_row_ids = subset_dataframe.index

    # A single result: return it as it is, nothing else has to be skipped.
    if len(subset_dataframe) == 1:
        verse_id, index_query_part = get_verse_et_idx(subset_dataframe, matching_row_ids[0])
        return {verse_id: [index_query_part]}, []

    # More results (or none): group the query part indexes by verse ID.
    attributed_verses = defaultdict(list)
    for row_id in matching_row_ids:
        verse_id, index_query_part = get_verse_et_idx(dataframe=subset_dataframe, row_id=row_id)
        attributed_verses[verse_id].append(index_query_part)

    return attributed_verses, matching_row_ids


In [24]:
def _select_best_citation(possible_citations:list) -> dict:
    """ Return the strongest candidate: the highest share of matched subverses wins, ties are broken by the share of matched characters and then by the exclusiveness score; of fully tied candidates the first one is returned. """
    if not possible_citations:
        raise ValueError('The verse has not been matched in any translation.')

    # The ranking uses the numeric scores only; max() keeps the first of equally ranked candidates.
    return max(possible_citations, key=lambda pc: (pc['matched_subverses_score'], pc['matched_characters'], pc['exclusives_match']))


def check_for_verse(verse_id:str, string_to_check:str) -> dict:
    """ Check a verse against the given string in all available translations and return the best result.

    Translations as such are not evaluated; the best result of all translations is selected. The result with the highest share of detected subverses is considered the match; if equal, the share of matched characters decides, and finally the results of the exclusiveness test.

    :param verse_id: ID of the verse to look for.
    :param string_to_check: part of the query document in which the verse is searched.
    :return: result dictionary of the best candidate ('verse_id', 'verse_text', 'matched_subverses', 'query_string', 'matched_characters', 'matched_subverses_score', 'exclusives_match', 'match_probability').
    :raises ValueError: if no subverse of any translation has been found in the string.
    """
    possible_citations = []

    for trsl in bip.all_translations:
        verse_text = bip.get_verse_text(trsl, verse_id, print_exceptions=False)
        if not verse_text:
            continue

        subverses = bip.split_verse(verse_text, tole_len=21)

        fuzzy_matched_subs = []
        matched_subs_edit_distance = 0
        matched_subs_chars = 0
        exclusive_matched_subs_num = 0

        for subverse in subverses:
            # check for every subverse in edit distance
            fuzzy_match, query_match, edit_distance = fuzzy_string_matching_for_implementation_with_text(subverse, query_string=string_to_check, tolerance=0.85)
            if not fuzzy_match:
                continue

            fuzzy_matched_subs.append(subverse)
            matched_subs_edit_distance += edit_distance
            matched_subs_chars += len(subverse)

            # run the exclusiveness test
            if exclusiveness_test(subverse, query_match):
                exclusive_matched_subs_num += 1

        fuzzy_matched_subs_num = len(fuzzy_matched_subs)
        if fuzzy_matched_subs_num == 0:
            continue

        matched_characters = (matched_subs_chars-matched_subs_edit_distance)/matched_subs_chars
        matched_subverses_score = fuzzy_matched_subs_num/len(subverses)

        possible_citations.append({
            'verse_id': verse_id,
            'verse_text': verse_text,
            'matched_subverses': fuzzy_matched_subs,
            'query_string': string_to_check,
            'matched_characters': matched_characters,
            'matched_subverses_score': matched_subverses_score,
            'exclusives_match': exclusive_matched_subs_num/fuzzy_matched_subs_num,
            'match_probability': matched_characters*matched_subverses_score,
        })

    return _select_best_citation(possible_citations)

# TODO: Verify this function on R 13:1 and on uuid 0ee796e0-897b-11e6-84e2-005056827e51 (page 9) ... it seems to have detected a weaker version of the verse!!!

In [25]:
def evaluate_attributions_in_doc(attributed_verses:dict, query_file:str, query_window_len:int, query_overlap:int) -> list:
    """ Evaluate attributed verses, supposedly detected in a single JSON file.

    For every verse, each attributed query part is checked with check_for_verse(). Two neighbouring entries whose indexes are consecutive are joined with join_overlap() and checked as one string.

    :param attributed_verses: mapping verse_id -> list of indexes of the query parts attributed to the verse.
    :param query_file: filename of the JSON file (query document).
    :param query_window_len: window length used to split the query document.
    :param query_overlap: overlap used to split the query document.
    :return: list of result dictionaries, each extended with 'multiple_attribution' and the metadata of the document ('journal', 'date', 'page_num', 'uuid', 'kramerius_url').
    """
    # Load data from JSON file:
    path_to_json = get_path_to_json_file(query_file)
    journal, issue_date, issue_page, issue_uuid, kramerius_url, full_query_string = load_metadata_from_json(json_path=path_to_json)

    query_parts = bip.split_query(full_query_string, window_len=query_window_len, overlap=query_overlap)

    results_of_attributions = []

    for verse_id, attributed_idxs in attributed_verses.items():
        skip_next = False
        for i, q_idx in enumerate(attributed_idxs):
            if skip_next:
                # This part has already been checked together with the previous one.
                skip_next = False
                continue

            has_next = i+1 < len(attributed_idxs)
            if has_next and attributed_idxs[i+1] == q_idx+1:
                # The next attributed part directly follows this one: check both as a joined sequence.
                skip_next = True
                string_to_check = join_overlap(query_parts, q_idx)
            else:
                # Stand-alone part: check the part at the current index.
                string_to_check = query_parts[q_idx]

            possible_citation = check_for_verse(verse_id=verse_id, string_to_check=string_to_check)
            results_of_attributions.append(possible_citation)

    # TODO: add all the other parameters of the found citation here --> they are then returned and added to the final DF.
    # The document has a multiple attribution unless exactly one result has been found in it.
    multiple_attribution = len(results_of_attributions) != 1
    for res in results_of_attributions:
        res['multiple_attribution'] = multiple_attribution
        res['journal'] = journal
        res['date'] = issue_date
        res['page_num'] = issue_page
        res['uuid'] = issue_uuid
        res['kramerius_url'] = kramerius_url

    return results_of_attributions
            

In [26]:
# Keys of the result dictionaries returned by evaluate_attributions_in_doc() (no 'translation' column here).
# NOTE: this reassigns the name defined in an earlier cell; check_results_2() below does not reference it.
final_dataframe_column_names = ['verse_id', 'verse_text', 'matched_subverses', 'query_string', 'matched_characters', 'matched_subverses_score', 'exclusives_match',  'multiple_attribution', 'match_probability', 'journal', 'date', 'page_num', 'uuid', 'kramerius_url']


def check_results_2(results_filename='batch_results.csv', save=True, return_df=False):
    """ Apply further checks on the preliminary results, evaluating each query document as a whole.

    All rows attributed to one query file are evaluated together with evaluate_attributions_in_doc(); the remaining rows of that file are then skipped.

    :param results_filename: filename of the preliminary results in the results folder.
    :param save: if True, the results are written to the results folder as 'FILTERED_<results_filename>'.
    :param return_df: if True, the resulting dataframe is returned.
    :return: the resulting dataframe if return_df is True, otherwise None.
    """
    # Load results:
    results_dataframe = load_results(results_filename)

    # Remove duplicate rows from the result dataframe
    print('Original size of the results dataframe:', len(results_dataframe))
    results_dataframe.drop_duplicates(subset=['verse_id', 'query_file', 'index_query_part'], keep='first', inplace=True)
    print('Size of the results dataframe after droping duplicates:', len(results_dataframe))

    # Results are collected under consecutive integer keys (one key per result).
    final_results = {}

    # A set keeps the membership test cheap; with a list the loop was quadratic in the number of rows.
    rows_to_skip = set()
    total_rows = len(results_dataframe)

    for iter_, row_id in enumerate(results_dataframe.index, start=1):
        # Report the progress after every 500 rows.
        if iter_ % 500 == 0:
            print(iter_, '/', total_rows)

        # Rows of an already evaluated file are skipped before any of their data is read.
        if row_id in rows_to_skip:
            continue

        # TODO: write a function that does not read all of these values; most of them are not needed here.
        _, query_file, _, _, _, query_window_len, query_overlap, _, _ = get_row_data(dataframe=results_dataframe, row_id=row_id)

        attributed_verses, add_to_skip = select_attributions_to_json(dataframe=results_dataframe, query_file=query_file)
        rows_to_skip.update(add_to_skip)

        results = evaluate_attributions_in_doc(attributed_verses=attributed_verses, query_file=query_file, query_window_len=query_window_len, query_overlap=query_overlap)

        for res in results:
            final_results[len(final_results)] = res

    # The dictionary keys become columns, so the frame is transposed to get one row per result.
    final_results_df = pd.DataFrame.from_dict(final_results)
    final_results_df = final_results_df.transpose()

    if save:
        final_results_df.to_csv(os.path.join(RESULTS_PATH, f'FILTERED_{results_filename}'), encoding='utf-8', quotechar='"', sep=';')

    if return_df:
        return final_results_df

In [None]:
# Filter the default results file and keep the resulting dataframe; the FILTERED_ CSV is written as well because save defaults to True.
final_results_df = check_results_2(return_df=True)

In [27]:
# NOTE: unlike in previous versions, the used translation is ignored, because we do not have good data to evaluate this issue well.

# Same column list as in the previous cell, assigned again; make_full_search_dataframe() below does not reference it.
final_dataframe_column_names = ['verse_id', 'verse_text', 'matched_subverses', 'query_string', 'matched_characters', 'matched_subverses_score', 'exclusives_match',  'multiple_attribution', 'match_probability', 'journal', 'date', 'page_num', 'uuid', 'kramerius_url']


def make_full_search_dataframe(results_filename='batch_results.csv', save=True, return_df=False):
    """ Build a table of the preliminary (unfiltered) search results enriched with book, journal and date.

    The results are loaded with load_results(), rows repeating the same
    combination of 'verse_id', 'query_file' and 'index_query_part' are dropped
    (the first occurrence is kept), and each remaining row gets the keys
    'verse_id', 'book', 'journal' and 'date' set.

    :param results_filename: name of the results file handed to load_results().
    :param save: if True, the table is written into RESULTS_PATH as 'UNFILTERED_{results_filename}' (';' separated).
    :param return_df: if True, the resulting dataframe is returned; otherwise nothing is returned.
    """
    raw_results = load_results(results_filename)

    # Only one row per (verse, query file, query part) combination is kept.
    print('Original size of the results dataframe:', len(raw_results))
    unique_results = raw_results.drop_duplicates(subset=['verse_id', 'query_file', 'index_query_part'], keep='first')
    print('Size of the results dataframe after droping duplicates:', len(unique_results))

    enriched_rows = {}
    for row_id in unique_results.index:
        # Only the verse id and the query file are needed from the search properties.
        verse_id, query_file, *_ = get_row_data(dataframe=unique_results, row_id=row_id)

        # Journal name and issue date come from the JSON of the query document.
        json_path = get_path_to_json_file(query_file)
        journal, issue_date, *_ = load_metadata_from_json(json_path=json_path)

        row_dict = unique_results.loc[row_id].to_dict()
        row_dict.update({'verse_id': verse_id, 'book': bip.get_book_id(verse_id)})
        row_dict['journal'] = journal
        row_dict['date'] = issue_date

        enriched_rows[len(enriched_rows)] = row_dict

    # The dict is keyed by row number, so it has to be transposed to get one row per result.
    final_results_df = pd.DataFrame.from_dict(enriched_rows).T

    if save:
        target_path = os.path.join(RESULTS_PATH, f'UNFILTERED_{results_filename}')
        final_results_df.to_csv(target_path, encoding='utf-8', quotechar='"', sep=';')

    if return_df:
        return final_results_df


In [28]:
make_full_search_dataframe('batch_results.csv') 

Original size of the results dataframe: 56741
Size of the results dataframe after droping duplicates: 37889


In [52]:
def repair_missing_date(results_filename='batch_results.csv', save=True, return_df=False):
    """ Replace the date range '27.-30.06.1927' in the 'date' column by the single date '30.06.1927'.

    The range belongs to the issue with uuid 334149b0-877c-11e6-8aeb-5ef3fc9ae867.
    After the replacement, the date of every row with this uuid is printed as a check.

    :param results_filename: ';' separated csv file in RESULTS_PATH to be repaired.
    :param save: if True, the repaired table is written into RESULTS_PATH as 'DATECHANGE_{results_filename}'.
    :param return_df: if True, the repaired dataframe is returned; otherwise nothing is returned.
    """
    source_path = os.path.join(RESULTS_PATH, results_filename)
    results_dataframe = pd.read_csv(source_path, quotechar='"', delimiter=';', encoding='utf-8', index_col=0)

    repaired_dates = results_dataframe['date'].replace('27.-30.06.1927', '30.06.1927')
    results_dataframe['date'] = repaired_dates

    # Print the dates of the affected issue so the repair can be checked by eye.
    is_affected_issue = results_dataframe['uuid'] == '334149b0-877c-11e6-8aeb-5ef3fc9ae867'
    for repaired_date in results_dataframe.loc[is_affected_issue, 'date']:
        print(repaired_date)

    if save:
        target_path = os.path.join(RESULTS_PATH, f'DATECHANGE_{results_filename}')
        results_dataframe.to_csv(target_path, encoding='utf-8', quotechar='"', sep=';')

    if return_df:
        return results_dataframe

# TODO: rather change this date directly in the JSON files (for the published final version).

In [51]:
repair_missing_date('FILTERED_batch_results_official.csv')

30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927
30.06.1927


# Helps for by-hand evaluation
Add some description why and what to do...

## Filtering stop subverses
- There are some limits - sometimes only a section is detected because a poor translation was used... then we are at risk of losing a citation (but the general tendency is to have the sure citations and not necessarily all of them...)
- There is a chance that this also solves some of the multiple attributions (only the best result may remain)
- Usually, when we lost a Biblical reference in this step, it is at worst at the level of allusions, not direct citations (but sure, we have to be careful in the selections)
- this simple step (selecting some 326 most "meaningless" verses) filtered out 10717 detected citations... well, another 19510 still remained (before: 28674)

### OVĚŘIT:
- speciálně ověřit výskyt: ['Ty vrchole dokonalosti'] (Ez 28:12), zda to rčení nemá právě oporu v biblickém textu, tedy by bylo často používané, a někdy i biblicky?

In [29]:
def filter_stop_subs(results_filename='batch_results.csv', input_df=False, subverse_len=21, rewrite_original_csv=False, save=True, return_df=False):
    """ Drop the results whose 'matched_subverses' value is listed in the stop-subverses file.

    The stop list is read from 'evaluation_stop_subverses_{subverse_len}.txt' in ROOT_PATH,
    one entry per line. Blank lines (e.g. the one produced by a trailing newline) are ignored,
    so they neither inflate the reported count nor act as a stop entry.

    :param results_filename: ';' separated csv file in RESULTS_PATH; read only when no input_df is given, and used to name the outputs.
    :param input_df: dataframe to filter instead of reading results_filename (False or None means "read the file").
    :param subverse_len: number used in the name of the stop-subverses file.
    :param rewrite_original_csv: if True, the filtered table overwrites results_filename in RESULTS_PATH.
    :param save: if True, the filtered table is written into RESULTS_PATH as 'ST_SUBS_{results_filename}'.
    :param return_df: if True, the filtered dataframe (re-indexed from 0, columns and dtypes preserved) is returned.
    """
    if input_df is None or input_df is False:
        original_df = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=';', encoding='utf-8', index_col=0)
    else:
        original_df = input_df

    print('Length of the original dataframe is:', len(original_df))

    stops_path = os.path.join(ROOT_PATH, f'evaluation_stop_subverses_{subverse_len}.txt')
    with open(stops_path, 'r', encoding='utf-8') as stops_f:
        # A set gives constant-time membership tests; blank lines are not stop subverses.
        stop_subs = {line for line in stops_f.read().split('\n') if line.strip()}

    print('Number of stop subverses to filter:', len(stop_subs))

    # Selecting with a mask keeps the columns even when every row is filtered out.
    is_stop = original_df['matched_subverses'].isin(stop_subs)
    filtered_df = original_df[~is_stop].reset_index(drop=True)

    print('Length of the filtered dataframe is:', len(filtered_df))
    print('Number of filtered rows:', len(original_df)-len(filtered_df))

    if rewrite_original_csv:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', sep=';', encoding='utf-8')

    if save:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, f'ST_SUBS_{results_filename}'), quotechar='"', sep=';', encoding='utf-8')

    if return_df:
        return filtered_df
    

In [44]:
filtered_df = filter_stop_subs(results_filename='FILTERED_batch_results_all_dates.csv', return_df=True)
filtered_df.head()

Length of the original dataframe is: 28674
Number of stop subverses to filter: 327
Length of the filtered dataframe is: 17956
Number of filtered rows: 10718


Unnamed: 0,verse_id,verse_text,matched_subverses,query_string,matched_characters,matched_subverses_score,exclusives_match,match_probability,multiple_attribution,journal,date,page_num,uuid,kramerius_url
0,Mt 5:38,"Slyšeli ste, že řečeno bylo: Oko za oko, a zub...",['Slyšeli ste že řečeno bylo'],"Proč neúnavně hlásal, že Starý zákon byl jen p...",0.961538,0.333333,1.0,0.320513,PRAVDA,Čech,28.07.1927,3,00114990-8782-11e6-8aeb-5ef3fc9ae867,https://kramerius5.nkp.cz/view/uuid:00114990-8...
1,Mt 5:43,Slyšeli ste že řečeno bylo: Milovati budeš bli...,['Slyšeli ste že řečeno bylo'],"Proč neúnavně hlásal, že Starý zákon byl jen p...",0.961538,0.333333,1.0,0.320513,PRAVDA,Čech,28.07.1927,3,00114990-8782-11e6-8aeb-5ef3fc9ae867,https://kramerius5.nkp.cz/view/uuid:00114990-8...
2,1J 5:6,"Ježíš Kristus jest ten, který přišel skrze vod...",['Ježíš Kristus jest ten'],Týž Fr. W. Förster píše: „To je právě zatwzelo...,0.863636,0.25,1.0,0.215909,PRAVDA,Čech,28.07.1927,3,00114990-8782-11e6-8aeb-5ef3fc9ae867,https://kramerius5.nkp.cz/view/uuid:00114990-8...
3,Jud 11:21,"Splní-li mi to Bůh tvůj, co dobrého mi slibuje...",['co dobrého mi slibuješ'],"a tleskal, až mne dlaně pálily, ale cr, Herold...",0.863636,0.2,1.0,0.172727,NEPRAVDA,Čech,01.01.1926,3,002dc620-8bff-11e6-8aeb-5ef3fc9ae867,https://kramerius5.nkp.cz/view/uuid:002dc620-8...
4,R 2:12,"Ti totiž, kteří zhřešili bez zákona, bez zákon...",['zákonem budou odsouzeni'],"Paní Vršíčková nepustivší vaničky z levé ruky,...",0.869565,0.166667,1.0,0.144928,NEPRAVDA,Čech,01.01.1926,4,002dc620-8bff-11e6-8aeb-5ef3fc9ae867,https://kramerius5.nkp.cz/view/uuid:002dc620-8...


## Multiple attributions
- Multiple attributions account for more than half of the attributions; this is something that can be resolved by a script. Here is a suggested solution.

### Steps:
- go by rows.
- if a row has True for multiple attribution, select all other verses that are attributed to the same passage - uuid, page_num, journal, but also the query_string!
- the database will have another column added: "drop?"
- select the attributed citation with the best match_probability and give False to its "drop?"
- give True "drop?" to the rest --> then just quickly check them manually.
- also add a column "CITATION" - where the final evaluation will be applied
- hand in hand with this process, evaluate those verses, where we are sure it is a citation
    - criteria:
        - all subverses are present (when there are 1 to 3 subverses in verse); or at least 50% of subverses are present (4 and more subverses in verse)
        - 


In [54]:
def select_multiply_attributed_rows(dataframe:pd.core.frame.DataFrame, row_id):
    """ Select all multiply attributed rows that belong to the same passage as the given row.

    A row is selected when it has the same 'uuid' and the same 'query_string' as
    the row row_id and its 'multiple_attribution' value equals True.

    :param dataframe: results dataframe with the columns 'uuid', 'query_string' and 'multiple_attribution'.
    :param row_id: index label of the reference row.
    :return: the selected sub-dataframe and its index (the row ids the caller may skip).
    """
    reference_row = dataframe.loc[row_id]

    same_document = dataframe['uuid'] == reference_row['uuid']
    flagged_as_multiple = dataframe['multiple_attribution'] == True  # noqa: E712 - element-wise comparison
    same_passage = dataframe['query_string'] == reference_row['query_string']

    other_attributions_df = dataframe[same_document & flagged_as_multiple & same_passage]

    return other_attributions_df, other_attributions_df.index

In [31]:
def evaluate_multiple_attributions(subset_dataframe:pd.core.frame.DataFrame):
    """ Decide the 'drop?' value of every row in a group of competing attributions.

    Rows whose 'match_probability' equals the highest value in the group get
    'drop?' = False (ties are all kept); all other rows get 'drop?' = True.

    :param subset_dataframe: rows attributed to the same passage; needs the column 'match_probability'.
    :return: list of the rows as dicts (in the original order, with the key 'drop?' added) and the number of rows marked for dropping.
    """
    row_labels = list(subset_dataframe.index)
    scores = [subset_dataframe.loc[label]['match_probability'] for label in row_labels]
    top_score = max(scores)

    output_dicts = []
    num_of_rows_to_drop = 0

    for label, score in zip(row_labels, scores):
        is_best = score == top_score

        record = subset_dataframe.loc[label].to_dict()
        record['drop?'] = not is_best
        output_dicts.append(record)

        if not is_best:
            num_of_rows_to_drop += 1

    return output_dicts, num_of_rows_to_drop


In [32]:
def resolve_multiple_attributions(results_filename='ST_SUBS_FILTERED_batch_results.csv', input_df=False, rewrite_original_csv=False, save=True, return_df=False):
    """ Suggest which of the competing attributions of one passage is the right one.

    Every row gets a new key 'drop?'. Rows that are not multiply attributed, or that
    turn out to be the only member of their group, get 'drop?' = False. Groups with more
    members are passed to evaluate_multiple_attributions(), which decides the 'drop?' values;
    the whole group is written to the output at the position of its first member.

    A row whose 'multiple_attribution' value is truthy but does not equal True (e.g. a missing
    value) ends up with an empty group; it is kept with 'drop?' = False instead of raising.

    :param results_filename: ';' separated csv file in RESULTS_PATH; read only when no input_df is given, and used to name the outputs.
    :param input_df: dataframe to process instead of reading results_filename (False or None means "read the file").
    :param rewrite_original_csv: if True, the output overwrites results_filename in RESULTS_PATH.
    :param save: if True, the output is written into RESULTS_PATH as 'MA_{results_filename}'.
    :param return_df: if True, the output dataframe is returned; otherwise nothing is returned.
    """
    if input_df is None or input_df is False:
        original_df = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=';', encoding='utf-8', index_col=0)
    else:
        original_df = input_df

    output_df_dict = {}
    num_of_rows_to_drop = 0

    # A set keeps the "already handled" test constant-time even for large result tables.
    rows_to_skip = set()

    for row_id in original_df.index:
        if row_id in rows_to_skip:
            continue

        row = original_df.loc[row_id]
        rows_to_add = None

        if row['multiple_attribution']:
            other_attributions_df, add_to_skip = select_multiply_attributed_rows(dataframe=original_df, row_id=row_id)
            rows_to_skip.update(add_to_skip)
            # Only real groups need evaluation; a lone (or unmatched) row is simply kept.
            if len(other_attributions_df) > 1:
                rows_to_add, rows_to_drop_count = evaluate_multiple_attributions(subset_dataframe=other_attributions_df)
                num_of_rows_to_drop += rows_to_drop_count

        if rows_to_add is None:
            row_as_dict = row.to_dict()
            row_as_dict['drop?'] = False
            rows_to_add = [row_as_dict]

        for rta in rows_to_add:
            output_df_dict[len(output_df_dict)] = rta

    filtered_df = pd.DataFrame.from_dict(output_df_dict)
    filtered_df = filtered_df.transpose()

    print('Number of rows selected for drop:', num_of_rows_to_drop, 'out of', len(original_df))

    if rewrite_original_csv:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', sep=';', encoding='utf-8')

    if save:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, f'MA_{results_filename}'), quotechar='"', sep=';', encoding='utf-8')

    if return_df:
        return filtered_df

In [None]:
resolve_multiple_attributions()

In [34]:
def mark_sure_citations(results_filename='MA_ST_SUBS_FILTERED_batch_results.csv', input_df=False, rewrite_original_csv=False, save=True, return_df=False):
    """ Add the column 'CITATION' that tells whether a row is considered a sure citation.

    A row is a sure citation when its 'exclusives_match' equals 1 and its
    'matched_subverses_score' satisfies a rule that depends on the number of subverses
    of the verse (as given by bip.split_verse with tole_len=21):
        - up to 2 subverses: the score must equal 1;
        - 3 or 4 subverses: the score must be at least 0.8;
        - 5 and more subverses: the score must be at least 0.5.

    :param results_filename: ';' separated csv file in RESULTS_PATH; read only when input_df is False, and used to name the outputs.
    :param input_df: dataframe to process instead of reading results_filename.
    :param rewrite_original_csv: if True, the output overwrites results_filename in RESULTS_PATH.
    :param save: if True, the output is written into RESULTS_PATH as 'FINAL_{results_filename}'.
    :param return_df: if True, the output dataframe is returned; otherwise nothing is returned.
    """
    if input_df is False:
        source_path = os.path.join(RESULTS_PATH, results_filename)
        original_df = pd.read_csv(source_path, quotechar='"', delimiter=';', encoding='utf-8', index_col=0)
    else:
        original_df = input_df

    marked_rows = {}
    num_of_sure_citations = 0

    for row_id in original_df.index:
        row_as_dict = original_df.loc[row_id].to_dict()

        subverse_count = len(bip.split_verse(row_as_dict['verse_text'], tole_len=21))
        score = row_as_dict['matched_subverses_score']

        # The shorter the verse, the bigger part of it has to be matched.
        if subverse_count <= 2:
            score_is_sufficient = score == 1
        elif subverse_count <= 4:
            score_is_sufficient = score >= 0.8
        else:
            score_is_sufficient = score >= 0.5

        is_sure = bool(score_is_sufficient and row_as_dict['exclusives_match'] == 1)
        row_as_dict['CITATION'] = is_sure
        if is_sure:
            num_of_sure_citations += 1

        marked_rows[len(marked_rows)] = row_as_dict

    # The dict is keyed by row number, so it has to be transposed to get one row per result.
    filtered_df = pd.DataFrame.from_dict(marked_rows).T

    print('Number of rows selected as sure citations:', num_of_sure_citations, 'out of', len(original_df))

    if rewrite_original_csv:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', sep=';', encoding='utf-8')

    if save:
        filtered_df.to_csv(os.path.join(RESULTS_PATH, f'FINAL_{results_filename}'), quotechar='"', sep=';', encoding='utf-8')

    if return_df:
        return filtered_df


In [353]:
mark_sure_citations('MA_ST_SUBS_FILTERED_batch_results_2.csv')

Number of rows selected as sure citations: 1923 out of 17957


### Run all improvements at once

In [59]:
def evaluate_filtered_citations(results_filename='FILTERED_batch_results.csv', save=True, return_df=False, save_steps=False):
    """ Run the whole evaluation chain on the preliminarily discovered citations.

    The steps are, in this order: filter_stop_subs(), resolve_multiple_attributions()
    and mark_sure_citations(); each step works on the dataframe returned by the previous one.

    :param results_filename: name of the results file the chain starts from.
    :param save: passed to mark_sure_citations() - whether the final table is saved.
    :param return_df: if True, the final dataframe is returned; otherwise nothing is returned.
    :param save_steps: passed as 'save' to the first two steps - whether the intermediate tables are saved.
    """
    without_stop_subs = filter_stop_subs(
        results_filename=results_filename,
        save=save_steps,
        return_df=True,
    )
    resolved_attributions = resolve_multiple_attributions(
        results_filename=f'ST_SUBS_{results_filename}',
        input_df=without_stop_subs,
        save=save_steps,
        return_df=True,
    )
    final_df = mark_sure_citations(
        results_filename=results_filename,
        input_df=resolved_attributions,
        save=save,
        return_df=return_df,
    )

    return final_df if return_df else None

In [81]:
evaluate_filtered_citations('FILTERED_batch_results_full.csv', save_steps=True)

Length of the original dataframe is: 28666
Number of stop subverses to filter: 327
Length of the filtered dataframe is: 17948
Number of filtered rows: 10718
Number of rows selected for drop: 4658 out of 17948
Number of rows selected as sure citations: 926 out of 17948


In [61]:
def filter_only_sure_citations(results_filename='FINAL_FILTERED_batch_results.csv', save=True, return_df=False):
    """ Keep only the rows marked as sure citations that are not marked for dropping.

    A row is kept when its 'CITATION' value equals True and its 'drop?' value equals False.
    Note that the input is read as a ';' separated csv while the output is written ',' separated.

    :param results_filename: ';' separated csv file in RESULTS_PATH with the columns 'CITATION' and 'drop?'.
    :param save: if True, the kept rows are written into RESULTS_PATH as 'CITATIONS_{results_filename}'.
    :param return_df: if True, the dataframe of kept rows is returned; otherwise nothing is returned.
    """
    source_path = os.path.join(RESULTS_PATH, results_filename)
    original_df = pd.read_csv(source_path, quotechar='"', delimiter=';', encoding='utf-8', index_col=0)

    kept_rows = {}
    for row_id in original_df.index:
        row = original_df.loc[row_id]
        is_sure_citation = row['CITATION'] == True and row['drop?'] == False  # noqa: E712
        if not is_sure_citation:
            continue
        kept_rows[len(kept_rows)] = row.to_dict()

    # The dict is keyed by row number, so it has to be transposed to get one row per citation.
    output_df = pd.DataFrame.from_dict(kept_rows).T

    if save:
        target_path = os.path.join(RESULTS_PATH, f'CITATIONS_{results_filename}')
        output_df.to_csv(target_path, quotechar='"', sep=',', encoding='utf-8')

    if return_df:
        return output_df


In [85]:
filter_only_sure_citations(results_filename='FINAL_FILTERED_batch_results_full.csv')

In [63]:
def change_date_to_year(results_filename='CITATIONS_FINAL_FILTERED_batch_results.csv'):
    """ Replace every value of the 'date' column by its year and save the table.

    The dates are dot-separated strings such as '28.07.1927'. The year is taken as the
    LAST dot-separated part, so that date ranges such as '27.-30.06.1927' also give
    the year ('1927') and not the month.

    The result is written into RESULTS_PATH as 'YEAR_{results_filename}' (',' separated).

    :param results_filename: ',' separated csv file in RESULTS_PATH with a 'date' column of date strings.
    """
    original_df = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=',', encoding='utf-8', index_col=0)

    output_df_dict = {}

    for out_idx, row_id in enumerate(original_df.index):
        row_dict = original_df.loc[row_id].to_dict()

        # Taking the last part (instead of the third one) is what makes date ranges work.
        row_dict['date'] = row_dict['date'].strip().split('.')[-1]

        output_df_dict[out_idx] = row_dict

    output_df = pd.DataFrame.from_dict(output_df_dict)
    output_df = output_df.transpose()

    output_df.to_csv(os.path.join(RESULTS_PATH, f'YEAR_{results_filename}'), quotechar='"', sep=',', encoding='utf-8')


In [86]:
change_date_to_year(results_filename='CITATIONS_FINAL_FILTERED_batch_results_full.csv')

In [67]:
def make_table_by_years_for_verse(results_filename='YEAR_CITATIONS_FINAL_FILTERED_batch_results_full.csv', years=None):
    """ Make a csv with one row per cited verse and the number of its citations in the individual years.

    The verses are listed in the order of their first appearance in the input, so the
    output is the same on every run. Rows with a missing 'verse_id' are ignored.
    The table is written into RESULTS_PATH as 'citations_by_year.csv' (',' separated).

    :param results_filename: ',' separated csv file in RESULTS_PATH with the columns 'verse_id' and 'date' (the year).
    :param years: iterable of years to make columns for; the default is 1925 to 1939 (inclusive).
    """
    original_df = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=',', encoding='utf-8', index_col=0)

    years = list(range(1925, 1940)) if years is None else list(years)

    # unique() keeps the order of appearance, unlike a set.
    all_cited_verses = original_df['verse_id'].dropna().unique()

    output_rows = []

    for verse_id in all_cited_verses:
        verse_dates = original_df.loc[original_df['verse_id'] == verse_id, 'date']
        verse_dict = {'verse_id': verse_id}
        for year in years:
            verse_dict[year] = int((verse_dates == year).sum())

        output_rows.append(verse_dict)

    # Naming the columns explicitly keeps the header even when there are no verses at all.
    output_df = pd.DataFrame(output_rows, columns=['verse_id', *years])

    output_df.to_csv(os.path.join(RESULTS_PATH, 'citations_by_year.csv'), quotechar='"', sep=',', encoding='utf-8')

In [87]:
make_table_by_years_for_verse()

In [73]:
def aggregate_citations(results_filename='YEAR_CITATIONS_FINAL_FILTERED_batch_results_full.csv'):
    """ Make a csv with one row per cited verse: its id, the total number of its citations and its text.

    The verse text is taken from the first row of the given verse. The verses are listed
    in the order of their first appearance in the input, so the output is the same on
    every run. Rows with a missing 'verse_id' are ignored (they cannot be matched to any verse).
    The table is written into RESULTS_PATH as 'aggregated_citations.csv' (',' separated).

    :param results_filename: ',' separated csv file in RESULTS_PATH with the columns 'verse_id' and 'verse_text'.
    """
    original_df = pd.read_csv(os.path.join(RESULTS_PATH, results_filename), quotechar='"', delimiter=',', encoding='utf-8', index_col=0)

    # unique() keeps the order of appearance, unlike a set.
    all_cited_verses = original_df['verse_id'].dropna().unique()

    output_rows = []

    for verse_id in all_cited_verses:
        verse_df = original_df[original_df['verse_id'] == verse_id]
        output_rows.append({
            'verse_id': verse_id,
            'count': len(verse_df),
            'verse_text': verse_df.iloc[0]['verse_text'],
        })

    # Naming the columns explicitly keeps the header even when there are no verses at all.
    output_df = pd.DataFrame(output_rows, columns=['verse_id', 'count', 'verse_text'])

    output_df.to_csv(os.path.join(RESULTS_PATH, 'aggregated_citations.csv'), quotechar='"', sep=',', encoding='utf-8')

In [88]:
aggregate_citations()

In [79]:
def filter_Gn_1_1(results_filename):
    """ Remove the rows whose 'verse_text' is 'První Knihy Mojžíšovy' and overwrite the file.

    These rows come from Gn 1:1 in the BSV translation, where the verse text was wrong.
    The function prints, in this order: the number of rows read, the number of removed rows,
    the number of remaining rows, and finally (remaining + removed) next to the number
    of rows read, which have to be equal.

    :param results_filename: ';' separated csv file in RESULTS_PATH; it is rewritten in place.
    """
    csv_path = os.path.join(RESULTS_PATH, results_filename)
    df = pd.read_csv(csv_path, quotechar='"', delimiter=';', encoding='utf-8', index_col=0)

    print(len(df))

    kept_rows = {}
    skipped = 0
    for row_id in df.index:
        row_dict = df.loc[row_id].to_dict()
        if row_dict['verse_text'] != 'První Knihy Mojžíšovy':
            kept_rows[len(kept_rows)] = row_dict
        else:
            skipped += 1

    print(skipped)

    # The dict is keyed by row number, so it has to be transposed to get one row per result.
    output_df = pd.DataFrame.from_dict(kept_rows).T

    print(len(output_df))

    # Sanity check: nothing got lost.
    print((len(output_df)+skipped), (len(df)))

    output_df.to_csv(csv_path, quotechar='"', sep=';', encoding='utf-8')

In [80]:
filter_Gn_1_1('FILTERED_batch_results_full.csv')

28674
8
28666
28674 28674


## "Same" verses
- this part connects those verses that have the same text...

In [None]:
# Groups of verses that have the same text. Every group is stored under a key
# made of its verse ids joined by '/'.
mutual_verses = {
    '/'.join(verse_ids): verse_ids
    for verse_ids in (
        ['L 11:3', 'Mt 6:11'],
        ['Mk 13:31', 'Mt 24:35', 'L 21:33'],
        ['Ex 20:16', 'Dt 5:20'],
        ['2K 1:2', 'Fp 1:2', '2Te 1:2', '1K 1:3', 'Ef 1:2', 'Ga 1:3'],
        ['Mt 11:15', 'Mt 13:9'],
    )
}