In [13]:
'''imports'''

import collections
import os
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def make_soup(file_path):
    '''Parse the XML document at file_path and return a soup object for the whole file.

    The file is opened as UTF-8 text, read in one go and handed to bs together
    with the "lxml" parser name.'''
    with open(file_path, "r", encoding='utf-8') as xml_file:
        raw_markup = xml_file.read()

    # The markup is already a plain string here, so parsing does not need the open handle.
    return bs(raw_markup, "lxml")

In [51]:
def indexed_kopie_line_dict(bs_content):
    '''Turn every line ('l' tag) of the document into kopie / non-kopie records.

    bs_content: the soup object returned by make_soup.

    Returns a list of dictionaries, where each dictionary has three keys.
    text: The text of the record
    index: The 1-based position of the line among all 'l' tags
    kopie: binary 0/1 where 1 is kopie and 0 is non-kopie tagged

    A line without a kopie tag yields one record (kopie 0). A line with a kopie
    tag yields a kopie record first and, when any tokens of the line are left
    over once the kopie tokens are subtracted, a second record (kopie 0) that
    carries the same index.'''

    dict_list = list()
    # Query the document once and number the lines from 1.
    for n, item in enumerate(bs_content.find_all("l"), start=1):
        # Ask the tag itself instead of searching its serialized markup for
        # '<kopie': that substring also matches tag names that merely start
        # with "kopie", for which find() returns None.
        kopie_tag = item.find('kopie')

        if kopie_tag is None:
            dict_list.append({'text': item.text, 'index': n, 'kopie': 0})
            continue

        # NOTE(review): only the tag returned by find() is treated as kopie; if a
        # line can hold several kopie tags, the later ones end up in the
        # non-kopie remainder below -- confirm against the data.
        dict_list.append({'text': kopie_tag.text, 'index': n, 'kopie': 1})

        # Multiset difference: tokens of the whole line minus tokens of the kopie part.
        tokenized_item = nltk.word_tokenize(item.text)
        tokenized_kopie = nltk.word_tokenize(kopie_tag.text)
        leftover = collections.Counter(tokenized_item) - collections.Counter(tokenized_kopie)

        # Counter.elements() emits repeated tokens next to each other, so the
        # remainder is ordered by first occurrence rather than strictly by
        # position in the line.
        text_diff = ' '.join(leftover.elements())
        if len(text_diff) != 0:
            dict_list.append({'text': text_diff, 'index': n, 'kopie': 0})

    return dict_list

In [4]:
def ranges(nums):
    '''Collapse a sequence of ints into (start, end) tuples of consecutive runs.

    Duplicates are ignored and the input does not need to be sorted. An empty
    input gives an empty list; a value without neighbours gives (value, value).'''
    ordered = sorted(set(nums))
    if not ordered:
        return []

    spans = []
    run_start = previous = ordered[0]
    for value in ordered[1:]:
        # A jump of more than one closes the current run and opens a new one.
        if previous + 1 < value:
            spans.append((run_start, previous))
            run_start = value
        previous = value
    spans.append((run_start, previous))
    return spans

In [5]:
def get_index_groups(new_df):
    '''Find the consecutive line-index ranges of the kopie and the non-kopie rows.

    new_df is the DataFrame representation of the output of
    indexed_kopie_line_dict (columns 'index' and 'kopie' are read).

    Returns two objects, each a list of tuples as produced by ranges(): the
    first for the rows where kopie == 1, the second for the rows where
    kopie == 0.'''
    line_numbers = new_df['index']

    # Two explicit masks rather than one negated mask, so only 0 and 1 are ever selected.
    is_kopie = new_df['kopie'] == 1
    is_no_kopie = new_df['kopie'] == 0

    return ranges(line_numbers[is_kopie]), ranges(line_numbers[is_no_kopie])

In [6]:
def collect_row_locs(index_groups_list, new_df):
    '''Look up which rows of new_df belong to each index range.

    index_groups_list: list of (start, end) tuples, both ends inclusive.
    new_df: DataFrame whose 'index' column holds the line index of every row.

    Returns a list of dictionaries with two keys.
    index_range: the tuple with the start and end index of the chunk.
    loc_list: the row labels of new_df whose 'index' value lies in the chunk,
    gathered line index by line index.'''
    line_numbers = new_df['index']

    def labels_between(first, last):
        labels = []
        for line_no in range(first, last + 1):
            labels.extend(new_df.loc[line_numbers == line_no].index)
        return labels

    return [
        {'index_range': group, 'loc_list': labels_between(group[0], group[1])}
        for group in index_groups_list
    ]

In [7]:
def new_collected_lists(new_df, locs_list, tag = 'kopie'):
    '''Join the text of every chunk into one string.

    new_df: DataFrame with the columns 'text' and 'kopie'.
    locs_list: list of dictionaries as returned by collect_row_locs
    (keys 'index_range' and 'loc_list').
    tag: 'kopie' collects the rows where kopie == 1; any other value collects
    the rows where kopie == 0.

    Returns a list of dictionaries with three keys.
    kopie: 1 or 0, depending on tag
    text: the matching row texts of the chunk joined by single spaces
    index_range: the index range of the chunk, copied from locs_list'''
    wanted_flag = 1 if tag == 'kopie' else 0

    collected = list()
    for group_dict in locs_list:
        chunk = new_df.loc[group_dict['loc_list']]
        # A chunk can contain rows of the other kind (a line split into both parts).
        pieces = chunk.loc[chunk['kopie'] == wanted_flag, 'text']
        collected.append({
            'kopie': wanted_flag,
            'text': " ".join(list(pieces)),
            'index_range': group_dict['index_range'],
        })

    return collected

In [8]:
def make_final_df(kopie_list, no_kopie_list):
    '''Combine the kopie and non-kopie chunk lists into one DataFrame.

    Both arguments are lists of dictionaries as produced by new_collected_lists.
    The non-kopie rows are stacked on top of the kopie rows and the result is
    sorted by the 'index_range' column. The row labels of the two input frames
    are kept, so labels can repeat in the returned DataFrame.'''
    stacked = pd.concat([pd.DataFrame(no_kopie_list), pd.DataFrame(kopie_list)])
    return stacked.sort_values(by='index_range')

In [65]:
def main_function(filepath, min_tokens=100, output_dir='.'):
    '''Split a kopie-tagged XML file into fragments and write the long ones to text files.

    filepath: path of the XML document, passed to make_soup.
    min_tokens: a fragment is kept only when nltk.word_tokenize finds more than
    this many tokens in it (default 100).
    output_dir: directory the fragment files are written to; it is created when
    missing (default: the current working directory).

    Steps: parse the file, build one record per line part, drop empty records,
    strip the line-break sign and punctuation, group consecutive lines of the
    same kind into fragments, remove digits and keep the long fragments.
    Every kept fragment is written to kopie_<start>_<end>.txt or
    no_kopie_<start>_<end>.txt.

    Returns the DataFrame of the fragments that were written.'''

    print('reading input file...')

    bs_content = make_soup(filepath)
    dict_list = indexed_kopie_line_dict(bs_content)

    lines_df = pd.DataFrame(dict_list)
    # .copy() so the cleaning below works on its own frame, not on a slice of the raw one.
    lines_df = lines_df.loc[(lines_df['text'] != '') & (lines_df['text'] != ' ')].copy()
    # '¬' becomes a space; afterwards everything that is neither a word
    # character nor whitespace is removed.
    lines_df['text'] = (
        lines_df['text']
        .str.replace('¬', ' ', regex=False)
        .str.replace(r'[^\w\s]', '', regex=True)
    )

    print('generating fragments...')

    # One call returns both groupings.
    index_groups_kopie, index_groups_no_kopie = get_index_groups(lines_df)

    locs_kopie = collect_row_locs(index_groups_kopie, lines_df)
    locs_no_kopie = collect_row_locs(index_groups_no_kopie, lines_df)

    kopie_list = new_collected_lists(lines_df, locs_kopie, tag='kopie')
    no_kopie_list = new_collected_lists(lines_df, locs_no_kopie, tag='no_kopie')

    data = make_final_df(kopie_list, no_kopie_list)
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave all digits in.
    data['text'] = data['text'].str.replace(r'\d+', '', regex=True)
    is_long_enough = data['text'].apply(lambda x: len(nltk.word_tokenize(x)) > min_tokens)
    fragments = data[is_long_enough]

    print(f'writing {fragments.shape[0]} fragments to text files...')

    os.makedirs(output_dir, exist_ok=True)
    for _, row in fragments.iterrows():
        prefix = 'kopie' if row['kopie'] == 1 else 'no_kopie'
        first_line, last_line = row['index_range'][0], row['index_range'][1]
        filename = f'{prefix}_{first_line}_{last_line}.txt'
        # The context manager closes the file even when the write fails.
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as fragment_file:
            fragment_file.write(row['text'])

    print('files ready')

    return fragments

In [63]:
# Run the whole pipeline on one TEI file; the fragment files are written relative to the current working directory.
# NOTE(review): this absolute Windows path is machine-specific -- point it at a local copy of the file before re-running.
main_function("C:/Users/Ellie/Documents/MASTERS/NIAA/new_allie/1791_Purm_Louw_kopie_tei.xml")

reading file...
generating fragments...
writing 299 fragments to text files...
files ready
