In [1]:
'''imports'''

from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
def make_soup(file_path):
    '''Read the document at file_path and return it parsed as a BeautifulSoup object.

    file_path: path to the XML file; it is opened in text mode and decoded as UTF-8.
    Returns the soup built from the complete file contents.'''
    # Slurp the whole document in one go; the parser needs the full markup anyway
    with open(file_path, "r", encoding='utf-8') as xml_file:
        markup = xml_file.read()

    # NOTE(review): "lxml" is passed as the parser name; confirm that this (rather than
    # an XML-specific parser) is what is wanted for TEI input.
    return bs(markup, "lxml")

In [3]:
#bs_content = make_soup("1791_Purm_Louw_kopie_tei.xml")

In [4]:
'''Using the find_all("l") method to find kopies'''

def indexed_kopie_line_dict(bs_content):
    '''Build one record per 'l' tag found in the soup returned by make_soup.

    Returns a list of dictionaries, one per line, in the order find_all yields them.
    Each dictionary has three keys.
    text: the text of the line
    index: the 1-based position of the line among all 'l' tags
    kopie: 1 when the line's markup contains a '<kopie>' tag, otherwise 0'''

    # every line of the document is wrapped in an 'l' tag
    line_tags = bs_content.find_all("l")

    # a line counts as kopie when its serialised markup contains the opening tag
    return [
        {
            'text': line_tag.text,
            'index': position,
            'kopie': 1 if '<kopie>' in str(line_tag) else 0,
        }
        for position, line_tag in enumerate(line_tags, start=1)
    ]

In [5]:
#dict_list = indexed_kopie_line_dict(bs_content)

In [6]:
'''converting the output of indexed_kopie_line_dict into a DataFrame'''
#new_df = pd.DataFrame(dict_list)

In [7]:
def ranges(nums):
    '''Collapse a collection of numbers into runs of consecutive values.

    Duplicates are dropped and the values sorted first. Returns a list of
    (start, end) tuples with inclusive bounds; an isolated value n is reported
    as (n, n). An empty input gives an empty list.'''
    ordered = sorted(set(nums))
    if not ordered:
        return []

    spans = []
    run_start = previous = ordered[0]
    for value in ordered[1:]:
        # a jump of more than one closes the current run and opens a new one
        if previous + 1 < value:
            spans.append((run_start, previous))
            run_start = value
        previous = value
    spans.append((run_start, previous))
    return spans

In [8]:
def get_index_groups(new_df):
    '''Split the line-level DataFrame by its 'kopie' flag and find the runs of
    consecutive 'index' values in each half.

    new_df: DataFrame built from the output of indexed_kopie_line_dict.
    Returns two objects, each a list of (start, end) tuples as produced by ranges:
    the first for rows where 'kopie' is 1, the second for rows where 'kopie' is 0.'''

    # line numbers of the kopie rows and of the non-kopie rows
    kopie_line_numbers = new_df.loc[new_df['kopie'] == 1, 'index']
    other_line_numbers = new_df.loc[new_df['kopie'] == 0, 'index']

    # consecutive runs within each half
    return ranges(kopie_line_numbers), ranges(other_line_numbers)

In [10]:
#index_groups_kopie = get_index_groups(new_df)[0]
#index_groups_no_kopie = get_index_groups(new_df)[1]

In [11]:
def collect_row_locs(index_groups_list, new_df):
    '''Look up, for every index range, the row labels of new_df that fall inside it.

    index_groups_list: list of (start, end) tuples with inclusive bounds.
    new_df: the line-level DataFrame; its 'index' column is matched against each
    whole number from start to end.
    Returns a list of dictionaries with two keys.
    index_range: the tuple giving the start and end index of the chunk
    loc_list: the row labels of new_df belonging to the chunk, in ascending line order'''

    line_numbers = new_df['index']

    def labels_for(span):
        # walk the range one line number at a time so labels come out in line order
        labels = []
        for line_no in range(span[0], span[1] + 1):
            labels.extend(new_df.loc[line_numbers == line_no].index)
        return labels

    return [
        {'index_range': span, 'loc_list': labels_for(span)}
        for span in index_groups_list
    ]

In [12]:
#locs_kopie = collect_row_locs(index_groups_kopie, new_df)
#locs_no_kopie = collect_row_locs(index_groups_no_kopie, new_df)

In [14]:
def make_collected_list(locs_list, new_df, tag= 'kopie', sep=''):
    '''Join the line texts of every chunk into one string and label the chunk.

    locs_list: list of dictionaries as returned by collect_row_locs, each with the
    keys 'index_range' and 'loc_list'.
    new_df: the line-level DataFrame; the 'text' column is read at the row labels
    given in 'loc_list'.
    tag: 'kopie' marks every chunk with 1; any other value marks them with 0.
    sep: string placed between consecutive lines when joining. The default '' keeps
    the original behaviour of gluing lines together directly; pass ' ' (or a newline)
    to stop the last word of one line fusing with the first word of the next.

    Returns a list of dictionaries. Each dictionary has three keys.
    index_range: the tuple giving the start and end index of the chunk
    text: the text of all lines in the chunk joined with sep
    kopie: a binary 0/1 where 1 is a kopie chunk and 0 is a non-kopie chunk'''

    # the label is the same for every chunk, so work it out once
    kopie_flag = 1 if tag == 'kopie' else 0

    main_list = []
    for group_dict in locs_list:
        line_texts = new_df.loc[group_dict['loc_list'], 'text']
        main_list.append({
            'index_range': group_dict['index_range'],
            'text': sep.join(line_texts),
            'kopie': kopie_flag,
        })

    return main_list

In [16]:
#kopie_list = make_collected_list(locs_kopie, new_df, tag= 'kopie')
#no_kopie_list = make_collected_list(locs_no_kopie, new_df, tag= 'no_kopie')

In [17]:
def make_final_df(kopie_list, no_kopie_list):
    '''Combine the kopie and non-kopie chunk records into a single DataFrame.

    kopie_list, no_kopie_list: the lists of dictionaries obtained from running
    make_collected_list on the kopie and non-kopie chunks.
    Returns a DataFrame with one row per dictionary: all kopie rows first,
    followed by all non-kopie rows. Neither input list is modified.'''

    all_chunks = [*kopie_list, *no_kopie_list]
    return pd.DataFrame(all_chunks)

In [18]:
#final_df = make_final_df(kopie_list, no_kopie_list)

In [19]:
#final_df[0:5]

Unnamed: 0,index_range,kopie,text
0,"(73, 90)",1,"woorden, Nooijt stond ik, om dit in t'voorbij ..."
1,"(235, 235)",1,"Wat het Eerste betreft, dat er eene Meer¬"
2,"(238, 330)",1,Omtrent het Jaer 1400 onder de Re¬geringe van ...
3,"(351, 357)",1,"hier over het volgende ""Meermin, Sijreene,is e..."
4,"(428, 445)",1,"zelve gezien hebben ""Men vind in¬de Kronijk va..."


In [20]:
def main_function(filepath):
    '''Run the whole pipeline on one XML file and return the chunk-level DataFrame.

    filepath: path to the XML document, passed on to make_soup.

    Steps: parse the file, build one record per 'l' line, group consecutive line
    numbers separately for kopie and non-kopie lines, join the text of each group
    and stack both sets of chunks into one DataFrame (see make_final_df).

    Returns the DataFrame produced by make_final_df: kopie chunks first, then
    non-kopie chunks. A document without any 'l' lines gives an empty DataFrame.'''

    bs_content = make_soup(filepath)

    dict_list = indexed_kopie_line_dict(bs_content)

    # naming the columns keeps them present even when the document has no lines,
    # so the lookups on 'kopie' and 'index' further down do not raise KeyError
    new_df = pd.DataFrame(dict_list, columns=['text', 'index', 'kopie'])

    # a single call yields both groupings
    index_groups_kopie, index_groups_no_kopie = get_index_groups(new_df)

    locs_kopie = collect_row_locs(index_groups_kopie, new_df)
    locs_no_kopie = collect_row_locs(index_groups_no_kopie, new_df)

    kopie_list = make_collected_list(locs_kopie, new_df, tag= 'kopie')
    # the non-kopie chunks must be built from the non-kopie row locations; feeding
    # locs_kopie here would repeat every kopie chunk with a 0 label
    no_kopie_list = make_collected_list(locs_no_kopie, new_df, tag= 'no_kopie')

    final_df = make_final_df(kopie_list, no_kopie_list)

    return final_df

In [21]:
data = main_function("1791_Purm_Louw_kopie_tei.xml")

In [22]:
data[0:10]

Unnamed: 0,index_range,kopie,text
0,"(73, 90)",1,"woorden, Nooijt stond ik, om dit in t'voorbij ..."
1,"(235, 235)",1,"Wat het Eerste betreft, dat er eene Meer¬"
2,"(238, 330)",1,Omtrent het Jaer 1400 onder de Re¬geringe van ...
3,"(351, 357)",1,"hier over het volgende ""Meermin, Sijreene,is e..."
4,"(428, 445)",1,"zelve gezien hebben ""Men vind in¬de Kronijk va..."
5,"(450, 487)",1,"in 't 1548, heeft Philippus Aartshertogvan Oos..."
6,"(494, 497)",1,"Meermin, daar men zoo veele Fabulenen verdigts..."
7,"(510, 537)",1,en Gevangen is; de Meerminnen zijnAgt spannen ...
8,"(540, 559)",1,"Indiäenen braeden deze Visch op eenRooster, en..."
9,"(595, 631)",1,Met Permissie enz:Werd bekend gemaakt dat alhi...


In [23]:
data.shape

(608, 3)