In [142]:
'''imports'''

from bs4 import BeautifulSoup as bs
import pandas as pd
import collections
import re

In [2]:
def make_soup(file_path):
    '''function takes file path to xml document reads in a uses bs to create and return soup object of the whole file.'''
    content = list()
    # Read the XML file
    with open(file_path, "r", encoding= 'utf-8') as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")
        
        return bs_content

In [3]:
def indexed_kopie_line_dict(bs_content):
    '''function takes the soup object outputted from make_soup finds all l labels and uses this information to find all kopie
    labels within them. Returns a list of dictionaries, where each dictionary has 
    three keys.
    text: The text from the line
    index: The index of the line within the text body
    kopie: binary 0/1 where 1 is kopie and 0 is non-kopie tagged'''
    
    #finding all ines ('l' tags)
    everything_results = bs_content.find_all("l")
    
    #creating the list of dictionaries
    dict_list = list()
    n = 0
    everything_results = bs_content.find_all("l")
    for item in list(everything_results):
        item_string = str(item)
        temp_dict =dict()
        n+=1
        if '<kopie' in item_string:
            xml = item.find('kopie')
            temp_dict['text'] = xml.text
            temp_dict['index']= n
            temp_dict['kopie']= 1
            dict_list.append(temp_dict)
            c = collections.Counter(item.text) - collections.Counter(xml.text)                   
            text_diff = ''.join(c.elements())
            if len(text_diff) != 0:
                diff_dict = dict()
                diff_dict['text'] = text_diff
                diff_dict['index'] = n
                diff_dict['kopie'] = 0
                dict_list.append(diff_dict)
        else:
            temp_dict['text'] =item.text
            temp_dict['index'] = n
            temp_dict['kopie'] = 0
            dict_list.append(temp_dict)
            
    return dict_list

In [4]:
def ranges(nums):
    '''finds ranges of consecutive ints in a given sequence'''
    nums = sorted(set(nums))
    gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s+1 < e]
    edges = iter(nums[:1] + sum(gaps, []) + nums[-1:])
    return list(zip(edges, edges))

In [5]:
def get_index_groups(new_df):
    '''Function takes a DataFrame representation of the output of indexed_kopie_line_dict and first seperates the kopies and
    non-kopies before finding the consecutive index ranges for each group. Outputs two objest, each is a list of tuples which
    represent the consecutive index ranges. the first is for kopie and the second is for non-kopie'''
    
    #seperating kopies and non-kopies
    kopie_bool = new_df['kopie'] == 1
    kopie_df = new_df[kopie_bool]
    no_kopie_bool = new_df['kopie'] == 0
    no_kopie_df = new_df[no_kopie_bool]
    
    #getting index ranges
    index_groups_kopie = ranges(kopie_df['index'])
    index_groups_no_kopie = ranges(no_kopie_df['index'])
    
    return index_groups_kopie, index_groups_no_kopie

In [6]:
def collect_row_locs(index_groups_list, new_df):
    '''function takes a list of tuples representing the index groups and finds the location of the rows for the lines within
    the index range in the large df.
    Returns a list of dictionaries with two keys.
    index_range: the tuple representing the the start and end index of the chunk.
    loc_list: the list containing the locations for each of the rows in new_df which are part of the chunk'''
    
    loc_main = list()
    for group in index_groups_list:
        loc_group_dict =dict()
        loc_group_dict['index_range'] = group
        loc_list = list()
        for i in range (group[0],group[1]+1):
            loc_list += list(new_df.loc[new_df['index'] == i].index)
        loc_group_dict['loc_list'] = loc_list
        loc_main.append(loc_group_dict)
        
    return loc_main

In [7]:
def make_collected_list(locs_list, new_df, tag= 'kopie'):
    '''function takes the list of dictionaires outputted from collect_row_locs and uses this information to create and return
    a new list of dictionaires. each dictionary has three keys.
    index_range: the tuple representing the start and end index for the chunk
    kopie: a binary 0/1 where 1 is a kopie tag and 0 is a non-kopie tag
    text: a string of the text taken from all lines in the chunk joined together'''
    
    main_list = list()
    for group_dict in locs_list:
        temp_dict = dict()
        group_text = list(new_df.loc[group_dict['loc_list'], 'text'])
        temp_dict['index_range'] = group_dict['index_range']
        temp_dict['text'] = " ".join(group_text)
        if tag == 'kopie':
            temp_dict['kopie'] = 1
        else:
            temp_dict['kopie'] = 0
        main_list.append(temp_dict)
        
    return main_list

In [8]:
def make_final_df(kopie_list, no_kopie_list):
    '''function takes the two lists of dictionaries obtained from running make_collected_lists on the kopies and non-kopies
    and joins them together before creating and returning a large df with all the information.'''
    
    just_nk_df = pd.DataFrame(no_kopie_list)
    just_k_df = pd.DataFrame(kopie_list)
    
    frames= [just_nk_df, just_k_df]
    concatenated_df = pd.concat(frames)
    
    sorted_df = concatenated_df.sort_values(by ='index_range' )
    
    return sorted_df

In [9]:
def main_function(filepath):
    
    bs_content = make_soup(filepath)
    
    dict_list = indexed_kopie_line_dict(bs_content)
    
    new_df = pd.DataFrame(dict_list)
    
    index_groups_kopie = get_index_groups(new_df)[0]
    index_groups_no_kopie = get_index_groups(new_df)[1]
    
    locs_kopie = collect_row_locs(index_groups_kopie, new_df)
    locs_no_kopie = collect_row_locs(index_groups_no_kopie, new_df)
    
    kopie_list = make_collected_list(locs_kopie, new_df, tag= 'kopie')
    no_kopie_list = make_collected_list(locs_no_kopie, new_df, tag= 'no_kopie')
    
    final_df = make_final_df(kopie_list, no_kopie_list)
    
    return final_df

In [10]:
data=main_function("C:/Users/Ellie/Documents/MASTERS/NIAA/new_allie/1791_Purm_Louw_kopie_tei.xml")

In [11]:
data.shape

(617, 3)

In [12]:
len(set(data['text']))

609

In [13]:
data[5:10]

Unnamed: 0,index_range,kopie,text
2,"(238, 330)",1,Omtrent het Jaer 1400 onder de Re¬ geringe van...
3,"(330, 351)",0,gebragt; //2 Dus ziet men uijt deze voortgebr...
3,"(351, 357)",1,"""Meermin, Sijreene, hhieeeeerr oovvtlgnd is..."
4,"(357, 428)",0,"eene plaets te beslaen"" e att;//3zoodh is d..."
4,"(428, 445)",1,"""Men vind in¬ zzeeeeeelv ginnhbb de Kronijk ..."


In [14]:
print(data.iloc[6]['text'])
print()
print(data.iloc[7]['text'])

gebragt;  //2 Dus ziet men uijt deze voortgebragte bewijzen  dat er, een Wijf, Vrouwe, of Meermin in de Purmer gevangen is, en dat het zeker is dat het geene Natuurelyke Vrouwe is geweest blijkt daar uijt, om dat 'Smenschen natuur niet geschikt is in 't Waetter maar op aarde te leven daar het Water het Element der Meerminnen of der@@@ is /1/ zie H: Soeteboom in zijne Korte Kron: van Purmerend op bladz:, 77 tot 80; /2/ zie Tafereel van Natuur en Konst 13de deel, bladz: 63 /3/ Mr Jan Leoninus te Edam is zijn gevoelen, dat dit schep¬ zel was een wild vrouwperzoon dat stom was, en ge¬ woon was in de ruygte en biezen van de Purmermeer zig te onthouden, geweest is; zie verhaal van de Vuur en Watenood door Dne Kruijthof en Leoninus pag: 4-7; Maer waerom zoo veel deze Meerminnen gemeld, indien het zeker is, dat dezelve maar door inbeeldinge werd voor gestelt, en dat erin de Weereld, zoodanig een Schepzel niet werd gevonden, Een zeker Voornaam Schreijver, meld "Meermin, Sijreene, hhieeeeerr    

In [24]:
'''Closer to working now, but still having some issues with: 

1) getting the range to make sense consistently (see notebook 02-03).

2) Also need to edit so that isolated sections such as 351 which appears in the non-grouped dataframe twice with 2 seperate text
sections because the whole line is not on kopie tag. The iloc prints above show this issue but also refer to notebook 02-03.'''

'Closer to working now, but still having some issues with: \n\n1) getting the range to make sense consistently (see notebook 02-03).\n\n2) Also need to edit so that isolated sections such as 351 which appears in the non-grouped dataframe twice with 2 seperate text\nsections because the whole line is not on kopie tag. The iloc prints above show this issue but also refer to notebook 02-03.'

In [None]:
'''Further work still to be done from Allies notes = 
3) There are empty lines in the XML, which results in two chunks of copied text, interrupted by a chunk of non-copied text, 
but that chunk is in fact an empty line. (See for example line 904, 906, 908 in the df.)

4) Once the above issues are fixed, the hyphens (¬) need to be removed from the chunks. 
(There is a line of code for that in my script parse_xml, in the repo I mailed you the link of yesterday.)

5) It will probably be best to remove all punctuation (but not before the lines are indexed!). 
(Again, code for that is in my script parse_xml.)

6) There are some other weird things appearing in the text chunks, which has probably to do something with the parsing.
Not a very urgent matter, but still puzzling. (See for example line 428-225 and 693-699 in the df.)'''

In [15]:
'''UNBOXED MAIN FUNCTION '''

bs_content = make_soup("C:/Users/Ellie/Documents/MASTERS/NIAA/new_allie/1791_Purm_Louw_kopie_tei.xml")
    
dict_list = indexed_kopie_line_dict(bs_content)
    
new_df = pd.DataFrame(dict_list)
    
index_groups_kopie = get_index_groups(new_df)[0]
index_groups_no_kopie = get_index_groups(new_df)[1]
    
locs_kopie = collect_row_locs(index_groups_kopie, new_df)
locs_no_kopie = collect_row_locs(index_groups_no_kopie, new_df)
    
kopie_list = make_collected_list(locs_kopie, new_df, tag= 'kopie')
no_kopie_list = make_collected_list(locs_no_kopie, new_df, tag= 'no_kopie')
    
final_df = make_final_df(kopie_list, no_kopie_list)

In [None]:
'''Working on 1) getting the range to make sense consistently (see notebook 02-03).'''

In [16]:
new_df[330:360]

Unnamed: 0,index,kopie,text
330,329,1,"gevangen daar uijtgetogen, en na Edam"
331,330,1,gebragt;
332,330,0,//2
333,331,0,Dus ziet men uijt deze voortgebragte bewijzen
334,332,0,"dat er, een Wijf, Vrouwe, of Meermin"
335,333,0,"in de Purmer gevangen is, en dat het zeker"
336,334,0,is dat het geene Natuurelyke Vrouwe is geweest
337,335,0,"blijkt daar uijt, om dat 'Smenschen natuur"
338,336,0,niet geschikt is in 't Waetter maar op aarde t...
339,337,0,daar het Water het Element der Meerminnen of d...


In [18]:
loc_list_n=list()
x=330
y=351
for i in range (x,y+1):
    loc_list_n += list(new_df.loc[new_df['index'] == i].index)
    #loc_list_n.append(new_df.loc[new_df['index'] == i].index)
    #loc_list_n.append(i)
    
    
print(loc_list_n)

[331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354]


In [None]:
'''realised that actually the range issues might not exist. It could be that this overlap etc looks so confusing because of
the repeated indexes where a line has been split into kopie and no-kopie sections. e.g. 331,332 & 353,354 in the example.
work out the split line issue first and then re-check the ranges to see if this is the case.'''

In [None]:
'''Working on 2) Also need to edit so that isolated sections such as 351 which appears in the non-grouped dataframe twice 
with 2 seperate text sections because the whole line is not on kopie tag. The iloc prints above show this issue but also 
refer to notebook 02-03.'''''

In [27]:
A = collections.Counter(new_df['index'])
min_threshold = 2
my_count = { x: count for x, count in A.items() if count >= min_threshold }
print(my_count)
print(f'Number of repeated line indexes {len(my_count)}. Shows how many times lines are split. And that there are never split more than once.')

{73: 2, 90: 2, 330: 2, 351: 2, 357: 2, 428: 2, 445: 2, 487: 2, 559: 2, 693: 2, 885: 2, 2201: 2, 2235: 2, 2269: 2, 2304: 2, 2336: 2, 2370: 2, 2786: 2, 3298: 2, 3505: 2, 3818: 2, 3832: 2, 4956: 2, 4966: 2, 5002: 2, 5013: 2, 5031: 2, 5132: 2, 5298: 2, 5692: 2, 5695: 2, 5762: 2, 6316: 2, 6715: 2, 7276: 2, 7881: 2, 8132: 2, 8576: 2, 8656: 2, 8662: 2, 8677: 2, 8701: 2, 8723: 2, 9069: 2, 9167: 2, 9224: 2, 9269: 2, 9577: 2, 9637: 2, 9745: 2, 9768: 2, 9986: 2, 10117: 2, 10189: 2, 10465: 2, 13612: 2, 13644: 2, 13654: 2, 13679: 2, 13712: 2, 15111: 2, 15333: 2, 15410: 2, 15640: 2, 15910: 2, 16296: 2, 16543: 2, 16576: 2, 16667: 2, 16679: 2, 16686: 2, 16718: 2, 16734: 2, 16781: 2, 16814: 2, 16846: 2, 16876: 2, 16906: 2, 16935: 2, 16967: 2, 16973: 2, 17126: 2, 17186: 2, 17219: 2, 17250: 2, 17282: 2, 17294: 2, 17314: 2, 17344: 2, 17374: 2, 17403: 2, 17432: 2, 17461: 2, 17494: 2, 17537: 2, 17559: 2, 17592: 2, 17628: 2, 17660: 2, 17750: 2, 17909: 2, 17935: 2, 18293: 2, 18311: 2, 18695: 2, 18748: 2, 1875

In [32]:
split_line_indices = list()
for key in my_count:
    split_line_indices.append(key)
    
print(split_line_indices)
print('provides a list of the repeated indices for testing')

[73, 90, 330, 351, 357, 428, 445, 487, 559, 693, 885, 2201, 2235, 2269, 2304, 2336, 2370, 2786, 3298, 3505, 3818, 3832, 4956, 4966, 5002, 5013, 5031, 5132, 5298, 5692, 5695, 5762, 6316, 6715, 7276, 7881, 8132, 8576, 8656, 8662, 8677, 8701, 8723, 9069, 9167, 9224, 9269, 9577, 9637, 9745, 9768, 9986, 10117, 10189, 10465, 13612, 13644, 13654, 13679, 13712, 15111, 15333, 15410, 15640, 15910, 16296, 16543, 16576, 16667, 16679, 16686, 16718, 16734, 16781, 16814, 16846, 16876, 16906, 16935, 16967, 16973, 17126, 17186, 17219, 17250, 17282, 17294, 17314, 17344, 17374, 17403, 17432, 17461, 17494, 17537, 17559, 17592, 17628, 17660, 17750, 17909, 17935, 18293, 18311, 18695, 18748, 18756, 18799, 27701, 28250, 28258, 28267, 28458, 35267, 35934, 36187, 36221, 39001, 39019, 40086, 40373]


In [None]:
'''What do I want to happen to the duplicates?
-If the index number appears twice then check the kopie tag (0/1).
-Make sure that only the section of the line with the same kopie tag gets added to a fragment, regardless of which order the
same line segments appear in.

This needs to be worked into the function somewhere inbetween collect_row_locs and make_collected_lists'''

In [17]:
kopie_locs = list()
for item in locs_kopie:
    kopie_locs.append(item['loc_list'])
flat_list_k = [item for sublist in kopie_locs for item in sublist]
    
no_kopie_locs = list()
for item in locs_no_kopie:
    no_kopie_locs.append(item['loc_list'])
flat_list_nk = [item for sublist in no_kopie_locs for item in sublist]
    
    
in_both1 = list(set(flat_list_k).intersection(flat_list_nk))
in_both2 = list(set(flat_list_nk).intersection(flat_list_k))

print(len(in_both1))
print(len(in_both2))
assert in_both1 == in_both2
print(len(set(in_both1)))

242
242
242


In [18]:
print(in_both1)

[10241, 10242, 8716, 8717, 17431, 16920, 16921, 17432, 15393, 15394, 8741, 5158, 5159, 8742, 40492, 40493, 35379, 9268, 9269, 566, 567, 16951, 16952, 17462, 17463, 8764, 8765, 9793, 9794, 72, 73, 17492, 17493, 16982, 16983, 5720, 5721, 90, 91, 5724, 5725, 6747, 6748, 9817, 9818, 9314, 9315, 15973, 15974, 15471, 15472, 17522, 17523, 17012, 17013, 18035, 18036, 7309, 7310, 17552, 17553, 17045, 17046, 17052, 17053, 5792, 5793, 27808, 2211, 2212, 27809, 17586, 17587, 701, 702, 2246, 2247, 28358, 28359, 6347, 6348, 5325, 5326, 28367, 28368, 36047, 36048, 39117, 39118, 28377, 28378, 17630, 17631, 16608, 16609, 39136, 39137, 2281, 2282, 7915, 7916, 2802, 2803, 3315, 3316, 17653, 17654, 3837, 3838, 16642, 16643, 3852, 2317, 2318, 3853, 40204, 40205, 10518, 10519, 17687, 17688, 2350, 2351, 10036, 10037, 17206, 17207, 17724, 17725, 15170, 15171, 331, 332, 2385, 2386, 15702, 15703, 35380, 17757, 16734, 16735, 17758, 353, 354, 13666, 13667, 360, 361, 16747, 16748, 18798, 18799, 4977, 4978, 16755, 

In [19]:
print(locs_kopie)

[{'index_range': (73, 90), 'loc_list': [72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91]}, {'index_range': (235, 235), 'loc_list': [236]}, {'index_range': (238, 330), 'loc_list': [239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332]}, {'index_range': (351, 357), 'loc_list': [353, 354, 355, 356, 357, 358, 359, 360, 361]}, {'index_range': (428, 445), 'loc_list': [432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451]}, {'index_range': (450, 487), 'loc_list': [456, 457, 458, 459, 460, 461, 462, 463, 464, 

In [20]:
kopie_indices_list = list()
for item in locs_kopie:
    kopie_indices_list.append(item['loc_list'])
flat_kopie_index = [item for sublist in kopie_indices_list for item in sublist]

no_kopie_indices_list = list()
for item in locs_no_kopie:
    no_kopie_indices_list.append(item['loc_list'])
flat_no_kopie_index = [item for sublist in no_kopie_indices_list for item in sublist]

#print(len(flat_kopie_index))
#print(len(flat_no_kopie_index))

in_both_index = list(set(flat_kopie_index).intersection(flat_no_kopie_index))

print(len(in_both_index))
print(in_both_index)

242
[10241, 10242, 8716, 8717, 17431, 16920, 16921, 17432, 15393, 15394, 8741, 5158, 5159, 8742, 40492, 40493, 35379, 9268, 9269, 566, 567, 16951, 16952, 17462, 17463, 8764, 8765, 9793, 9794, 72, 73, 17492, 17493, 16982, 16983, 5720, 5721, 90, 91, 5724, 5725, 6747, 6748, 9817, 9818, 9314, 9315, 15973, 15974, 15471, 15472, 17522, 17523, 17012, 17013, 18035, 18036, 7309, 7310, 17552, 17553, 17045, 17046, 17052, 17053, 5792, 5793, 27808, 2211, 2212, 27809, 17586, 17587, 701, 702, 2246, 2247, 28358, 28359, 6347, 6348, 5325, 5326, 28367, 28368, 36047, 36048, 39117, 39118, 28377, 28378, 17630, 17631, 16608, 16609, 39136, 39137, 2281, 2282, 7915, 7916, 2802, 2803, 3315, 3316, 17653, 17654, 3837, 3838, 16642, 16643, 3852, 2317, 2318, 3853, 40204, 40205, 10518, 10519, 17687, 17688, 2350, 2351, 10036, 10037, 17206, 17207, 17724, 17725, 15170, 15171, 331, 332, 2385, 2386, 15702, 15703, 35380, 17757, 16734, 16735, 17758, 353, 354, 13666, 13667, 360, 361, 16747, 16748, 18798, 18799, 4977, 4978, 167

In [21]:
assert 353 and 354 in in_both_index

In [None]:
'''Makes sense that there are 242 because there are 121 which are split into two lines e.g. 121x2=242 so the numbers line up'''

In [None]:
'''Use the in_both_index list to make an exception route in the function kind of like you did with diff_dict to split lines.
OR also try adding a check of the kopie tag in new_df when making collected lists (could work out to be simpler).'''

In [107]:
'''unboxed make_collected_list: trying to add a check for kopie tag in new_df'''

tag= 'no_kopie'
locs_list = locs_no_kopie

def new_collected_lists(new_df, locs_list, tag = 'kopie'):

    main_list = list()
    for group_dict in locs_list:
        temp_dict = dict()

        if tag == 'kopie':
            temp_dict['kopie'] = 1
            kopie_df = new_df.loc[group_dict['loc_list']]
            filtered = kopie_df.loc[kopie_df['kopie']==1]
            group_text = list(filtered['text'])
            temp_dict['text'] = " ".join(group_text) 

        else:
            temp_dict['kopie'] = 0
            no_kopie_df = new_df.loc[group_dict['loc_list']]
            filtered = no_kopie_df.loc[no_kopie_df['kopie']==0]
            group_text= list(filtered['text'])
            temp_dict['text'] = " ".join(group_text) 

        temp_dict['index_range'] = group_dict['index_range']

        main_list.append(temp_dict)

    return main_list

In [109]:
'''UNBOXED MAIN FUNCTION: trying out the new_collected_lists function '''

bs_content = make_soup("C:/Users/Ellie/Documents/MASTERS/NIAA/new_allie/1791_Purm_Louw_kopie_tei.xml")
    
dict_list = indexed_kopie_line_dict(bs_content)
    
new_df = pd.DataFrame(dict_list)
    
index_groups_kopie = get_index_groups(new_df)[0]
index_groups_no_kopie = get_index_groups(new_df)[1]
    
locs_kopie = collect_row_locs(index_groups_kopie, new_df)
locs_no_kopie = collect_row_locs(index_groups_no_kopie, new_df)
    
kopie_list = new_collected_lists(new_df, locs_kopie, tag= 'kopie')
no_kopie_list = new_collected_lists(new_df, locs_no_kopie, tag= 'no_kopie')
    
final_df = make_final_df(kopie_list, no_kopie_list)

In [110]:
final_df[5:10]

Unnamed: 0,index_range,kopie,text
2,"(238, 330)",1,Omtrent het Jaer 1400 onder de Re¬ geringe van...
3,"(330, 351)",0,//2 Dus ziet men uijt deze voortgebragte bewi...
3,"(351, 357)",1,"""Meermin, Sijreene, is een ingebeeld dier, dat..."
4,"(357, 428)",0,"e att;//3zoodh is de Gedaente, en de zeldza..."
4,"(428, 445)",1,"""Men vind in¬ de Kronijk van Schotland /10/ aa..."


In [115]:
print(final_df.iloc[6]['text'])
print()
print(data.iloc[6]['text'])

 //2 Dus ziet men uijt deze voortgebragte bewijzen  dat er, een Wijf, Vrouwe, of Meermin in de Purmer gevangen is, en dat het zeker is dat het geene Natuurelyke Vrouwe is geweest blijkt daar uijt, om dat 'Smenschen natuur niet geschikt is in 't Waetter maar op aarde te leven daar het Water het Element der Meerminnen of der@@@ is /1/ zie H: Soeteboom in zijne Korte Kron: van Purmerend op bladz:, 77 tot 80; /2/ zie Tafereel van Natuur en Konst 13de deel, bladz: 63 /3/ Mr Jan Leoninus te Edam is zijn gevoelen, dat dit schep¬ zel was een wild vrouwperzoon dat stom was, en ge¬ woon was in de ruygte en biezen van de Purmermeer zig te onthouden, geweest is; zie verhaal van de Vuur en Watenood door Dne Kruijthof en Leoninus pag: 4-7; Maer waerom zoo veel deze Meerminnen gemeld, indien het zeker is, dat dezelve maar door inbeeldinge werd voor gestelt, en dat erin de Weereld, zoodanig een Schepzel niet werd gevonden, Een zeker Voornaam Schreijver, meld hhieeeeerr    oovvtlgnd

gebragt;  //2 Dus 

In [116]:
print(data.iloc[7]['text'])
print()
print(final_df.iloc[7]['text'])

"Meermin, Sijreene, hhieeeeerr    oovvtlgnd is een ingebeeld dier, dat van ondere Visch, en van boven Mensch zoude zijn, Bij oude schrijvers vind men veelvuldige Wonderen van dat niet in wezen zijnde Schepzel te boek gesteld, alle te belaggelijk om hier eene plaets te beslaen" e    att;//3zoodh

"Meermin, Sijreene, is een ingebeeld dier, dat van ondere Visch, en van boven Mensch zoude zijn, Bij oude schrijvers vind men veelvuldige Wonderen van dat niet in wezen zijnde Schepzel te boek gesteld, alle te belaggelijk om hier eene plaets te beslaen"


In [122]:
new_df[910:920]

Unnamed: 0,index,kopie,text
910,900,1,vatte op dit moment vuur het Ste#
911,901,1,"ging af, met den Stamperop dezelve"
912,902,1,en Kweste bij deze Jongelingen
913,903,1,de eene genaamt Douwe Van der W#
914,904,1,"zijnde eene Frieser van Geboorte, #"
915,905,0,
916,906,1,een Wieledrajers Baas in deze Stad
917,907,0,
918,908,1,"oud 19 Jaeren, werden door de St#"
919,909,1,per bijde zijne Handen weg ge#


In [None]:
'''The new version of collected lists works!!!! Use the function new_collected_lists from now on and replace in other versions'''

In [None]:
'''Working on 3) There are empty lines in the XML, which results in two chunks of copied text, interrupted by a chunk 
of non-copied text, but that chunk is in fact an empty line. (See for example line 904, 906, 908 in the df.)'''

In [128]:
new_df.iloc[915]['text']

''

In [179]:
no_empty_lines = new_df.loc[new_df['text'] != '']

In [137]:
no_empty_lines[910:920]

Unnamed: 0,index,kopie,text
913,903,1,de eene genaamt Douwe Van der W#
914,904,1,"zijnde eene Frieser van Geboorte, #"
916,906,1,een Wieledrajers Baas in deze Stad
918,908,1,"oud 19 Jaeren, werden door de St#"
919,909,1,per bijde zijne Handen weg ge#
920,910,1,en door het Buskruijd werd zijn #
922,912,1,"gezigt geschroeijd en gebrand, zoo"
923,913,1,"bij zijn Oogen uijt waeren, en het zel,,"
924,914,1,"ve onkenbaar was, en had mede de v#"
925,915,1,S


In [None]:
'''Got rid of all the empty lines! this should be insterted into the function which generates new_df'''

In [None]:
'''Working on 4) Once the above issues are fixed, the hyphens (¬) need to be removed from the chunks. 
(There is a line of code for that in my script parse_xml, in the repo I mailed you the link of yesterday.) AND 5) It will 
probably be best to remove all punctuation (but not before the lines are indexed!). (Again, code for that is in my 
script parse_xml.)'''

In [None]:
'''I want to iterate through the text column and remove all unwanted characters from the strings.'''

In [172]:

for row in no_empty_lines['text'][20:30]:
    no_hyphen = row.replace('¬', '')
    #print(no_hyphen)
    no_punc = re.sub(r'[^\w\s]','',no_hyphen)
    print(no_punc)


nen deze stad voorgevallen dienen Mijne
gedagten waren om het bij het Eerste deel
der kronijk of geschiedenisse dezer stadt
het te laeten berusten maar de voornaeme
en buijtengewoone voorvallen dewelke in deze
korte tijdkring alhier zijn gebeurt heb
ben mijn verpligt om de pen weder op te
vatten om daer door geduurig eene aan
genaeme nagedagtenisse in ons geheu
gen te doen herleeven Het voornaems


In [176]:
for i, row in no_empty_lines.iterrows():
    text = row['text']
    no_hyphen = text.replace('¬', '')
    no_punc = re.sub(r'[^\w\s]','',no_hyphen)
    no_empty_lines.at[i,'text'] = no_punc

In [177]:
no_empty_lines['text'][20:30]

20         nen deze stad voorgevallen dienen Mijne
21       gedagten waren om het bij het Eerste deel
22       der kronijk of geschiedenisse dezer stadt
23        het te laeten berusten maar de voornaeme
24    en buijtengewoone voorvallen dewelke in deze
25         korte tijdkring alhier zijn gebeurt heb
26         ben mijn verpligt om de pen weder op te
27           vatten om daer door geduurig eene aan
28             genaeme nagedagtenisse in ons geheu
29             gen te doen herleeven Het voornaems
Name: text, dtype: object

In [None]:
'''This now works to remove all hyphens and punctuation in the texts!!! This should be inserted when new_df is created'''