## Women Writing Africa, Data Mining Process

In [2]:
import re
import os

import numpy as np
import utils as ut
import pandas as pd

from pathlib import Path
from importlib import reload

- Convert PDF to txt document

In [3]:
# Source PDF and the plain-text file derived from it; both live in Datasets/
# and share a stem, so the text path is the PDF path with its suffix swapped.
pdf_path = Path("Datasets", "women writing africa eastern region.pdf")
out_txt_path = pdf_path.with_suffix(".txt")

In [None]:
# The PDF -> text conversion only needs to be run once, so it is skipped when
# the text file is already on disk. This keeps "Restart & Run All" cheap.
# Delete the .txt file to force a fresh conversion (e.g. after an interrupted
# run that may have left a partial file behind).
reload(ut)  # re-import utils so edits made to the module since import are used
if not out_txt_path.exists():
    ut.pdftotxt(pdf_path, out_txt_path, progress_updates=True)

## Parse txt document

Goals:
- Identify content pattern, ex: (Author Name -> Title -> Country, Year, Language) precedes all content blocks
- Extract individual documents, including forewords
- Devise a method for removing forewords
- Create a document database of Metadata: Content

Extraction pattern examples:

```
1.
Siti binti Saad
FOUR SONGS
Tanzania 1920s Kiswahili

2.
Nellie Grant

LETTERS FROM AFRICA TO A DAUGHTER
IN ENGLAND

Kenya 1939-1963 English
```

In [4]:
reload(ut)
# readlines() keeps the trailing "\n" on every line, which is why the marker
# phrases below are written with explicit newlines.
with open(out_txt_path) as f:
    lines = f.readlines()

country_list = ["Tanzania", "Kenya", "Malawi", "Zambia", "Uganda", "Swaziland"]

# Runs of consecutive lines that bracket the region of the book to keep:
# the header of the first entry, and the heading that follows the last entry.
start_phrase = ["Sultan Fatima binti Muhammad Mkubwa\n", "PEACE AND SECURITY\n", "Tanzania 1711 Kiswahili\n"]
end_phrase = ["CONTRIBUTORS\n", "\n", "EDITORS\n", "\n"]

# Reset both indices so that a missing marker is reported explicitly instead
# of raising NameError or silently reusing a value left over in the kernel.
start_index = end_index = None
for i in range(len(lines)-3):
    # If a phrase occurs more than once, the last occurrence wins.
    if lines[i:i+3] == start_phrase:
        print("Start Index is", i)
        start_index = i
    if lines[i:i+4] == end_phrase:
        print("End Index is", i)
        end_index = i
if start_index is None or end_index is None:
    raise ValueError(f"Could not find the start/end marker phrases in {out_txt_path}")

# Keep a few lines of context before the first entry; clamp at 0 so a marker
# near the top of the file cannot turn into a negative (wrap-around) index.
lines = lines[max(start_index - 10, 0):max(end_index - 3, 0)]

# Collect the indices of lines that look like an entry's
# "Country Year Language" line (e.g. "Tanzania 1920s Kiswahili").
marker_string_indices = []
for i in range(len(lines)-2):
    line = lines[i]

    # Must contain a year between 1600 and 2099 (as judged by ut.contains_year).
    if not ut.contains_year(line, year_min="1600", year_max="2099"):
        continue

    # Must mention one of the approved country names.
    if not any(country in line for country in country_list):
        continue

    # Must have an appropriate number of words (3 to 5).
    if 2 < len(line.split()) < 6:
        marker_string_indices.append(i)

Start Index is 4535
End Index is 23579


## Parse into Header, Chunk Foreword, and Chunk Content

In [5]:
reload(ut)
# readlines() keeps the trailing "\n" on every line, which is why the marker
# phrases below are written with explicit newlines.
with open(out_txt_path) as f:
    lines = f.readlines()

country_list = ["Tanzania", "Kenya", "Malawi", "Zambia", "Uganda", "Swaziland"]

# Runs of consecutive lines that bracket the region of the book to keep:
# the header of the first entry, and the heading that follows the last entry.
start_phrase = ["Sultan Fatima binti Muhammad Mkubwa\n", "PEACE AND SECURITY\n", "Tanzania 1711 Kiswahili\n"]
end_phrase = ["CONTRIBUTORS\n", "\n", "EDITORS\n", "\n"]

# Reset both indices so that a missing marker is reported explicitly instead
# of raising NameError or silently reusing a value left over in the kernel.
start_index = end_index = None
for i in range(len(lines)-3):
    # If a phrase occurs more than once, the last occurrence wins.
    if lines[i:i+3] == start_phrase:
        print("Start Index is", i)
        start_index = i
    if lines[i:i+4] == end_phrase:
        print("End Index is", i)
        end_index = i
if start_index is None or end_index is None:
    raise ValueError(f"Could not find the start/end marker phrases in {out_txt_path}")

# Keep a few lines of context before the first entry; clamp at 0 so a marker
# near the top of the file cannot turn into a negative (wrap-around) index.
lines = lines[max(start_index - 5, 0):max(end_index - 3, 0)]

# Blank out very short lines (fewer than 6 characters including the newline).
# NOTE(review): presumably these are page numbers / OCR noise -- confirm.
for key, val in enumerate(lines):
    if "\n" in val and len(val) < 6:
        lines[key] = "\n"

# Collect the indices of lines that look like an entry's
# "Country Year Language" line (e.g. "Tanzania 1920s Kiswahili").
marker_string_indices = []
for i in range(len(lines)-2):
    line = lines[i]

    # Must contain a year between 1600 and 2099 (as judged by ut.contains_year).
    if not ut.contains_year(line, year_min="1600", year_max="2099"):
        continue

    # Must mention one of the approved country names.
    if not any(country in line for country in country_list):
        continue

    # Must have an appropriate number of words (3 to 5).
    if 2 < len(line.split()) < 6:
        marker_string_indices.append(i)

# One element per entry, kept in parallel across the three lists.
headers = []
chunk_forewards = []
chunk_contents = []
for key, index in enumerate(marker_string_indices):

    # The header is searched for in the marker line plus the five lines above
    # it; clamp at 0 so an early marker cannot produce a negative slice start.
    header_group = lines[max(index-5, 0):index+1]
    # Lines containing " + " are replaced with a blank line (the recorded
    # output shows running page headers such as "86 + THE EIGHTEENTH ...").
    header_group = [i if not(" + " in i) else "\n" for i in header_group]

    # ut.header_parse is given the group marker-line-first; its result is
    # flipped back into reading order before being stored.
    header_group.reverse()
    header_group = ut.header_parse(header_group)
    header_group.reverse()
    headers.append(header_group)

    # The chunk runs from just after this marker to the next marker line
    # (inclusive), or to the end of the text for the last entry.
    # NOTE(review): this means a chunk also contains the next entry's header
    # lines -- confirm the ut.chunk_parse_* helpers expect that.
    if key == len(marker_string_indices) - 1:
        chunk = lines[index+1:]
    else:
        chunk = lines[index+1:marker_string_indices[key+1]+1]

    # Try the name-based split first; fall back to the newline-based split
    # when it reports failure through its sentinel value.
    chunk_foreward, chunk_content = ut.chunk_parse_namebased(chunk)
    if chunk_foreward == "FAILURE_namebased":
        chunk_foreward, chunk_content = ut.chunk_parse_newlinebased(chunk)
    chunk_forewards.append(chunk_foreward)
    chunk_contents.append(chunk_content)
    



Start Index is 4535
End Index is 23579


In [6]:
from IPython.display import display

# Build the frame column-by-column from the three parallel lists. Each cell
# holds one Python list of lines. Passing a dict avoids the detour through a
# NumPy object array, which would collapse into a 3-D array (and break the
# frame) if every inner list happened to have the same length.
df = pd.DataFrame(
    {
        "headers": headers,
        "chunk_forewards": chunk_forewards,
        "chunk_contents": chunk_contents,
    }
)
display(df)

Unnamed: 0,headers,chunk_forewards,chunk_contents
0,"[Sultan Fatima binti Muhammad Mkubwa\n, PEACE ...","[\n, Sultan Fatima binti Muhammad Mkubwa was t...","[\n, PEACE AND SECURITY ¢ 71\n, \n, \n, \n, In..."
1,"[Mwana Kupona binti Msham\n, From A MOTHER’S A...","[\n, Mwana Kupona binti Msham was born in 1810...","[\n, \n, \n, 1. Come near, my dear daughter,\n..."
2,"[Emily Ruete, also known as\n, Princess Salma ...","[\n, —- Cd aa a ET i Ng\n, : seo ~— Tt = 1m\n,...","[\n, \n, \n, \n, As long as the child does not..."
3,"[Martha Thabi\n, My Gop, WHY HAVE YOU FORSAKEN...","[\n, Martha Thabi was born about 1870 and was ...","[\n, 86 + THE EIGHTEENTH AND NINETEENTH CENTUR..."
4,"[Jessie Nyagondwe\n, LET NoT Your HEART BE TRO...","[\n, Jessie Nyagondwe, the composer of this hy...","[\n, Let not your heart be troubled\n, By all ..."
...,...,...,...
108,"[Ruth Meena\n, THE FEMALE HUSBAND\n, Tanzania ...","[\n, Ruth Meena was born in 1946 and received ...","[\n, spending priorities on men, women, and ch..."
109,"[Martha Qorro\n, LANGUAGE IN TANZANIA\n, - Tan...","[\n, Martha Qorro is a prominent lobbyist for ...","[\n, \n, \n, It has been remarked that the sta..."
110,"[Monica Arac de Nyeko\n, IN THE STARS\n, Ugand...","[Born in 1979, Monica Arac de Nyeko comes from...","[\n, \n, \n, Where does your hope or security ..."
111,"[Margaret Wangut Mwema\n, THE STORY OF WACU\n,...","[\n, In this version of the tale of Wacu, an i...","[\n, \n, \n, Once upon a time there lived a wo..."


In [11]:
from ocrfixr import spellcheck

# Spot check of ocrfixr on a known OCR slip ("1" in place of "I").
# In the recorded output the sentence comes back unchanged.
print(spellcheck("1 am a man").fix())

# Print two lines from near the end of the third entry's content, each
# followed by what spellcheck(...).fix() returns for it.
for line in chunk_contents[2][-10:-8]:
    print(line, spellcheck(line).fix(), sep="\n")

1 am a man
wisdom laboriously acquired here, 1 am now better off than the others over

wisdom laboriously acquired here, 1 am now better off than the others over

there. That I have never been moré deceived and swindled than in the time of

there. That I have never been moré deceived and swindled than in the time of

