## Women Writing Africa, Data Mining Process

In [1]:
import re
import os

import numpy as np
import utils as ut
import pandas as pd

from pathlib import Path
from importlib import reload

- Convert PDF to txt document

In [2]:
# Source PDF of the anthology; the extracted text is written next to it
# under the same file name with a .txt extension.
pdf_path = Path("Datasets", "women writing africa eastern region.pdf")
out_txt_path = pdf_path.with_suffix(".txt")

In [None]:
# The conversion only needs to be run once, so it is skipped when the text
# file already exists; this keeps "Restart & Run All" from redoing it.
# Delete the .txt file to force a fresh conversion (for example after an
# interrupted run left a partial file behind).
reload(ut)
if not out_txt_path.exists():
    ut.pdftotxt(pdf_path, out_txt_path, progress_updates=True)

## Parse txt document

Goals:
- Identify content pattern, ex: (Author Name -> Title -> Country, Year, Language) precedes all content blocks
- Extract individual documents, including forewords
- Devise method for removing forewords
- Create a document database of Metadata: Content

Extraction pattern examples:

```
1.
Siti binti Saad
FOUR SONGS
Tanzania 1920s Kiswahili

2.
Nellie Grant

LETTERS FROM AFRICA TO A DAUGHTER
IN ENGLAND

Kenya 1939-1963 English
```

In [11]:
reload(ut)
with open(out_txt_path) as f:
    lines = f.readlines()

# Countries that may appear on a "Country Year Language" marker line.
country_list = ["Tanzania", "Kenya", "Malawi", "Zambia", "Uganda", "Swaziland"]

# Consecutive lines that open the first entry and that start the closing
# contributors section; they delimit the part of the text that is parsed.
start_phrase = ["Sultan Fatima binti Muhammad Mkubwa\n", "PEACE AND SECURITY\n", "Tanzania 1711 Kiswahili\n"]
end_phrase = ["CONTRIBUTORS\n", "\n", "EDITORS\n", "\n"]

# Reset on every run so that a failed search cannot silently reuse indices
# left in the kernel by an earlier execution.
start_index = None
end_index = None
for i in range(len(lines)-3):
    if lines[i:i+3] == start_phrase:
        print("Start Index is", i)
        start_index = i
    if lines[i:i+4] == end_phrase:
        print("End Index is", i)
        end_index = i

if start_index is None or end_index is None:
    raise ValueError(
        "Could not locate the start and/or end phrase in {}".format(out_txt_path)
    )

# Keep 10 lines of context before the first entry and stop 3 lines before the
# end phrase. Both bounds are clamped at 0 because a negative bound would be
# interpreted relative to the end of the list.
lines = lines[max(start_index - 10, 0):max(end_index - 3, 0)]

# Collect the indices of lines that look like "Country Year Language" markers.
# The last two lines are not examined (same range as before).
marker_string_indices = []
for i in range(len(lines)-2):
    line = lines[i]

    # Year check, using the bounds 1600 and 2099 passed to ut.contains_year
    if not ut.contains_year(line, year_min="1600", year_max="2099"):
        continue

    # The line must mention one of the approved country names
    if not any(country in line for country in country_list):
        continue

    # The line must have between 3 and 5 whitespace-separated words
    if 2 < len(line.split()) < 6:
        marker_string_indices.append(i)

Start Index is 4535
End Index is 23579


## Parse into Header, Chunk Foreword, and Chunk Content

In [14]:
reload(ut)
with open(out_txt_path) as f:
    lines = f.readlines()

# Countries that may appear on a "Country Year Language" marker line.
country_list = ["Tanzania", "Kenya", "Malawi", "Zambia", "Uganda", "Swaziland"]

# Consecutive lines that open the first entry and that start the closing
# contributors section; they delimit the part of the text that is parsed.
start_phrase = ["Sultan Fatima binti Muhammad Mkubwa\n", "PEACE AND SECURITY\n", "Tanzania 1711 Kiswahili\n"]
end_phrase = ["CONTRIBUTORS\n", "\n", "EDITORS\n", "\n"]

# Reset on every run so that a failed search cannot silently reuse indices
# left in the kernel by an earlier execution.
start_index = None
end_index = None
for i in range(len(lines)-3):
    if lines[i:i+3] == start_phrase:
        print("Start Index is", i)
        start_index = i
    if lines[i:i+4] == end_phrase:
        print("End Index is", i)
        end_index = i

if start_index is None or end_index is None:
    raise ValueError(
        "Could not locate the start and/or end phrase in {}".format(out_txt_path)
    )

# Keep 5 lines of context before the first entry (the header extraction further
# down looks 5 lines back from each marker) and stop 3 lines before the end
# phrase. Both bounds are clamped at 0 because a negative bound would be
# interpreted relative to the end of the list.
lines = lines[max(start_index - 5, 0):max(end_index - 3, 0)]

# Treat very short lines (fewer than 6 characters, newline included) as blank
# lines, e.g. entry numbers such as "1.\n".
lines = ["\n" if ("\n" in val and len(val) < 6) else val for val in lines]

# Collect the indices of lines that look like "Country Year Language" markers.
# The last two lines are not examined (same range as before).
marker_string_indices = []
for i in range(len(lines)-2):
    line = lines[i]

    # Year check, using the bounds 1600 and 2099 passed to ut.contains_year
    if not ut.contains_year(line, year_min="1600", year_max="2099"):
        continue

    # The line must mention one of the approved country names
    if not any(country in line for country in country_list):
        continue

    # The line must have between 3 and 5 whitespace-separated words
    if 2 < len(line.split()) < 6:
        marker_string_indices.append(i)

def header_parse(header_group):
    """Pick the header lines out of a group of lines around a marker.

    Each element of ``header_group`` is classified as blank (exactly
    ``"\n"``) or non-blank, and the resulting blank/non-blank pattern decides
    which elements are returned.

    Args:
        header_group: Sequence of text lines. The caller in this notebook
            passes six lines in reversed order, marker line first.

    Returns:
        The selected lines, in the same order as in ``header_group``. When
        no known pattern matches, ``header_group`` itself is returned.
    """
    # 1 marks a non-blank line, 0 a line that is exactly "\n".
    mask = [int(entry != "\n") for entry in header_group]

    # Most common layout: the first three lines are all non-blank.
    if mask[:3] == [1, 1, 1]:
        return header_group[0:3]

    # Simple layouts: keep every non-blank line.
    keep_all_nonblank = (
        [1, 0, 1, 1, 0, 1],
        [1, 0, 1, 0, 1, 1],
        [1, 0, 1, 1, 1, 0],
        [1, 0, 1, 0, 1, 0],
    )
    if mask in keep_all_nonblank:
        return [entry for entry, flag in zip(header_group, mask) if flag == 1]

    # More complex layouts: keep a fixed selection of positions.
    if mask == [1, 0, 1, 1, 1, 1]:
        return [header_group[pos] for pos in (0, 2, 3)]
    if mask == [1, 1, 0, 1, 1, 0]:
        return [header_group[pos] for pos in (0, 1, 3, 4)]

    # Unknown layout: hand the group back unchanged.
    return header_group

# For every marker line, collect the entry's header lines and split the text
# that follows it into a foreword part and a content part.
headers = []
chunk_forewards = []
chunk_contents = []
for key, index in enumerate(marker_string_indices):
    # The marker line plus up to five lines before it. The start is clamped
    # at 0 because a negative start would count from the end of the list.
    header_group = lines[max(index - 5, 0):index + 1]
    # Lines containing " + " are replaced by a blank line so they are not
    # picked up as header text.
    header_group = ["\n" if " + " in i else i for i in header_group]

    # header_parse works on the group with the marker line first, so reverse
    # before the call and restore document order afterwards.
    ## Addressing each format individually,
    ## this is messy but works most of the time
    header_group.reverse()
    header_group = header_parse(header_group)
    header_group.reverse()
    headers.append(header_group)

    # The chunk runs from the line after this marker up to and including the
    # next marker line, or to the end of the text for the last marker.
    if key == len(marker_string_indices) - 1:
        chunk = lines[index+1:]
    else:
        chunk = lines[index+1:marker_string_indices[key+1]+1]

    try:
        chunk_foreward, chunk_content = ut.chunk_parse_namebased(chunk)
    except Exception as e:
        # Show which entry failed, then stop with the original exception and
        # traceback instead of masking it with an unrelated error.
        print(e)
        print(key)
        print(header_group)
        raise
    chunk_forewards.append(chunk_foreward)
    chunk_contents.append(chunk_content)




Start Index is 4535
End Index is 23579
