In [21]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re, os, glob
from pathlib import Path

In [52]:
# Matches runs of lowercase ASCII letters only, so digits, punctuation and
# whitespace all act as separators. tokenize() lowercases text before using it.
WORDS_ONLY = re.compile(r"[a-z]+")

def load_text(path):
    """Read a UTF-8 text file and return its contents with LF-only line endings.

    Any CRLF pair or lone CR left in the decoded text is rewritten to a single LF.
    """
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    # One pass: a CR optionally followed by LF becomes a single LF.
    return re.sub(r"\r\n?", "\n", raw)

#def load_docs(pattern="../data/*.txt"): # all of my files end w .txt in data
    #return {Path(p).stem: load_text(p) for p in glob.glob(pattern)}

# Raw text of every .txt file in ../data, keyed by file stem.
docs = {Path(p).stem: load_text(p) for p in glob.glob("../data/*.txt")}
# Word lists per book. WORDS_ONLY matches lowercase letters only, so the text
# is lowercased first (the previously called `words_only` helper does not exist).
tokens = {name: WORDS_ONLY.findall(txt.lower()) for name, txt in docs.items()}

In [65]:
# Project Gutenberg header marker; the text before and including it is stripped.
START = re.compile(
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*",
    re.IGNORECASE | re.DOTALL,
)
# Project Gutenberg footer marker; matches from the marker to the end of the text.
END = re.compile(
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*",
    re.IGNORECASE | re.DOTALL,
)

# First content heading: a line beginning with "preface", or with "chapter"
# followed by an arabic number or a roman numeral (letters i, v, x, l, c, d, m).
RAW_CONTENT = re.compile(
    r"^(?:preface|chapter\s+(?:\d+|[ivxlcdm]+))\b",
    flags=re.IGNORECASE | re.MULTILINE,
)

# Heading-style line: optional leading whitespace followed by one of these keywords.
HEAD_START = re.compile(
    r"^\s*(chapter\b|contents\b|epilogue\b|preface\b|prologue\b|etymology\b)",
    re.IGNORECASE,
)

In [54]:
def strip_content(text):
    """Cut away the Project Gutenberg header and footer.

    Line endings are normalised to LF first. If the START marker is found,
    everything up to and including it is discarded; if the END marker is found
    in what remains, it and everything after it is discarded. The result is
    returned with surrounding whitespace stripped.
    """
    body = text.replace("\r\n", "\n").replace("\r", "\n")

    header = START.search(body)
    if header is not None:
        body = body[header.end():]

    footer = END.search(body)
    if footer is not None:
        body = body[:footer.start()]

    return body.strip()

In [55]:
def starting_point(text):
    """Skip front matter by starting at the first preface/chapter heading.

    Returns the text from the beginning of the line on which RAW_CONTENT first
    matches, with leading whitespace removed. If no heading is found, the text
    is returned unchanged.
    """
    heading = RAW_CONTENT.search(text)
    if heading is None:
        # No "preface" / "chapter N" line: keep the original text.
        return text
    # Back up to the character just after the previous newline (0 if none).
    line_begin = text.rfind("\n", 0, heading.start()) + 1
    return text[line_begin:].lstrip()

In [56]:
def additional_removals(text):
    """Drop heading-style lines and squeeze long runs of blank lines.

    A line is removed when its stripped form matches HEAD_START. In the
    re-joined text, three or more consecutive newlines collapse to two, and
    the result is stripped of surrounding whitespace.
    """
    body_lines = [ln for ln in text.splitlines() if not HEAD_START.match(ln.strip())]
    rejoined = "\n".join(body_lines)
    return re.sub(r"\n{3,}", "\n\n", rejoined).strip()

In [57]:
def remove_transcriber(text):
    """Drop lines that carry transcriber or production boilerplate.

    A line is removed when its lowercased form contains any of the markers
    below as a substring; all other lines are kept in order and re-joined
    with newlines.
    """
    # Only distinctive boilerplate markers are matched at line level. Short
    # fragments such as "'", "ll", "mr", "mrs" and "dr" must not be listed
    # here: as substrings they hit ordinary prose ("all", "children", any
    # apostrophe) and would delete almost every line. Those fragments are
    # filtered per token by tokenize() via BUZZWORDS instead.
    markers = ("transcriber", "proofreading", "pgdp", "proofreaders",
               "illustration", "copyright")
    kept = []
    for line in text.splitlines():
        lowered = line.lower()  # lowercase once per line, not once per marker
        if not any(marker in lowered for marker in markers):
            kept.append(line)
    return "\n".join(kept)

In [61]:
# Tokens discarded after tokenization: structural words, Gutenberg boilerplate
# and fragments left over from titles and contractions.
BUZZWORDS = {
    "chapter", "chap", "book", "preface", "contents", "page",
    "project", "gutenberg", "ebook", "transcriber", "pgdp",
    "illustration", "copyright", "'", "ll", "mr", "mrs", "dr",
}


def tokenize(text):
    """Split text into lowercase word tokens, leaving out BUZZWORDS.

    The text is lowercased, every WORDS_ONLY match is taken as a token, and
    tokens that appear in BUZZWORDS are dropped. Returns a list of strings in
    their original order.
    """
    lowered = text.lower()
    return [word for word in WORDS_ONLY.findall(lowered) if word not in BUZZWORDS]

In [62]:
# Full cleaning pipeline for one book.
def clean(unclean):
    """Clean one raw book and tokenize the result.

    Applies, in order: strip_content (Gutenberg header/footer),
    starting_point (drop everything before the first preface/chapter line),
    additional_removals (heading lines, extra blank lines) and
    remove_transcriber (boilerplate lines).

    Returns a (content, toks) tuple: the cleaned text and the token list
    produced from it by tokenize.
    """
    content = unclean
    for step in (strip_content, starting_point, additional_removals, remove_transcriber):
        content = step(content)
    return content, tokenize(content)

In [63]:
# Clean every book in ../data and index the results by book name (file stem).
pattern = "../data/*.txt"
tokens_per_book = {}   # book name -> list of filtered tokens
content_per_book = {}  # book name -> cleaned text

for p in glob.glob(pattern):
    name = Path(p).stem
    unclean = load_text(p)
    content, toks = clean(unclean)
    # Store under the book's name; assigning to the bare variable would
    # replace the dict with the last book's text.
    content_per_book[name] = content
    tokens_per_book[name] = toks

# Book names in alphabetical order, with the token lists aligned to that order.
book_names  = sorted(tokens_per_book.keys())
token_lists = [tokens_per_book[n] for n in book_names]