In [107]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re, os, glob
from pathlib import Path

In [155]:
# Matches runs of lowercase ASCII letters only; digits, apostrophes and
# punctuation never become part of a match, which keeps word-level analysis simple.
WORDS_ONLY = re.compile(r"[a-z]+")

def load_text(path: str) -> str:
    """Read the file at `path` as UTF-8 text and return it with LF-only line endings.

    Bytes that cannot be decoded as UTF-8 are dropped (errors="ignore") rather
    than raising, so a file with stray bytes still loads.
    """
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    # Collapse CRLF pairs first, then any carriage returns that are left over.
    without_crlf = contents.replace("\r\n", "\n")
    return without_crlf.replace("\r", "\n")

def load_docs(pattern="../data/*.txt"):
    """Load every file matching the glob `pattern` into a dict.

    Parameters
    ----------
    pattern : str
        Glob pattern for the input files; the default picks up every file
        ending in .txt inside ../data.

    Returns
    -------
    dict
        Maps each file's stem (file name without extension) to its text as
        returned by `load_text`. Two matches with the same stem overwrite
        each other; the later path in sorted order wins.
    """
    # glob.glob makes no ordering promise (it depends on the file system), so
    # sort the paths to get the same dict order on every machine and every run.
    return {Path(p).stem: load_text(p) for p in sorted(glob.glob(pattern))}

def words_only(text: str):
    """Return every run of ASCII letters in `text`, lower-cased, in order of appearance.

    The text is lower-cased first and then scanned with WORDS_ONLY, so digits,
    apostrophes and punctuation act as separators and never appear in a result.
    """
    return WORDS_ONLY.findall(text.lower())

# words_only is defined before it is used so the cell also runs on a fresh kernel.
# load_docs() with its default pattern reads the same ../data/*.txt files.
docs = load_docs()
tokens = {name: words_only(txt) for name, txt in docs.items()}

In [156]:
# Opening Project Gutenberg marker: "*** START OF THE/THIS PROJECT GUTENBERG EBOOK ... ***".
# re.S lets the lazy `.*?` cross line breaks until the closing "***"; re.I ignores case.
START = re.compile(r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*", re.I|re.S)
# Closing Project Gutenberg marker; with re.S the trailing `.*` extends the match
# to the very end of the text, so the match covers the whole footer.
END   = re.compile(r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*", re.I|re.S)
# A token is a run of lowercase ASCII letters, digits and apostrophes.
TOKEN  = re.compile(r"[a-z0-9']+")
# A line that begins with "preface", or with "chapter" followed by whitespace and
# either an Arabic number (\d+) or a Roman numeral ([ivxlcdm]+).
# re.M makes `^` match at the start of every line; re.I ignores case.
RAW_CONTENT = re.compile(
    r"^(?:preface|chapter\s+(?:\d+|[ivxlcdm]+))\b",
    flags=re.I | re.M
)
# A line that opens (after optional whitespace) with one of the structural heading
# words chapter / contents / epilogue / preface / prologue / etymology, case-insensitive.
HEAD_START = re.compile(
    r"^\s*(chapter\b|contents\b|epilogue\b|preface\b|prologue\b|etymology\b)",
    re.I
)

In [157]:
def strip_content(t: str) -> str:
    """Return the text found between the Project Gutenberg START and END markers.

    Line endings are normalised to LF first. Everything up to and including the
    START marker is dropped, then everything from the END marker onwards. A
    marker that is not found is simply skipped. The result is stripped of
    leading and trailing whitespace.
    """
    text = t.replace("\r\n", "\n").replace("\r", "\n")

    header = START.search(text)
    if header is not None:
        text = text[header.end():]

    # The footer is searched in the already-trimmed text, so offsets stay consistent.
    footer = END.search(text)
    if footer is not None:
        text = text[:footer.start()]

    return text.strip()

In [158]:
def starting_point(t: str) -> str:
    """Drop everything before the first line matched by RAW_CONTENT.

    RAW_CONTENT looks for a line starting with "preface" or "chapter <number>".
    If such a line exists, the text from the start of that line onwards is
    returned with leading whitespace removed; otherwise `t` comes back unchanged.
    """
    hit = RAW_CONTENT.search(t)
    if hit is None:
        # No preface/chapter heading found: keep the original text.
        return t
    # Index just after the newline that precedes the match (0 when there is none),
    # i.e. the beginning of the line containing the heading.
    cut = t.rfind("\n", 0, hit.start()) + 1
    return t[cut:].lstrip()

In [159]:
def additional_removals(text: str) -> str:
    """Remove heading lines and squeeze long runs of blank lines.

    Any line whose stripped form matches HEAD_START (chapter, contents,
    epilogue, preface, prologue, etymology at the start of the line) is
    discarded. The remaining lines are re-joined with LF, runs of three or more
    newlines are reduced to two, and the result is stripped.
    """
    body_lines = [
        line for line in text.splitlines()
        if not HEAD_START.match(line.strip())
    ]
    rejoined = "\n".join(body_lines)
    return re.sub(r"\n{3,}", "\n\n", rejoined).strip()

In [183]:
# Words that come from Project Gutenberg boilerplate or book scaffolding
# rather than from the text itself.
ARTIFACTS = {"chapter","chap","book","preface","contents","page",
             "project","gutenberg","ebook","transcriber","pgdp"}
# `tokens` maps book name -> list of words. Iterating the dict directly only
# visits the book names, so the filter has to run on each book's word list.
# Building a new dict also keeps this cell safe to re-run.
tokens = {name: [word for word in words if word not in ARTIFACTS]
          for name, words in tokens.items()}
def remove_transcriber(text: str) -> str:
    """Drop every line that mentions a transcriber or proofreading credit.

    A line is removed when its lower-cased form contains any of the marker
    substrings below. The surviving lines are re-joined with LF, so a trailing
    newline in the input is not preserved.
    """
    markers = ("transcriber", "transcriber's note", "proofreading", "pgdp", "proofreaders")
    survivors = []
    for line in text.splitlines():
        lowered = line.lower()
        if any(marker in lowered for marker in markers):
            continue
        survivors.append(line)
    return "\n".join(survivors)

In [184]:
def tokenize(body: str):
    """Split `body` into lowercase tokens of letters, digits and apostrophes.

    The text is lower-cased, every character that is not a-z, 0-9, whitespace
    or an apostrophe is replaced by a space, and TOKEN then extracts the tokens
    in order of appearance.
    """
    lowered = body.lower()
    scrubbed = re.sub(r"[^a-z0-9\s']", " ", lowered)
    return TOKEN.findall(scrubbed)

In [185]:
# cleaning data
def clean(raw: str):
    """Run the full cleaning pipeline on one raw book.

    Stages, in order: strip_content (Gutenberg header/footer), starting_point
    (skip to the first preface/chapter line), additional_removals (heading
    lines), remove_transcriber (transcriber/proofreading lines).

    Returns a tuple (body, toks): the cleaned text and its tokens from tokenize.
    """
    stages = (strip_content, starting_point, additional_removals, remove_transcriber)
    body = raw
    for stage in stages:
        body = stage(body)
    return body, tokenize(body)

In [186]:
# Clean every book under ../data and keep the results in dicts keyed by the
# file stem: one dict for the cleaned text, one for the token list.
pattern = "../data/*.txt"
bodies_by_book = {}
tokens_by_book = {}

for p in glob.glob(pattern):
    raw = load_text(p)
    name = Path(p).stem
    body, toks = clean(raw)
    tokens_by_book[name] = toks
    bodies_by_book[name] = body

# Alphabetical book order, with the token lists lined up in that same order.
book_names  = sorted(tokens_by_book)
token_lists = [tokens_by_book[book] for book in book_names]