In [10]:
import re, os, glob
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

In [37]:
def load_text(path: str) -> str:
    """Read a text file as UTF-8 and return it with Unix line endings.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    str
        The decoded contents with every "\\r\\n" and lone "\\r" turned into "\\n".

    Notes
    -----
    * The file is decoded with ``utf-8-sig`` so that a leading byte-order mark
      is dropped instead of surviving as an invisible U+FEFF character at the
      start of the text (``str.strip()`` does not remove it, and it prevents a
      ``^``-anchored pattern from matching the first line). Files without a
      BOM decode exactly as they would with plain ``utf-8``.
    * Undecodable bytes are silently discarded (``errors="ignore"``).
    """
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()
    # Normalise Windows and old-Mac line endings to "\n".
    return text.replace("\r\n", "\n").replace("\r", "\n")

def load_docs(pattern="data/*.txt"):
    """Load every file matching a glob pattern into a ``{stem: text}`` dict.

    Parameters
    ----------
    pattern : str, default "data/*.txt"
        Glob pattern handed to ``glob.glob``.

    Returns
    -------
    dict[str, str]
        Maps each matching file's name without directory or extension
        (``Path(p).stem``) to the text returned by ``load_text``. The dict is
        empty when nothing matches.

    Notes
    -----
    * Paths are sorted before loading: ``glob.glob`` makes no ordering
      guarantee, and the insertion order of this dict decides which documents
      show up in ``list(docs)[:n]`` style previews.
    * Two files with the same stem collapse into one entry; the one that comes
      last in sorted order wins.
    """
    paths = sorted(glob.glob(pattern))
    return {Path(p).stem: load_text(p) for p in paths}

# Load the corpus: one entry per text file, keyed by file stem.
DATA_PATTERN = "data/*.txt"
docs = load_docs(DATA_PATTERN)
print(f"Loaded {len(docs)} files:", list(docs)[:5])
if not docs:
    # With an empty corpus every later cell runs without error and produces
    # nothing, so say so here. The pattern is relative, so the working
    # directory is printed to help locate the data folder.
    print(f"WARNING: no files matched {DATA_PATTERN!r} (cwd: {os.getcwd()})")

Loaded 0 files: []


In [26]:
# Project Gutenberg boilerplate markers. Both are case-insensitive; DOTALL lets
# the header match run across line breaks up to the closing "***" and makes the
# footer's ".*" swallow everything through the end of the text.
START = re.compile(
    r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*",
    re.IGNORECASE | re.DOTALL,
)
END = re.compile(
    r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*",
    re.IGNORECASE | re.DOTALL,
)

# A token is a run of lowercase ASCII letters, digits and apostrophes.
TOKEN = re.compile(r"[a-z0-9']+")

# A heading at the start of a line: "preface", or "chapter" followed by either
# an Arabic number or a run of Roman-numeral letters (i, v, x, l, c, d, m).
RAW_CONTENT = re.compile(
    r"^(?:preface|chapter\s+(?:\d+|[ivxlcdm]+))\b",
    re.IGNORECASE | re.MULTILINE,
)

In [28]:
def strip_content(t: str) -> str:
    """Cut the Project Gutenberg header and footer off a raw text.

    Line endings are normalised to "\\n" first. Everything up to and including
    the first ``START`` match is dropped, then everything from the first
    ``END`` match onwards. A marker that is not found leaves that side
    untouched. The result is stripped of leading and trailing whitespace.
    """
    text = t.replace("\r\n", "\n").replace("\r", "\n")

    header = START.search(text)
    if header is not None:
        text = text[header.end():]

    # The footer is searched for only in what remains after the header cut.
    footer = END.search(text)
    stop = footer.start() if footer is not None else len(text)
    return text[:stop].strip()

In [29]:
def starting_point(t: str) -> str:
    """Drop everything before the first preface/chapter heading.

    Searches ``t`` with ``RAW_CONTENT``. When a heading is found, the text from
    the beginning of that heading's line onwards is returned with leading
    whitespace removed. When no heading is found, ``t`` is returned unchanged.
    """
    hit = RAW_CONTENT.search(t)
    if hit is None:
        return t

    # Rewind to just after the newline before the match (rfind gives -1 when
    # there is none, which the +1 turns into index 0).
    begin = t.rfind("\n", 0, hit.start()) + 1
    return t[begin:].lstrip()

In [30]:
def tokenize(body: str):
    """Split text into lowercase tokens (the basic scheme from the handout).

    The text is lowercased; every character that is not a lowercase ASCII
    letter, a digit, whitespace or an apostrophe is replaced by a space; then
    all ``TOKEN`` matches are returned as a list of strings.
    """
    return TOKEN.findall(re.sub(r"[^a-z0-9\s']", " ", body.lower()))

In [31]:
# cleaning data
def clean(raw: str):
    """Run the full cleaning pipeline on one raw document.

    The raw text goes through ``strip_content`` and then ``starting_point``;
    the resulting body is tokenized with ``tokenize``.

    Returns
    -------
    tuple
        ``(body, tokens)``: the cleaned body text and its token list.
    """
    body = starting_point(strip_content(raw))
    return body, tokenize(body)

In [38]:
# Clean every loaded document, keeping the body text and the token list in two
# dicts that share the same keys (and key order) as `docs`.
bodies: dict = {}
tokens: dict = {}
for name, raw in docs.items():
    bodies[name], tokens[name] = clean(raw)

In [40]:
# Sanity check: for the first two documents, print the name, the size of the
# cleaned body, the token count, and the first 200 characters of the body.
for name in list(bodies)[:2]:
    # The leading marker is an em dash; it had been stored as the mis-decoded
    # byte sequence "â€”".
    print("—", name, "| body chars:", len(bodies[name]), "| tokens:", len(tokens[name]))
    print(bodies[name][:200], "...\n")