In [107]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re, os, glob
from pathlib import Path

In [155]:
WORDS_ONLY = re.compile(r"[a-z]+") # storing only words cuz it's more convenient to analyze with words only

def load_text(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().replace("\r\n","\n").replace("\r","\n") # replace \r\n with \n and then replace \r with \n

def load_docs(pattern="../data/*.txt"): # all of my files begin end w .txt in data
    return {Path(p).stem: load_text(p) for p in glob.glob(pattern)}

docs = {Path(p).stem: load_text(p) for p in glob.glob("../data/*.txt")}
tokens = {name: words_only(txt) for name, txt in docs.items()}

def words_only(text: str):
    text = text.lower()
    return WORDS_ONLY.findall(text)

In [220]:
START = re.compile(r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*", re.I|re.S) # strip header
END   = re.compile(r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*", re.I|re.S) # strip footer
TOKEN  = re.compile(r"[a-z']+") # tokenization 1st step
RAW_CONTENT = re.compile(
    r"^(?:preface|chapter\s+(?:\d+|[ivxlcdm]+))\b", # ivxlcdm is normal AND roman numbers
    flags=re.I | re.M
) # most books start with preface or chapter x
HEAD_START = re.compile(
    r"^\s*(chapter\b|contents\b|epilogue\b|preface\b|prologue\b|etymology\b)",
    re.I
) #

In [221]:
def strip_content(t: str) -> str: # strip the gutenberg start and end points
    t = t.replace("\r\n","\n").replace("\r","\n")
    m = START.search(t)
    if m: 
        t = t[m.end():]
    m = END.search(t)
    if m: 
        t = t[:m.start()]
    return t.strip()

In [222]:
def starting_point(t: str) -> str: # start at preface or chapter sth sth line
    m = RAW_CONTENT.search(t)
    if m:
        line_start = t.rfind("\n", 0, m.start()) + 1 # teleport to the start of the line
        return t[line_start:].lstrip() # if cant find the keyword preface or chapter, return to the original
    return t

In [223]:
def additional_removals(text: str) -> str:
    kept = []
    for ln in text.splitlines():
        if HEAD_START.match(ln.strip()):
            continue
        kept.append(ln)
    return re.sub(r"\n{3,}", "\n\n", "\n".join(kept)).strip()

In [224]:
ARTIFACTS = {"chapter","chap","book","preface","contents","page",
             "project","gutenberg","ebook","transcriber","pgdp", "illustration", "copyright", "'"}
tokens = [t for t in tokens if t not in ARTIFACTS]
def remove_transcriber(text: str) -> str:
    bigNO = ("transcriber", "transcriber's note", "proofreading", "pgdp", "proofreaders", "illustration", "copyright", "'")
    return "\n".join(ln for ln in text.splitlines()
                     if not any(b in ln.lower() for b in bigNO))

In [225]:
def tokenize(body: str): # basic tokenization of the body explained in the handout
    body = body.lower()
    body = re.sub(r"[^a-z\s']", " ", body)
    return TOKEN.findall(body)

In [226]:
# cleaning data
def clean(raw: str):
    body = strip_content(raw)
    body = starting_point(body)  # drop everything before preface OR chapter cuz we dont want them
    body = additional_removals(body)
    body = remove_transcriber(body)
    toks = tokenize(body)
    return body, toks

In [227]:
# sorting words for each book by dictionary
pattern = "../data/*.txt"
tokens_by_book = {}
bodies_by_book = {}

for p in glob.glob(pattern):
    name = Path(p).stem
    raw = load_text(p)
    body, toks = clean(raw)
    bodies_by_book[name] = body
    tokens_by_book[name] = toks

book_names  = sorted(tokens_by_book.keys())
token_lists = [tokens_by_book[n] for n in book_names]

In [228]:
import numpy as np
import pandas as pd
from collections import Counter

# tokens_by_book: {doc: [w1, w2, ...]}
# build TF
rows = []
for doc, toks in tokens_by_book.items():
    for term, cnt in Counter(toks).items():
        rows.append((doc, term, cnt))
counts = pd.DataFrame(rows, columns=["doc","term","count"])

tf = counts.pivot_table(index="doc", columns="term", values="count", fill_value=0).astype(float)
tf = tf.div(tf.sum(axis=1).replace(0,1), axis=0)

# IDF (smoothed)
N  = tf.shape[0]
df = (tf > 0).sum(axis=0).astype(float)
idf = np.log(N / (1.0 + df))

tfidf = tf.mul(idf, axis=1)  # <-- your model uses ALL terms (names included)

In [229]:
def print_top_terms(tfidf_df, df_series, topk=10, min_df=1, max_df=None, must_include=None, must_exclude=None):
    """
    Only for display: choose terms by doc-frequency band, optionally exclude some tokens.
    Data in tfidf_df remains unchanged.
    """
    mask = (df_series >= min_df)
    if max_df is not None:
        mask &= (df_series <= max_df)
    terms = df_series.index[mask]

    if must_exclude:
        terms = [t for t in terms if t not in must_exclude]
    if must_include:  # ensure certain topic terms always allowed
        terms = sorted(set(terms) | set(must_include))

    for doc in tfidf_df.index:
        s = tfidf_df.loc[doc, terms].nlargest(topk)
        print(f"\n=== {doc} ===")
        for term, score in s.items():
            print(f"{term:20s} {score:.5f}")


In [230]:
print_top_terms(tfidf, df, topk=10, min_df=2)



=== Alice's Adventures in Wonderland ===
alice                0.01889
duchess              0.00200
turtle               0.00170
rabbit               0.00147
caterpillar          0.00133
mock                 0.00093
hare                 0.00089
jury                 0.00081
mouse                0.00073
dodo                 0.00062

=== Beowulf- An Anglo-Saxon Epic Poem ===
folk                 0.00192
armor                0.00123
battle               0.00121
danes                0.00120
heroes               0.00118
warriors             0.00110
sword                0.00110
atheling             0.00104
dragon               0.00103
thou                 0.00095

=== Dracula ===
van                  0.00256
lucy                 0.00238
jonathan             0.00160
arthur               0.00117
diary                0.00079
professor            0.00065
morris               0.00062
dr                   0.00049
whilst               0.00038
wolves               0.00037

=== Frankenstein; Or, The M