In [24]:
import numpy as np, pandas as pd
from collections import Counter
%run ./parsing.ipynb

In [25]:
# vectorization

rows = []
for doc, toks in tokens_by_book.items():
    for term, cnt in Counter(toks).items():
        rows.append((doc, term, cnt))
counts = pd.DataFrame(rows, columns=["doc","term","count"])

tf = counts.pivot_table(index="doc", columns="term", values="count", fill_value=0).astype(float)
tf = tf.div(tf.sum(axis=1).replace(0,1), axis=0)
tf.shape
# (11 books, 28668 vocabulary)

(11, 28668)

In [26]:
# IDF (smoothed): idf(t) = log( N / (1 + df(t)) )
N  = tf.shape[0]
df = (tf > 0).sum(axis=0).astype(float)
idf = np.log(N / (1.0 + df))

# TF-IDF
tfidf = tf.mul(idf, axis=1)
tfidf.shape

(11, 28668)

In [28]:
topics = {
    "romance": ["love","marriage","sister","heart"],
    "monster": ["monster","creature","horror","fear"],
    "sea": ["whale","sea","ship","captain"],
}
def topic_scores(tfidf_df, topic_terms):
    cols = [t for t in topic_terms if t in tfidf_df.columns]
    return tfidf_df[cols].mean(axis=1) if cols else pd.Series(0.0, index=tfidf_df.index)

for name, terms in topics.items():
    print(f"\nTop for topic: {name}")
    print(topic_scores(tfidf, terms).sort_values(ascending=False).head(5))



Top for topic: romance
doc
Alice's Adventures in Wonderland    -0.000011
Moby Dick; Or, The Whale            -0.000012
Beowulf- An Anglo-Saxon Epic Poem   -0.000012
Grimms' Fairy Tales                 -0.000023
Dracula                             -0.000031
dtype: float64

Top for topic: monster
doc
Beowulf- An Anglo-Saxon Epic Poem          0.000092
Frankenstein; Or, The Modern Prometheus    0.000039
Moby Dick; Or, The Whale                   0.000016
Grimms' Fairy Tales                       -0.000002
Nora's twin sister                        -0.000002
dtype: float64

Top for topic: sea
doc
Moby Dick; Or, The Whale                   0.002008
Dracula                                    0.000028
Nora's twin sister                         0.000019
Grimms' Fairy Tales                        0.000005
Frankenstein; Or, The Modern Prometheus    0.000004
dtype: float64


In [32]:
import numpy as np, pandas as pd
from collections import Counter

# tokens_by_book: {doc: [w1, w2, ...]}

# 1) Long-form counts
rows = []
for doc, toks in tokens_by_book.items():
    for term, cnt in Counter(toks).items():
        rows.append((doc, term, cnt))
counts = pd.DataFrame(rows, columns=["doc","term","count"])

print(counts)

                                            doc          term  count
0                      Moby Dick; Or, The Whale      pictures     15
1                      Moby Dick; Or, The Whale            of   6598
2                      Moby Dick; Or, The Whale       whaling    131
3                      Moby Dick; Or, The Whale        scenes     12
4                      Moby Dick; Or, The Whale         stone     20
...                                         ...           ...    ...
80513  Little Women; Or, Meg, Jo, Beth, and Amy          item      1
80514  Little Women; Or, Meg, Jo, Beth, and Amy          html      1
80515  Little Women; Or, Meg, Jo, Beth, and Amy       listing      1
80516  Little Women; Or, Meg, Jo, Beth, and Amy    originally      1
80517  Little Women; Or, Meg, Jo, Beth, and Amy  subsequently      1

[80518 rows x 3 columns]
