In [24]:
import numpy as np, pandas as pd
from collections import Counter
%run ./parsing.ipynb

In [34]:
# vectorization = TF only

rows = []
for doc, toks in tokens_by_book.items():
    for term, cnt in Counter(toks).items():
        rows.append((doc, term, cnt))
counts = pd.DataFrame(rows, columns=["doc","term","count"])

tf = counts.pivot_table(index="doc", columns="term", values="count")
tf = tf.div(tf.sum(axis=1).replace(0,1), axis=0)
tf.shape
# (11 books, 28668 vocabulary)

(11, 28668)

In [47]:
# IDF(t) = log(N / (1 + DF(t)))
N  = tf.shape[0]
df = (tf > 0).sum(axis=0)
idf = np.log(N / (1.0 + df))

# TF-IDF calc
tfidf = tf.mul(idf, axis=1)
tfidf.shape

(11, 28668)

In [60]:
topics = { #demo ONLY
    "romance": ["courtship","suitor","betrothal","proposal","engagement","fiancé","fiancée", "dowry","estate","heir",
                "heiress","governess","gentleman","lady", "propriety","decorum","reputation","scandal", 
                "duke","duchess","earl","viscount", "ballroom","carriage","parlor","drawing-room","letter",
                "guardian","ward", "affection","ardor","devotion","fidelity","virtue","duty","yearning"],
    "monster": ["monster","beast","creature","abomination","wretch","specter","phantom",
                "shade","ghost","ghoul","demon","devil","daemon","witch","warlock",
                "vampire","werewolf","lycanthrope","ogre","goblin","harpy","dragon","chimera",
                "kraken","leviathan","curse","hex","eldritch","uncanny","fang","claw"],
    "sea": ["sea","ocean","tide","wave","brine","gale","squall","tempest", "maelstrom","reef",
            "shoal","harbor","lighthouse","beacon","mariner","captain", "ship","deck","mast",
            "sail","rigging","keel","rudder","helm", "cabin","hold","port","starboard","compass","sextant","harpoon","whale"],
    "family": ["father","mother","son","daughter","brother","sister","husband","wife", "uncle","aunt",
                "cousin","niece","nephew","patriarch","matriarch","household", "guardian","ward","heir",
                "heiress","lineage","kin","kinship","bloodline", "inheritance","entail","namesake",
                "orphan","widow","widower","stepfather","stepmother"]

}
def topic_scores(tfidf_df, topic_terms):
    cols = [t for t in topic_terms if t in tfidf_df.columns]
    return tfidf_df[cols].mean(axis=1) if cols else pd.Series(0.0, index=tfidf_df.index)

for name, terms in topics.items():
    print(f"\nTop of topic: {name}")
    print(topic_scores(tfidf, terms).sort_values(ascending=False).head(10))



Top of topic: romance
doc
Alice's Adventures in Wonderland            0.000500
Beowulf- An Anglo-Saxon Epic Poem           0.000071
Sense and Sensibility                       0.000033
Little Women; Or, Meg, Jo, Beth, and Amy    0.000022
Pride and Prejudice                         0.000022
Frankenstein; Or, The Modern Prometheus     0.000017
Moby Dick; Or, The Whale                    0.000009
Grimms' Fairy Tales                         0.000008
Wuthering Heights                           0.000007
Dracula                                     0.000006
dtype: float64

Top of topic: monster
doc
Beowulf- An Anglo-Saxon Epic Poem           0.000204
Moby Dick; Or, The Whale                    0.000066
Frankenstein; Or, The Modern Prometheus     0.000057
Grimms' Fairy Tales                         0.000050
Wuthering Heights                           0.000025
Dracula                                     0.000024
Little Women; Or, Meg, Jo, Beth, and Amy    0.000008
Nora's twin sister            

1. The machine cannot differentiate between synonyms.
2. It has a disanvantage against poems, as they have a complicated structure that even confuse human if they do not analyze them correctly (e.g.: Beowulf gets grouped in romance just because the language is poetic and enigmatic.)
3. In terms of unique first names, TF-I

In [58]:
def top_n(series, n=10): 
    return series.sort_values(ascending=False).head(n)
doc = tf.index[0]
print("TF:", top_n(tf.loc[doc]).to_dict())
print("TFIDF:", top_n(tfidf.loc[doc]).to_dict())

TF: {'the': 0.06015670767428237, 'and': 0.03192735793790275, 'to': 0.026691564147627418, 'a': 0.023140011716461628, 'it': 0.02178529584065612, 'she': 0.020247510251903925, 'i': 0.01991798476859988, 'of': 0.018819566490919742, 'said': 0.016915641476274165, 'you': 0.015048330404217927}
TFIDF: {'alice': 0.018886033417534913, 'hatter': 0.003495382731596068, 'gryphon': 0.0034329651828175676, 'dormouse': 0.0024967019511400487, 'duchess': 0.0019980186487064643, 'turtle': 0.001703243419064585, 'rabbit': 0.001472295158852438, 'caterpillar': 0.0013320124324709762, 'mock': 0.0009267416128299357, 'hare': 0.0008949245083220702}
