In [1]:
import os
import nltk 
import string
import sklearn.feature_extraction.text as sklt
import sklearn.cluster as sklc
import numpy as np

In [2]:
# k - means clustering

# os.chdir(os.path.join(os.path.expanduser('~'), 'Downloads/Lautenberg'))
os.chdir(os.path.join(os.path.expanduser('~'), 'Documents/work/teaching/bi_ds/GRA-4153/examples/data/Lautenberg'))

stemmer = nltk.stem.PorterStemmer()

token_dict = {}
def tokenise(text):
    stems = []
    for item in nltk.word_tokenize(text):
        stems.append(stemmer.stem(item))
    return stems

In [3]:
# Run loop to pre-process and get tokens

for fname in os.listdir():
    f = open(fname, "r")
    doc = f.read().strip().replace("Senator Frank R  Lautenberg", "").replace("Press Release        of        Senator Lautenberg", "").replace("Questions or Comments", "").strip().lower()
    token_dict[fname] = doc.translate(str.maketrans('', '', string.punctuation))

token_dict


{'27Sep2007Lautenberg87.txt': 'lautenberg hails senate passage of hate crimes legislation                                                                                                                                                                                                                                                                                                                                                            contact  press office  202  224 3224 thursday  september 27  2007       washington  d c    sen  frank r  lautenberg  d nj  issued the following statement on the senate passage of legislation strengthening the ability of federal and local law enforcement officials to prosecute hate crimes    this legislation sends a powerful message that america does not tolerate violent acts of hatred  it is long overdue and brings us closer to a day when everyone in our country can live without fear of hate based violence',
 '15Mar2007Lautenberg258.txt': 'lautenberg lott p

In [4]:
tfidf = sklt.TfidfVectorizer(tokenizer=tokenise, stop_words='english')    
tfidf

In [5]:
tfs = tfidf.fit_transform(token_dict.values())
tfs




<558x7706 sparse matrix of type '<class 'numpy.float64'>'
	with 83008 stored elements in Compressed Sparse Row format>

In [6]:
vocab = dict((v, k) for k, v in tfidf.vocabulary_.items())
vocab

{4217: 'lautenberg',
 3460: 'hail',
 6272: 'senat',
 5210: 'passag',
 3515: 'hate',
 2174: 'crime',
 4255: 'legisl',
 2047: 'contact',
 5514: 'press',
 5008: 'offic',
 241: '202',
 291: '224',
 389: '3224',
 7019: 'thursday',
 6286: 'septemb',
 334: '27',
 233: '2007',
 7497: 'washington',
 2228: 'd',
 1579: 'c',
 6269: 'sen',
 3185: 'frank',
 5670: 'r',
 4881: 'nj',
 3991: 'issu',
 3124: 'follow',
 6655: 'statement',
 6726: 'strengthen',
 740: 'abil',
 3014: 'feder',
 4354: 'local',
 4219: 'law',
 2758: 'enforc',
 5009: 'offici',
 5597: 'prosecut',
 6997: 'thi',
 6274: 'send',
 5470: 'power',
 4601: 'messag',
 959: 'america',
 2531: 'doe',
 7057: 'toler',
 7424: 'violent',
 792: 'act',
 3516: 'hatr',
 4369: 'long',
 5117: 'overdu',
 1503: 'bring',
 1850: 'closer',
 2254: 'day',
 2867: 'everyon',
 2128: 'countri',
 4342: 'live',
 3007: 'fear',
 1238: 'base',
 7423: 'violenc',
 4382: 'lott',
 5383: 'plan',
 3222: 'fulli',
 3226: 'fund',
 972: 'amtrak',
 3795: 'includ',
 3240: 'fy',
 32:

In [8]:
np.random.seed(45234)

clust = sklc.KMeans(5).fit(tfs)
mu = clust.cluster_centers_
pi = clust.labels_
np.unique(pi, return_counts = True)

def get_top_words(vocab, mu, k, n = 10):
    diff = mu[k, :] - mu[[x for x in range(mu.shape[0]) if x != k], :].mean(axis = 0)   
    return [vocab[x] for x in np.argsort(diff)[-n:]]

for i in range(5):
    print(get_top_words(vocab, mu, i))


['agenc', 'beach', 'test', 'report', 'releas', 'chemic', 'site', 'toxic', 'ringwood', 'epa']
['famili', 'sponsor', 'sen', 'libyan', 'drug', 'medicar', 'militari', 'terror', 'victim', 'libya']
['counti', 'flood', 'airport', '000', 'project', 'jersey', 'fund', 'amtrak', 'new', 'rail']
['judg', 'build', 'warm', 'global', 'energi', 'statement', 'oil', 'presid', 'gun', 'iraq']
['plant', 'chertoff', 'risk', 'screener', 'tsa', 'dh', 'port', 'homeland', 'chemic', 'secur']


In [9]:
# Latent Dirichlet Allocation

from sklearn.decomposition import LatentDirichletAllocation
import pandas as pd

np.random.seed(154)

K = 10
LDA = LatentDirichletAllocation(n_components=K)
lda = LDA.fit(tfs)

beta = pd.DataFrame((lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]).T)
beta.columns = ["topic_" + str(x) for x in range(1, K+1)]

vocab = dict(sorted(vocab.items()))
beta["word"] = vocab.values()

df = pd.wide_to_long(beta, stubnames= "topic_", i = "word", j = "topic").rename({"topic_":"beta"}, axis = 1)

df = df.assign(average_beta = lambda x: x.groupby("word").transform('mean')["beta"],
                diff = lambda x: x["beta"] - x["average_beta"])
    
tbl = pd.DataFrame(df.groupby("topic")["diff"].nlargest(5))
tbl = tbl.droplevel(2, axis = 0)

In [14]:
tbl

Unnamed: 0_level_0,Unnamed: 1_level_0,diff
topic,word,Unnamed: 2_level_1
1,new,0.004666
1,lautenberg,0.004498
1,s,0.004432
1,secur,0.003963
1,jersey,0.003862
2,alito,0.000927
2,nicknam,0.00078
2,isra,0.00075
2,ferri,0.000737
2,pipelin,0.00073
