# Topic Modeling

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.datasets import fetch_20newsgroups

## Import dataset

In [75]:
# Fetch the 20 Newsgroups posts (subset='all') with headers, footers and quoted
# text removed, then wrap the raw post strings in a one-column DataFrame.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame(newsgroups['data'])

## Topics
- Graphics
- MS Windows
- IBM PC Hardware
- Mac Hardware
- X Window System
- Forsale
- Autos
- Motorcycles
- Baseball
- Hockey
- Politics: MISC
- Politics: Guns
- Politics: Mideast
- Cryptography
- Electronics
- Medicine 
- Space
- Religion
- Atheism
- Christianity

In [76]:
len(df)

18846

In [77]:
# Name the single text column 'n.Text' (this modifies df in place).
df.columns = ['n.Text']

In [78]:
# Copy the row labels into an ordinary 'index' column so that a document can
# later be selected by filtering on that column.
df['index'] = df.index

In [79]:
df

Unnamed: 0,n.Text,index
0,\n\nI am sure some bashers of Pens fans are pr...,0
1,My brother is in the market for a high-perform...,1
2,\n\n\n\n\tFinally you said what you dream abou...,2
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,18841
18842,\nNot in isolated ground recepticles (usually ...,18842
18843,I just installed a DX2-66 CPU in a clone mothe...,18843
18844,\nWouldn't this require a hyper-sphere. In 3-...,18844


In [80]:
import re
from bs4 import BeautifulSoup

In [81]:
len(df)

18846

In [82]:
# Show the raw text of the document labelled 0, using a single .loc lookup.
df.loc[0, 'n.Text']

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [83]:
# Concatenate every post, in row order, into one long string.
# str.join does this in one call, replacing a Python-level loop that performed
# a label lookup per row and grew the string incrementally with `+=`.
s = ''.join(df['n.Text'])

In [84]:
# NOTE(review): initialised empty and never appended to in the visible cells —
# confirm whether this list is still needed.
text_strings = []

In [85]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so any in-place change made through either name is visible through both.
text_df = df

In [86]:
text_df

Unnamed: 0,n.Text,index
0,\n\nI am sure some bashers of Pens fans are pr...,0
1,My brother is in the market for a high-perform...,1
2,\n\n\n\n\tFinally you said what you dream abou...,2
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,18841
18842,\nNot in isolated ground recepticles (usually ...,18842
18843,I just installed a DX2-66 CPU in a clone mothe...,18843
18844,\nWouldn't this require a hyper-sphere. In 3-...,18844


## Data Preprocessing

In [89]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so later stochastic steps start
# from a known state.
np.random.seed(2018)
import nltk
# Fetch the WordNet corpus through NLTK (presumably required by the
# WordNetLemmatizer used during preprocessing).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/akomand/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [90]:
# Single Porter stemmer instance shared by lemmatize_stemming for every token.
stemmer = nltk.PorterStemmer()

In [91]:
# Created once and reused: previously a new WordNetLemmatizer was constructed
# for every single token processed.
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce one token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with WordNet, and
    the resulting lemma is then passed to the module-level ``stemmer``.

    Args:
        text: The token to normalise.

    Returns:
        The value returned by ``stemmer.stem`` for the lemmatized token.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

In [92]:
def preprocess(text):
    """Tokenise ``text`` and return the normalised tokens worth keeping.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only when it is longer than three characters and is not one of gensim's
    ``STOPWORDS``; each kept token is passed through ``lemmatize_stemming``.

    Args:
        text: Raw document text.

    Returns:
        list: The normalised tokens, in the order they were produced.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [93]:
# Pick the text of the document whose 'index' value is 1. The text column is
# selected by name, so the lookup no longer depends on it being the first column.
doc_sample = text_df.loc[text_df['index'] == 1, 'n.Text'].iloc[0]
print('original document: ')
# str.split already returns a list; copying it element by element is unnecessary.
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['My', 'brother', 'is', 'in', 'the', 'market', 'for', 'a', 'high-performance', 'video', 'card', 'that', 'supports\nVESA', 'local', 'bus', 'with', '1-2MB', 'RAM.', '', 'Does', 'anyone', 'have', 'suggestions/ideas', 'on:\n\n', '', '-', 'Diamond', 'Stealth', 'Pro', 'Local', 'Bus\n\n', '', '-', 'Orchid', 'Farenheit', '1280\n\n', '', '-', 'ATI', 'Graphics', 'Ultra', 'Pro\n\n', '', '-', 'Any', 'other', 'high-performance', 'VLB', 'card\n\n\nPlease', 'post', 'or', 'email.', '', 'Thank', 'you!\n\n', '', '-', 'Matt\n']


 tokenized and lemmatized document: 
['brother', 'market', 'high', 'perform', 'video', 'card', 'support', 'vesa', 'local', 'suggest', 'idea', 'diamond', 'stealth', 'local', 'orchid', 'farenheit', 'graphic', 'ultra', 'high', 'perform', 'card', 'post', 'email', 'thank', 'matt']


In [94]:
# Replace missing posts with '', coerce every value to str, then run each post
# through preprocess to get one token list per document.
processed_docs = (
    text_df['n.Text']
    .fillna('')
    .astype(str)
    .map(preprocess)
)

In [109]:
# reset_index() turns the Series into a DataFrame with the old index as a column.
# NOTE(review): this name is easy to confuse with `processed_docs`, and it is
# not referenced again in the visible cells — confirm it is needed.
preprocessed_docs = processed_docs.reset_index()

In [110]:
processed_docs

0        [sure, basher, pen, fan, pretti, confus, lack,...
1        [brother, market, high, perform, video, card, ...
2        [final, say, dream, mediterranean, area, great...
3        [think, scsi, card, transfer, disk, scsi, card...
4        [jasmin, drive, understand, upsat, driver, mod...
                               ...                        
18841    [nyeda, cnsvax, uwec, david, neurolog, consult...
18842    [isol, grind, recepticl, usual, unusu, color, ...
18843    [instal, clone, motherboard, tri, mount, coole...
18844    [wouldn, requir, hyper, sphere, space, point, ...
18845    [gari, crum, crum, fcom, utah, phone, pontiac,...
Name: n.Text, Length: 18846, dtype: object

In [111]:
# Build the id <-> token mapping from the tokenised documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Sanity check: print the first 11 (id, token) pairs and stop.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 actual
1 basher
2 beat
3 better
4 bowman
5 confus
6 coupl
7 devil
8 disappoint
9 fan
10 final


In [112]:
# Prune the vocabulary in place. Per gensim's documented filter_extremes semantics:
#   no_below=15   -> drop tokens that occur in fewer than 15 documents
#   no_above=0.5  -> drop tokens that occur in more than 50% of documents
#   keep_n=100000 -> of what remains, keep at most the 100,000 most frequent tokens
# Run this before building the bag-of-words corpus so the corpus uses the pruned vocabulary.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [113]:
# Convert every tokenised document into its bag-of-words form:
# a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [114]:
bow_corpus[10]

[(22, 1),
 (139, 1),
 (154, 1),
 (235, 1),
 (236, 2),
 (237, 1),
 (238, 1),
 (239, 1),
 (240, 1),
 (241, 1),
 (242, 1),
 (243, 1),
 (244, 1),
 (245, 1),
 (246, 1)]

In [115]:
# NOTE: despite the variable's name, this is document 10 of the corpus.
bow_doc_5 = bow_corpus[10]
# Each entry is a (token_id, count) pair; resolve the id back to its token for display.
for token_id, occurrences in bow_doc_5:
    print(f"Word {token_id} (\"{dictionary[token_id]}\") appears {occurrences} time.")

Word 22 ("post") appears 1 time.
Word 139 ("task") appears 1 time.
Word 154 ("modern") appears 1 time.
Word 235 ("anim") appears 1 time.
Word 236 ("blood") appears 2 time.
Word 237 ("cheer") appears 1 time.
Word 238 ("comput") appears 1 time.
Word 239 ("cultur") appears 1 time.
Word 240 ("current") appears 1 time.
Word 241 ("hard") appears 1 time.
Word 242 ("kent") appears 1 time.
Word 243 ("lamb") appears 1 time.
Word 244 ("relat") appears 1 time.
Word 245 ("sacrific") appears 1 time.
Word 246 ("state") appears 1 time.


In [116]:
from pprint import pprint

from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Preview only the first re-weighted document.
for first_doc in corpus_tfidf:
    pprint(first_doc)
    break

[(0, 0.06319792382897706),
 (1, 0.10246036736355967),
 (2, 0.061710242365574125),
 (3, 0.09975851946803338),
 (4, 0.08238279748384038),
 (5, 0.23536072668007965),
 (6, 0.13822244145478094),
 (7, 0.11050171009577564),
 (8, 0.07981703856801667),
 (9, 0.1399209164870263),
 (10, 0.09946920497630495),
 (11, 0.11367452628270869),
 (12, 0.30795547706619975),
 (13, 0.12937088626962795),
 (14, 0.07832611857184822),
 (15, 0.07196040332975713),
 (16, 0.09638254809927262),
 (17, 0.08144546250537524),
 (18, 0.1182095136302501),
 (19, 0.6557099554957154),
 (20, 0.1189337805247751),
 (21, 0.10770117391775481),
 (22, 0.05410409791300126),
 (23, 0.14037696057914037),
 (24, 0.07708178112664386),
 (25, 0.14083219298026672),
 (26, 0.07648309380105577),
 (27, 0.20455305632255547),
 (28, 0.13272528373655387),
 (29, 0.14376226323393773),
 (30, 0.08631374062181993),
 (31, 0.19305217187772214),
 (32, 0.07589903825126124),
 (33, 0.11427203186535144),
 (34, 0.05951642995799403),
 (35, 0.037416902892527956),
 (36

In [117]:
corpus_tfidf[10]

[(22, 0.1248486548127264),
 (139, 0.26729575344838996),
 (154, 0.2406355393653195),
 (235, 0.2327284376014387),
 (236, 0.47545285845912844),
 (237, 0.24568532506601917),
 (238, 0.2207819587483869),
 (239, 0.24250515624765553),
 (240, 0.1664109385271022),
 (241, 0.1661126770309119),
 (242, 0.28316096411917724),
 (243, 0.3517542497864101),
 (244, 0.19684090823190872),
 (245, 0.2972338971797536),
 (246, 0.14365285538012476)]

## Latent Dirichlet Allocation

In [121]:
# Train a 20-topic LDA model on the bag-of-words counts (2 passes, 2 workers).
# random_state pins the model's own RNG, so re-running this cell no longer
# depends on how far NumPy's global generator has advanced since it was seeded.
# NOTE(review): multi-worker training may still show small run-to-run differences.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2018,
)

In [122]:
# Print every topic as (topic id, weighted-term string); each entry is a 2-tuple.
for topic_entry in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(*topic_entry))

Topic: 0 
Words: 0.018*"drive" + 0.012*"chip" + 0.011*"scsi" + 0.011*"disk" + 0.009*"like" + 0.008*"know" + 0.008*"need" + 0.007*"work" + 0.007*"problem" + 0.007*"time"
Topic: 1 
Words: 0.014*"christian" + 0.011*"peopl" + 0.009*"know" + 0.008*"jesu" + 0.008*"right" + 0.007*"think" + 0.007*"believ" + 0.007*"say" + 0.007*"bibl" + 0.006*"word"
Topic: 2 
Words: 0.020*"window" + 0.009*"file" + 0.008*"softwar" + 0.008*"program" + 0.007*"includ" + 0.007*"mail" + 0.007*"version" + 0.007*"server" + 0.006*"avail" + 0.006*"graphic"
Topic: 3 
Words: 0.023*"armenian" + 0.014*"turkish" + 0.009*"turkey" + 0.008*"greek" + 0.007*"turk" + 0.006*"govern" + 0.006*"peopl" + 0.005*"armenia" + 0.005*"genocid" + 0.005*"know"
Topic: 4 
Words: 0.010*"planet" + 0.010*"earth" + 0.007*"myer" + 0.007*"moon" + 0.007*"orbit" + 0.006*"like" + 0.005*"solar" + 0.005*"atmospher" + 0.005*"spacecraft" + 0.005*"surfac"
Topic: 5 
Words: 0.007*"drug" + 0.005*"peopl" + 0.005*"state" + 0.005*"year" + 0.005*"like" + 0.004*"studi

In [123]:
# Train a second 20-topic LDA model, this time on the TF-IDF-weighted corpus
# (2 passes, 4 workers).
# random_state pins the model's own RNG, so re-running this cell no longer
# depends on how far NumPy's global generator has advanced since it was seeded.
# NOTE(review): multi-worker training may still show small run-to-run differences.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=2018,
)

In [124]:
# Print every topic of the TF-IDF model as (topic id, weighted-term string); each entry is a 2-tuple.
for topic_entry in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(*topic_entry))

Topic: 0 Word: 0.005*"encrypt" + 0.003*"chip" + 0.003*"window" + 0.003*"key" + 0.003*"escrow" + 0.003*"know" + 0.003*"need" + 0.003*"file" + 0.003*"thank" + 0.003*"think"
Topic: 1 Word: 0.005*"game" + 0.005*"catcher" + 0.003*"time" + 0.003*"play" + 0.003*"stat" + 0.003*"know" + 0.003*"john" + 0.003*"think" + 0.003*"pitcher" + 0.003*"good"
Topic: 2 Word: 0.006*"thank" + 0.005*"file" + 0.005*"mail" + 0.004*"window" + 0.004*"look" + 0.004*"know" + 0.003*"email" + 0.003*"printer" + 0.003*"post" + 0.003*"repli"
Topic: 3 Word: 0.007*"game" + 0.005*"think" + 0.004*"team" + 0.004*"year" + 0.004*"cap" + 0.004*"peopl" + 0.003*"pen" + 0.003*"isl" + 0.003*"wing" + 0.003*"know"
Topic: 4 Word: 0.008*"armenian" + 0.004*"turkish" + 0.003*"turkey" + 0.003*"turk" + 0.003*"like" + 0.003*"card" + 0.003*"greek" + 0.002*"know" + 0.002*"muslim" + 0.002*"good"
Topic: 5 Word: 0.005*"batf" + 0.004*"warrant" + 0.004*"govern" + 0.003*"koresh" + 0.003*"time" + 0.003*"say" + 0.003*"like" + 0.003*"peopl" + 0.002*"ri

## Topics Extracted