# Topic modeling with latent Dirichlet allocation

In [1]:
import gensim
import pandas as pd
import json

In [2]:
import pandas as pd
import json

articles = pd.read_pickle('ArticleMetadata.pkl')
articles.DatePublished = pd.to_datetime(articles.DatePublished)
articles.Tags = articles.Tags.map(lambda x: str(x))
articles.TagArray = articles.Tags.map(lambda x: x.split(','))
articles.TagArray[0]
articles.head(1)

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,2015-09-02 10:56:24,"""The show is a vibrant look at the early PC in...","""AMC's Halt and Catch Fire is a brilliant achi..."


In [29]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def StripHtml(html):
    return strip_tags(html)

print(StripHtml('<b>hello</b>'))
print(type(StripHtml('<b>hello</b>')))

hello
<class 'str'>


In [36]:
# prepare new field for LDA:    

articles["RawText"] = articles.FullText.map(lambda x: StripHtml(x))
articles["RawText"] = articles.RawText + ' ' +  articles.Title + ' ' + articles.Tags + ' ' + articles.Abstract  + ' ' + articles.Topic 
articles["RawText"].head()

ArticleId
12897    "AMC's Halt and Catch Fire is a brilliant achi...
58871    "Bush, Kennedy, Romney, Clinton, and, yes, eve...
58872    "How much government spending is enough, and h...
58873    "Progressive politicians have found a ripe old...
58874    "On Saturday night, millions of rich people pl...
Name: RawText, dtype: object

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(binary=False,stop_words='english',min_df=3)
docs = cv.fit_transform(articles.RawText.dropna())
# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))

In [49]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
# First we convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)
# Then we fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10)

In [54]:
num_topics= 10
num_words_per_topic= 5

for ti, topic in enumerate(lda_model.show_topics(num_topics,num_words_per_topic)):
    print("Topic:	%d" %	(ti))
    print(topic)
    print()

Topic:	0
(6, '0.001*government + 0.001*cleveland + 0.001*ip + 0.001*maduro + 0.001*scotland')

Topic:	1
(0, '0.002*government + 0.001*people + 0.001*tax + 0.001*facebook + 0.001*http')

Topic:	2
(13, '0.001*mellon + 0.001*augustine + 0.001*tubman + 0.001*workweek + 0.000*linkedin')

Topic:	3
(14, '0.001*government + 0.001*people + 0.001*licensing + 0.001*care + 0.001*taxis')

Topic:	4
(10, '0.002*trump + 0.001*government + 0.001*socrates + 0.001*silk + 0.001*people')

Topic:	5
(9, '0.001*pinker + 0.001*postmodernism + 0.000*harry + 0.000*government + 0.000*ensemble')

Topic:	6
(4, '0.001*tsa + 0.001*government + 0.001*ridesharing + 0.001*people + 0.001*wage')

Topic:	7
(8, '0.005*bitcoin + 0.002*refugees + 0.001*government + 0.001*police + 0.001*people')

Topic:	8
(5, '0.002*outsourcing + 0.001*blockchain + 0.001*trek + 0.001*government + 0.001*flint')

Topic:	9
(2, '0.001*walmart + 0.001*government + 0.001*mouse + 0.001*fannie + 0.001*quarantine')

