# Topic modeling with latent Dirichlet allocation

In [1]:
import gensim
import pandas as pd
import json

In [2]:
import pandas as pd
import json

articles = pd.read_pickle('ArticleMetadata.pkl')
articles.DatePublished = pd.to_datetime(articles.DatePublished)
articles.Tags = articles.Tags.map(lambda x: str(x))
articles.TagArray = articles.Tags.map(lambda x: x.split(','))
articles.TagArray[0]
articles.head(1)

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,2015-09-02 10:56:24,"""The show is a vibrant look at the early PC in...","""AMC's Halt and Catch Fire is a brilliant achi..."


In [29]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def StripHtml(html):
    return strip_tags(html)

print(StripHtml('<b>hello</b>'))
print(type(StripHtml('<b>hello</b>')))

hello
<class 'str'>


In [36]:
# prepare new field for LDA:    

articles["RawText"] = articles.FullText.map(lambda x: StripHtml(x))
articles["RawText"] = articles.RawText + ' ' +  articles.Title + ' ' + articles.Tags + ' ' + articles.Abstract  + ' ' + articles.Topic 
articles["RawText"].head()

ArticleId
12897    "AMC's Halt and Catch Fire is a brilliant achi...
58871    "Bush, Kennedy, Romney, Clinton, and, yes, eve...
58872    "How much government spending is enough, and h...
58873    "Progressive politicians have found a ripe old...
58874    "On Saturday night, millions of rich people pl...
Name: RawText, dtype: object

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=False,stop_words='english',min_df=3)
docs = cv.fit_transform(articles.RawText.dropna())
# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))

In [45]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
# First we convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)
# Then we fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=15)

In [47]:
num_topics= 20
num_words_per_topic= 5

for ti, topic in enumerate(lda_model.show_topics(num_topics,num_words_per_topic)):
    print("Topic:	%d" %	(ti))
    print(topic)
    print()

Topic:	0
(0, '0.007*health + 0.007*government + 0.006*care + 0.006*people + 0.004*market')

Topic:	1
(1, '0.008*government + 0.007*money + 0.006*tax + 0.006*market + 0.006*people')

Topic:	2
(2, '0.009*people + 0.007*economic + 0.005*government + 0.005*percent + 0.005*new')

Topic:	3
(3, '0.012*property + 0.010*government + 0.006*rights + 0.005*economic + 0.005*private')

Topic:	4
(4, '0.004*government + 0.004*market + 0.004*law + 0.003*people + 0.003*new')

Topic:	5
(5, '0.007*government + 0.006*market + 0.005*internet + 0.004*free + 0.003*time')

Topic:	6
(6, '0.008*people + 0.007*government + 0.006*state + 0.006*market + 0.005*political')

Topic:	7
(7, '0.006*market + 0.006*people + 0.005*government + 0.004*mises + 0.004*economics')

Topic:	8
(8, '0.009*free + 0.006*people + 0.006*economic + 0.005*trade + 0.005*world')

Topic:	9
(9, '0.007*government + 0.005*state + 0.005*new + 0.004*time + 0.004*war')

Topic:	10
(10, '0.009*government + 0.007*people + 0.006*state + 0.005*percent + 