# Topic modeling with latent Dirichlet allocation

**Potential Applications:** classify articles into topics

**Evaluation**: This implementation of LDA using the full text of articles is not good at topic classification.

In [1]:
import gensim
import pandas as pd
import json

In [2]:
import pandas as pd
import json

articles = pd.read_pickle('ArticleMetadata.pkl')
articles.DatePublished = pd.to_datetime(articles.DatePublished)
articles.Tags = articles.Tags.map(lambda x: str(x))
articles.TagArray = articles.Tags.map(lambda x: x.split(','))
articles.TagArray[0]
articles.head(1)

Unnamed: 0_level_0,Url,Title,Tags,Topic,DatePublished,Abstract,FullText
ArticleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12897,/articles/amc-s-halt-and-catch-fire-is-capital...,"AMC’s ""Halt and Catch Fire"" Is Capitalism's Fi...","Capitalism,Competition,Property Rights,Entrepr...",Economics,2015-09-02 10:56:24,"""The show is a vibrant look at the early PC in...","""AMC's Halt and Catch Fire is a brilliant achi..."


In [3]:
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def StripHtml(html):
    return strip_tags(html)

print(StripHtml('<b>hello</b>'))
print(type(StripHtml('<b>hello</b>')))

hello
<class 'str'>


In [4]:
# prepare new field for LDA:    

articles["RawText"] = articles.FullText.map(lambda x: StripHtml(x))
articles["RawText"] = articles.RawText + ' ' +  articles.Title
# + ' ' + articles.Tags + ' ' + articles.Abstract
articles["RawText"].head()

ArticleId
12897    "AMC's Halt and Catch Fire is a brilliant achi...
58871    "Bush, Kennedy, Romney, Clinton, and, yes, eve...
58872    "How much government spending is enough, and h...
58873    "Progressive politicians have found a ripe old...
58874    "On Saturday night, millions of rich people pl...
Name: RawText, dtype: object

In [5]:
# from stemming.porter2 import stem

# def stemArticle(text):
#     print(text)
#     text = " ".join([stem(word) for word in text.split(" ")])
#     return text
   
# # articles.head(1).RawText.map(stemArticle)
# # articles.head(5).RawText.map(lambda x: stemArticle(x))
# articles.RawText = articles.RawText.map(lambda x: stemArticle(x))

# articles.RawText.head(5)

In [6]:
articles["RawText"].head()

ArticleId
12897    "AMC's Halt and Catch Fire is a brilliant achi...
58871    "Bush, Kennedy, Romney, Clinton, and, yes, eve...
58872    "How much government spending is enough, and h...
58873    "Progressive politicians have found a ripe old...
58874    "On Saturday night, millions of rich people pl...
Name: RawText, dtype: object

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(binary=False,stop_words='english',min_df=3)
docs = cv.fit_transform(articles.RawText.dropna())
# Build a mapping of numerical ID to word
id2word = dict(enumerate(cv.get_feature_names()))

In [8]:
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
# First we convert our word-matrix into gensim's format
corpus = Sparse2Corpus(docs, documents_columns = False)
# Then we fit an LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=10)

In [12]:
num_topics= 10
num_words_per_topic= 10

for ti, topic in enumerate(lda_model.show_topics(num_topics,num_words_per_topic)):
    print("Topic:	%d" %	(ti))
    print(topic)
    print()

Topic:	0
(0, '0.001*government + 0.000*tax + 0.000*people + 0.000*moore + 0.000*tubman + 0.000*soto + 0.000*education + 0.000*trade + 0.000*new + 0.000*economic')

Topic:	1
(1, '0.001*government + 0.001*venezuela + 0.001*market + 0.001*free + 0.001*state + 0.001*maduro + 0.001*erhard + 0.001*economic + 0.001*jury + 0.001*amazon')

Topic:	2
(2, '0.002*government + 0.002*people + 0.002*market + 0.001*economic + 0.001*world + 0.001*free + 0.001*state + 0.001*new + 0.001*money + 0.001*percent')

Topic:	3
(3, '0.001*government + 0.001*people + 0.001*pencil + 0.001*social + 0.001*economic + 0.001*scalia + 0.001*market + 0.001*venezuela + 0.001*homeschooling + 0.001*tax')

Topic:	4
(4, '0.002*people + 0.002*government + 0.001*free + 0.001*trade + 0.001*state + 0.001*like + 0.001*world + 0.001*don + 0.001*market + 0.001*uber')

Topic:	5
(5, '0.002*government + 0.001*market + 0.001*people + 0.001*free + 0.001*state + 0.001*google + 0.001*economic + 0.001*new + 0.001*housing + 0.001*public')

To