# Topic Modeling Using LDA and NMF
**Data:** Quora.com questions without labels, >400,000 observations
> Suggested 20 categories

In [1]:
import pandas as pd

In [22]:
df = pd.read_csv('quora_questions.csv')
display(df.head())

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


## Text pre-processing
* Clean data, create TF-IDF Vectorization

In [3]:
raw_leng = len(df)
display(df.isnull().sum())
df.dropna(inplace=True)
display(df.isnull().sum())

blanks = []
for i,q in df.itertuples():
    if type(q) == str:
        if q.isspace():
            blanks.append(i)
df.drop(blanks,inplace=True)

clean_leng = len(df)

print("Removed {} empty rows from the data; now contains {} rows".format(raw_leng - clean_leng, clean_leng))

Question    0
dtype: int64

Question    0
dtype: int64

Removed 0 empty rows from the data; now contains 404289 rows


In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer(max_df=0.9,min_df=10,stop_words='english')
tfidf = TfidfVectorizer(max_df=0.9,min_df=10,stop_words='english')

dtm_cv = cv.fit_transform(df['Question'])
dtm_tf = tfidf.fit_transform(df['Question'])

## Model with Non-Negative Matrix Factorization
* import, define, fit

In [14]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

nmf_model = NMF(n_components=20, random_state=42)
nmf_model.fit(dtm_tf)

LDA_model = LatentDirichletAllocation(n_components=20, random_state=42)
LDA_model.fit(dtm_cv)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

## Evaluate common terms in each suggested topic

In [15]:
for i,t in enumerate(nmf_model.components_):
    print(f'Top 15 words for topic # {i}')
    print([tfidf.get_feature_names()[i] for i in t.argsort()[-15:]][::-1])
    print()

Top 15 words for topic # 0
['best', 'movies', 'book', 'books', '2016', 'ways', 'movie', 'laptop', 'buy', 'phone', 'places', 'visit', 'place', 'read', 'site']

Top 15 words for topic # 1
['does', 'mean', 'work', 'feel', 'long', 'cost', 'compare', 'really', 'exist', 'use', 'differ', 'looking', 'recruit', 'sex', 'grads']

Top 15 words for topic # 2
['quora', 'questions', 'question', 'ask', 'answer', 'answers', 'google', 'asked', 'delete', 'improvement', 'easily', 'post', 'needing', 'answered', 'add']

Top 15 words for topic # 3
['money', 'make', 'online', 'earn', 'ways', 'youtube', 'easy', 'home', 'com', 'free', 'internet', 'black', 'friends', 'website', 'using']

Top 15 words for topic # 4
['life', 'purpose', 'meaning', 'thing', 'important', 'real', 'moment', 'live', 'want', 'change', 'changed', 'death', 'day', 'earth', 'work']

Top 15 words for topic # 5
['india', 'pakistan', 'war', 'spotify', 'job', 'available', 'country', 'olympics', 'china', 'engineering', 'minister', 'company', 'res

In [16]:
for i,t in enumerate(LDA_model.components_):
    print(f'Top 15 words for topic # {i}')
    print([tfidf.get_feature_names()[i] for i in t.argsort()[-15:]][::-1])
    print()

Top 15 words for topic # 0
['does', 'know', 'start', 'feel', 'really', 'like', 'don', 'want', 'rid', 'mind', 'exist', 'friends', 'people', 'need', 'build']

Top 15 words for topic # 1
['new', 'does', 'computer', 'science', 'interesting', 'worth', 'music', 'looking', 'year', '2017', 'biggest', 'house', 'city', 'facts', 'apple']

Top 15 words for topic # 2
['love', 'girl', 'read', 'books', 'guy', 'favorite', 'friend', 'tell', 'girlfriend', 'best', 'class', 'know', 'time', 'history', 'man']

Top 15 words for topic # 3
['improve', 'country', 'change', 'car', 'password', 'english', 'email', 'skills', 'effects', 'gmail', 'writing', 'air', 'countries', 'does', 'legal']

Top 15 words for topic # 4
['did', 'world', 'day', 'things', 'war', 'going', 'happen', 'places', 'visit', 'know', 'pakistan', 'happened', 'employees', 'new', 'end']

Top 15 words for topic # 5
['does', 'way', 'best', 'mean', 'learn', 'long', 'sex', 'lose', 'work', 'ways', 'safe', 'weight', 'police', 'hotel', 'relationship']

T

## Indicate topic in dataframe

In [23]:
nmf_topic_results = nmf_model.transform(dtm_tf)
LDA_topic_results = LDA_model.transform(dtm_tf)

df['NMF_Topic'] = nmf_topic_results.argmax(axis=1)
df['LDA_Topic'] = LDA_topic_results.argmax(axis=1)
df.head()

Unnamed: 0,Question,NMF_Topic,LDA_Topic
0,What is the step by step guide to invest in sh...,5,2
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,17,19
2,How can I increase the speed of my internet co...,18,9
3,Why am I mentally very lonely? How can I solve...,13,0
4,"Which one dissolve in water quikly sugar, salt...",1,0


In [26]:
df['NMF_Topic'].value_counts()

19    43485
0     34101
11    33025
1     29967
12    26315
5     24029
18    22262
8     20395
16    19562
13    17420
9     16968
3     16646
2     15322
6     14542
10    13315
15    13108
17    12226
4     11448
7     10301
14     9852
Name: NMF_Topic, dtype: int64