### Topic Modeling

#### Reading in the quora_questions.csv file.

In [3]:
import pandas
# Load the Quora questions CSV (the path is relative to the working directory).
df = pandas.read_csv("quora_questions.csv")
# Show (rows, columns) as the cell output to confirm the load.
df.shape

### Preprocessing

#### Using TF-IDF vectorization to create a document-term matrix

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Ignore terms that occur in more than 90% of the questions or in fewer than
# 2 questions, and drop scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.90,
)

In [7]:
# Fit the vectorizer on the "Question" column and build the TF-IDF document-term matrix.
dtf = tfidf.fit_transform(df["Question"])

### Non-negative Matrix Factorization

#### Using Scikit-Learn to create an instance of NMF

In [8]:
from sklearn.decomposition import NMF

In [9]:
# Factorize into 7 topics; random_state is pinned so reruns give the same result.
nmf = NMF(random_state=42, n_components=7)

#### Fitting the model and printing out the top 15 words for each topic

In [30]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf.fit(dtf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [43]:
# Peek at the first 10 term weights of topic 0, then the matrix dimensions
# (number of questions, vocabulary size).
print(nmf.components_[0, :10])
print(dtf.shape)

[1.03319544e-04 5.08796629e-02 4.64506947e-05 1.91558679e-03
 9.92848423e-06 1.11187957e-04 9.76432756e-07 1.90848356e-05
 0.00000000e+00 0.00000000e+00]
(404289, 38669)


In [33]:
# Resolve the vocabulary once, outside the loop: rebuilding it for every word
# lookup is wasteful, and get_feature_names() was removed in scikit-learn 1.2
# in favour of get_feature_names_out(). Fall back for older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# For each topic (one row of components_), print its 15 highest-weighted
# words, strongest first.
for topic in nmf.components_:
    top_word_ids = topic.argsort()[-15:][::-1]
    print([feature_names[i] for i in top_word_ids])
    print("\n\n")

['best', 'way', 'movies', 'book', 'books', 'weight', '2016', 'ways', 'movie', 'time', 'laptop', 'buy', 'lose', 'india', 'phone']



['does', 'mean', 'like', 'feel', 'work', 'time', 'sex', 'long', 'love', 'girl', 'really', 'cost', 'look', 'compare', 'new']



['quora', 'people', 'questions', 'question', 'ask', 'answers', 'answer', 'google', 'asked', 'easily', 'delete', 'improvement', 'use', 'answered', 'post']



['money', 'make', 'online', 'earn', 'way', 'ways', 'youtube', 'black', '500', '1000', 'notes', 'easy', 'home', 'rupee', 'easiest']



['life', 'purpose', 'meaning', 'know', 'thing', 'important', 'day', 'things', 'love', 'real', 'want', 'like', 'employees', 'live', 'moment']



['india', 'trump', 'donald', 'president', 'clinton', 'hillary', 'did', 'think', 'win', '500', 'notes', 'people', '1000', 'war', 'election']



['learn', 'english', 'language', 'programming', 'improve', 'good', 'way', 'start', 'skills', 'learning', 'speak', 'java', 'writing', 'languages', 'speaking']





#### Adding a new column to the original quora dataframe that assigns each question to one of the 7 topic categories

In [50]:
# Per-question topic weights: one row per question, one column per NMF component.
topic_results = nmf.transform(dtf)
# Label each question with the index of its highest-weighted topic.
df["topic"] = topic_results.argmax(axis=1)

In [48]:
# Spot check: index of the highest-weighted topic for the first question.
topic_results[0].argmax()

5

In [51]:
# Display the questions together with their assigned topic label.
df

Unnamed: 0,Question,topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,1
4,"Which one dissolve in water quikly sugar, salt...",1
...,...,...
404284,How many keywords are there in the Racket prog...,6
404285,Do you believe there is life after death?,4
404286,What is one coin?,5
404287,What is the approx annual cost of living while...,5
