In [35]:
"""
This project clusters Quora questions into topics using topic modeling (TF-IDF features + NMF).
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [36]:
# Read the raw question dump from the CSV sitting next to the notebook.
quora = pd.read_csv(filepath_or_buffer='quora_questions.csv')

In [37]:
# Preview the first five rows to sanity-check the load.
quora.head(n=5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [38]:
# Preprocess the data: remove rows with missing values, then remove rows whose
# question text is empty or whitespace-only.
quora.dropna(inplace=True)

# The previous loop iterated over `quora.itertuples()`, whose items are row
# namedtuples (never `str`), so no blank question was ever detected. Test the
# 'Question' column directly instead. `astype(str)` keeps the check safe if a
# non-string value slipped into the column.
is_blank = quora['Question'].astype(str).str.strip() == ''

# `blanks` holds the index labels of the rows being removed.
blanks = quora.index[is_blank].tolist()
quora.drop(index=blanks, inplace=True)

In [39]:
# TF-IDF features for the question text.
# max_df=0.9 ignores terms present in more than 90% of the questions,
# min_df=2 ignores terms present in fewer than two questions, and the
# built-in English stop-word list is filtered out.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)
train_data = tfidf.fit_transform(raw_documents=quora['Question'])

In [40]:
# Factorize the TF-IDF matrix into 5 topics; the fixed random_state keeps the
# factorization reproducible between runs.
nmf_model = NMF(
    random_state=101,
    n_components=5,
)
# Left as the final expression so the fitted estimator is shown as cell output.
nmf_model.fit(X=train_data)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=5, random_state=101, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [41]:
# Print the 15 highest-weighted vocabulary terms of every topic.

# Resolve the vocabulary once, outside the loop: the previous version called
# `tfidf.get_feature_names()` again for every topic, rebuilding the full term
# list each time. Prefer `get_feature_names_out()` (the replacement API in
# newer scikit-learn) and fall back to the legacy method on old versions.
try:
    feature_names = tfidf.get_feature_names_out()
except AttributeError:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f"Topic number {index} -> ")
    # argsort is ascending, so the last 15 positions are the heaviest terms,
    # listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

Topic number 0 -> 
['learning', 'time', 'weight', '2016', 'ways', 'english', 'language', 'movies', 'programming', 'book', 'books', 'india', 'learn', 'way', 'best']


Topic number 1 -> 
['girl', 'good', 'really', 'love', 'donald', 'long', 'time', 'sex', 'trump', 'india', 'work', 'feel', 'like', 'mean', 'does']


Topic number 2 -> 
['answered', 'use', 'improvement', 'think', 'delete', 'easily', 'asked', 'google', 'answer', 'answers', 'ask', 'question', 'questions', 'people', 'quora']


Topic number 3 -> 
['easy', 'rs', 'rupee', 'youtube', 'india', 'black', 'ways', 'way', 'notes', '1000', '500', 'earn', 'online', 'make', 'money']


Topic number 4 -> 
['people', 'did', 'employees', 'want', 'real', 'love', 'day', 'good', 'things', 'important', 'thing', 'meaning', 'know', 'purpose', 'life']




In [44]:
# Attach the dominant topic of each question to the DataFrame.
# `transform` gives one row of topic weights per question; the column index of
# the largest weight in each row is stored as that question's topic label.
# (A stray `topics.argmax(axis=1)` expression whose result was discarded has
# been removed.)
topics = nmf_model.transform(train_data)
quora['Topic'] = topics.argmax(axis=1)

In [45]:
# Show the first ten questions together with their assigned topic.
quora.head(n=10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,4
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,1
4,"Which one dissolve in water quikly sugar, salt...",1
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,4
8,When do you use シ instead of し?,2
9,Motorola (company): Can I hack my Charter Moto...,0
