# Load Data

In [1]:
import pandas as pd 
import numpy as np 
# Read the questions dataset from the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index saved with the CSV — consider index_col=0 if it is not needed.
QUESTIONS_CSV = 'questions.csv'
data = pd.read_csv(QUESTIONS_CSV)

In [2]:
# Preview the first rows of the loaded questions.
data.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Data Preprocessing

In [3]:
# load TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Configure the TF-IDF vectorizer: ignore terms that occur in more than 90% of
# the questions or in fewer than 2 of them, and strip English stop words.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)

In [5]:
# Learn the vocabulary from the question text and encode every question as a
# row of the resulting TF-IDF matrix.
questions = data['Question']
data_fit = tfidf.fit_transform(questions)

In [6]:
# Dimensions of the TF-IDF matrix: (number of questions, vocabulary size).
data_fit.shape

(404289, 38669)

In [7]:
# Inspect the sparse TF-IDF row that encodes the first question.
data_fit[0]

<1x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

# Load Non-Negative Matrix Factorization (NMF)

In [8]:
from sklearn.decomposition import NMF

In [9]:
# Build the NMF model that factors the TF-IDF matrix into 15 topics.
# The seed is pinned so that Restart & Run All reproduces the same topics;
# without it the estimator runs with random_state=None. Outputs recorded in
# this notebook were produced unseeded and may differ slightly after a re-run.
nmf = NMF(n_components=15, random_state=42)

In [10]:
# Fit the topic model on the TF-IDF matrix; fit() returns the estimator itself,
# so the cell output is the model's parameter listing.
nmf.fit(data_fit)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=15, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [11]:
# Largest word weight found in the first topic's component vector.
first_topic_weights = nmf.components_[0]
first_topic_weights.max()

7.738458857973126

#  Print out the top 15 most common words for each of the 15 topics

In [12]:
# Weight vector of the first topic (first row of the fitted components).
topic = nmf.components_[0]
# Vocabulary indices ordered from the smallest topic weight to the largest.
np.argsort(topic)

array([    0, 21787, 21784, ..., 22925, 37515,  4632], dtype=int64)

In [13]:
# Keep the indices of the 15 highest-weighted words: the sort is ascending,
# so they are the last 15 entries.
ascending_order = topic.argsort()
words_index = ascending_order[-15:]

In [14]:
# Print the words behind the selected indices, weakest to strongest.
# The vocabulary list is loop-invariant, so it is fetched once instead of
# being rebuilt by get_feature_names() on every iteration.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); kept as-is to match the version used here.
feature_names = tfidf.get_feature_names()
for i in words_index:
    print(feature_names[i])

place
visit
places
phone
time
ways
buy
laptop
movie
2016
books
book
movies
way
best


In [15]:
# Print the 15 highest-weighted words for every topic the model learned.
# The vocabulary is fetched once up front rather than once per printed word
# (previously get_feature_names() ran for every word of every topic).
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); kept as-is to match the version used here.
feature_names = tfidf.get_feature_names()
for i, topic in enumerate(nmf.components_):
    print(f'The most common words topic {i} are')
    # argsort() is ascending, so the last 15 positions are the strongest words.
    # The inner index has its own name so it no longer shadows the topic
    # counter `i`.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-15:]])
    print('\n')

The most common words topic 0 are
['place', 'visit', 'places', 'phone', 'time', 'ways', 'buy', 'laptop', 'movie', '2016', 'books', 'book', 'movies', 'way', 'best']


The most common words topic 1 are
['recruit', 'differ', 'looking', 'use', 'sex', 'exist', 'time', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


The most common words topic 2 are
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


The most common words topic 3 are
['facebook', 'friends', 'black', 'internet', 'free', 'easiest', 'home', 'easy', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']


The most common words topic 4 are
['earth', 'did', 'death', 'changed', 'day', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


The most common words topic 5 are
['minister', 'company', 'engineering', 'china', 'olympics', 'available', 'business', 'j

In [16]:
# Score every question against every topic, then label each question with the
# single topic that carries the largest weight for it.
results = nmf.transform(data_fit)
data['Topic'] = np.argmax(results, axis=1)

In [17]:
# Topic assigned to the question with index label 0.
# A single .loc lookup replaces the chained data['Topic'][0] indexing.
data.loc[0, 'Topic']

5

In [18]:
# Preview the first rows again, now including the assigned Topic column.
data.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
