In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Quora questions dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where D:/Data/NLP/TextFiles exists — consider a configurable data dir.
df = pd.read_csv(r'D:/Data/NLP/TextFiles/quora_questions.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


----

# Preprocessing

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer configuration:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms that occur in fewer than 2 questions
#   max_df=0.95          -> ignore terms that occur in more than 95% of questions
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [6]:
# Learn the vocabulary and IDF weights from the 'Question' column and build
# the document-term matrix (one row per question, one column per kept term).
dtm = tfidf.fit_transform(df['Question'])

In [7]:
# Show the matrix summary (shape, dtype, number of stored elements).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

----

# Non-negative Matrix Factorization

In [8]:
from sklearn.decomposition import NMF

In [9]:
# Factorize into 20 topics; the fixed seed keeps the initialization, and
# therefore the discovered topics, reproducible between runs.
nmf = NMF(
    random_state=42,
    n_components=20,
)

In [10]:
# Fit the factorization on the TF-IDF matrix; afterwards nmf.components_
# holds one row of per-term weights for each topic.
nmf.fit(dtm)



NMF(n_components=20, random_state=42)

----

# Top 20 words in each topic

In [18]:
# Number of highest-weighted terms to list for each topic. Used for both the
# slice and the printed header so the two can never disagree.
N_TOP_WORDS = 20

# Fetch the vocabulary once instead of re-calling get_feature_names_out()
# for every single word inside the loops.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf.components_):
    print(f'The TOP {N_TOP_WORDS} WORDS FOR TOPIC:{index}')
    # argsort() is ascending, so the last N_TOP_WORDS indices are the terms
    # with the largest weights, listed from lowest to highest weight.
    words = [feature_names[i] for i in topic.argsort()[-N_TOP_WORDS:]]
    print(words)
    print('\n')

The TOP 15 WORDS FOR TOPIC:0
['app', 'engineering', 'friend', 'website', 'site', 'thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


The TOP 15 WORDS FOR TOPIC:1
['come', 'relationship', 'says', 'universities', 'grads', 'majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


The TOP 15 WORDS FOR TOPIC:2
['users', 'writer', 'marked', 'search', 'use', 'add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


The TOP 15 WORDS FOR TOPIC:3
['com', 'facebook', 'job', 'easiest', 'making', 'using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


The TOP 15 WORDS FOR TOPIC:4
['embarrassing', 'decision', 'biggest', 'work', 'did', 'balance', 'earth', 'day

----

# Attaching discovered topic labels to the original questions

In [14]:
# Project every question onto the learned topics: one row per question,
# one column of weights per topic.
topic_results = nmf.transform(dtm)

In [15]:
# Label each question with the index of its highest-weighted topic
# (argmax taken across the topic columns of each row).
df['Topic'] = np.argmax(topic_results, axis=1)

In [16]:
# Preview the frame again to confirm the new 'Topic' column is present.
df.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
