<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Topic-Modeling-Sample-Projects/blob/main/NLP_Scikit_Learn_Topic_Modeling_NMF_Quora_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Scikit Learn - Topic Modeling - NMF - Quora Data**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import classification_report, confusion_matrix

## 1. Loading Data

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset
# stored there can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the Quora questions CSV from the mounted Drive folder and preview
# the first rows.
quora_csv_path = '/content/drive/MyDrive/Colab Datasets/quora_questions.csv'
df = pd.read_csv(quora_csv_path)
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


## 2. Data Overview

In [None]:
# (rows, columns) of the loaded frame.
df.shape

(404289, 1)

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  404289 non-null  object
dtypes: object(1)
memory usage: 3.1+ MB


In [None]:
# Summary statistics, flipped so each column of the frame becomes a row.
df.describe().T

Unnamed: 0,count,unique,top,freq
Question,404289,290456,How do I improve my English speaking?,50


## 3. Handling Missing Values

In [None]:
# Count missing entries per column.
df.isna().sum()

Question    0
dtype: int64

## 4. Feature Extraction

In [None]:
# TF-IDF vectorizer used to turn the questions into a document-term matrix.
tfidf = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that occur in fewer than 2 questions
    max_df=0.95,           # ignore terms that occur in more than 95% of questions
)

In [None]:
# Learn the vocabulary and IDF weights from the questions and build the
# sparse document-term matrix in a single pass.
dtm = tfidf.fit_transform(df['Question'])

In [None]:
# Show the sparse matrix summary (shape, dtype, stored elements) rather than
# its contents.
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

## 5. Define, Fit Model

In [None]:
# Factorize into 20 components (topics); random_state is fixed so reruns
# give the same result.
nmf_model = NMF(n_components=20, random_state=42)

In [None]:
# Fit the factorization on the TF-IDF matrix; the learned per-topic term
# weights are then available as nmf_model.components_.
nmf_model.fit(dtm)



## 6. Predict

In [None]:
# Print the highest-weighted terms of every NMF topic.
# components_ holds non-negative term weights (not probabilities), one row
# per topic.
n_top_words = 15

# Hoisted out of the loop: each call to get_feature_names_out() rebuilds the
# whole vocabulary array, which the original did once per printed word.
feature_names = tfidf.get_feature_names_out()

for i, topic in enumerate(nmf_model.components_):
  print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{i}')
  # argsort is ascending, so the last n_top_words indices carry the largest
  # weights; the words are therefore listed from weakest to strongest.
  print([feature_names[index] for index in topic.argsort()[-n_top_words:]])
  print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 

In [None]:
# Per-question topic weights: one row per question, one column per topic.
topic_results = nmf_model.transform(dtm)

In [None]:
# Topic weights of the first question, rounded to two decimals for readability.
np.round(topic_results[0], 2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [None]:
# Label each question with the index of its highest-weight topic.
df['Topic'] = topic_results.argmax(axis=1)

In [None]:
# Spot-check the assigned topic labels on the first rows.
df.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
