# Mount Google Drive

In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so the data files can be read by path.
drive.mount("/content/Drive")

# Folder prefix for this notebook's data files. The trailing slash is required
# because the CSV path is built from it by plain string concatenation.
base_path = "/content/Drive/MyDrive/NLP-Course/05-Topic-Modeling/"

Drive already mounted at /content/Drive; to attempt to forcibly remount, call drive.mount("/content/Drive", force_remount=True).


# Topic Modeling Assessment Project

In [2]:
import pandas as pd

In [3]:
# Read the Quora questions CSV from the course folder into a DataFrame.
quora = pd.read_csv(f"{base_path}quora_questions.csv")

# Preview the first five rows.
quora.head(5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# Preprocessing

#### Task: Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words are removed; max_df / min_df bound how frequent or how rare
# a term may be across documents before it is dropped from the vocabulary.
tfidf = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
)

# Learn the vocabulary from the question text and build the TF-IDF
# document-term matrix in a single step.
dtm = tfidf.fit_transform(quora['Question'])

# Show the sparse-matrix summary (shape and number of stored elements).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

# Non-negative Matrix Factorization

#### TASK: Using Scikit-Learn, create an instance of NMF with 20 expected components. (Use random_state=42.)

In [5]:
import warnings

# Silence every warning from this point on. This stays ahead of the sklearn
# imports so anything emitted at import time is muted as well.
warnings.simplefilter("ignore")

from sklearn import set_config
from sklearn.decomposition import NMF

# Make estimator reprs list all parameters instead of only the changed ones.
set_config(print_changed_only=False)

# Build the 20-component model with a fixed seed and fit it on the
# document-term matrix; fit() hands back the estimator itself.
nmf_model = NMF(n_components=20, random_state=42).fit(dtm)

# Display the fitted estimator.
nmf_model

NMF(alpha='deprecated', alpha_H='same', alpha_W=0.0, beta_loss='frobenius',
    init='warn', l1_ratio=0.0, max_iter=200, n_components=20, random_state=42,
    regularization='deprecated', shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

#### TASK: Print out the top 15 most common words for each of the 20 topics.

In [6]:
# Print the 15 highest-weighted vocabulary terms for each NMF topic.
#
# The vocabulary is looked up once, before the loop. Previously the vectorizer
# was asked for its feature names inside the comprehension, i.e. once for every
# word printed. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so get_feature_names_out() is preferred and the old method is
# only used as a fallback on installs that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 positions hold the largest
    # weights, listed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORDS FOR TOPIC #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 15 WORDS FOR TOPIC #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORDS FOR TOPIC #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 

#### TASK: Add a new column to the original quora dataframe that labels each question into one of the 20 topic categories.

In [7]:
# Re-display the first rows of the frame before the topic column is added.
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [8]:
# Project the document-term matrix onto the fitted NMF topics. The result is a
# 2-D array of non-negative weights; a later cell takes the per-row argmax and
# stores it as a column of `quora`, so rows line up with questions.
# NOTE(review): columns are presumably one per NMF component (20) — confirm.
topic_results = nmf_model.transform(dtm)
# Display the (truncated) weight matrix.
topic_results

array([[2.75937605e-04, 5.91249293e-05, 6.17687040e-06, ...,
        6.97269969e-04, 2.13527728e-04, 0.00000000e+00],
       [1.96418670e-04, 8.85438224e-05, 0.00000000e+00, ...,
        0.00000000e+00, 5.51088847e-05, 1.05527238e-05],
       [1.78019854e-04, 6.47373072e-04, 1.60510763e-03, ...,
        3.02354836e-03, 1.05908512e-03, 1.23878889e-03],
       ...,
       [0.00000000e+00, 1.62431955e-05, 5.23720795e-06, ...,
        0.00000000e+00, 2.76279348e-06, 0.00000000e+00],
       [5.36236094e-04, 1.01567857e-03, 0.00000000e+00, ...,
        1.28720137e-04, 7.76975481e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.25187210e-04]])

In [9]:
# For each row, the column index holding the largest weight, i.e. the
# dominant topic number for that row.
topic_results.argmax(axis=1)

array([ 5, 16, 17, ..., 11, 11,  9])

In [10]:
# Label every question with the index of its highest-weighted topic.
# This adds (or overwrites) the 'Topic' column on `quora` in place.
quora['Topic'] = topic_results.argmax(axis=1)

# Show the first ten questions together with their assigned topic.
quora.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,10
8,When do you use シ instead of し?,19
9,Motorola (company): Can I hack my Charter Moto...,17
