In [1]:
# Import Package
import pandas as pd

In [2]:
# Read the quora questions file into a DataFrame.
# NOTE(review): the file name says Quora questions, but the frame is called
# `npr` and the recorded output below shows an 'Article' column holding news
# text. Later cells read npr['Article'], so confirm that this CSV really
# provides that column before re-running.
npr = pd.read_csv('quora_questions.csv')

# Preview the first rows to check the columns that were loaded.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# Build the document-term matrix from raw token counts.
# Note: this is count vectorization (CountVectorizer), not TF-IDF.

from sklearn.feature_extraction.text import CountVectorizer

# Vocabulary pruning:
#   stop_words='english' -> drop common English stop words
#   min_df=2             -> drop terms seen in fewer than 2 documents
#   max_df=0.95          -> drop terms seen in more than 95% of documents
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and count the terms of every article in one pass.
dtm = cv.fit_transform(npr['Article'])

dtm # Document term matrix (sparse; documents x vocabulary terms)

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [4]:
### LDA: fit a Latent Dirichlet Allocation topic model on the count matrix

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(
    random_state=42,  # fixed seed so repeated runs give the same topics
    n_components=7,   # number of topics to extract
)

# Fit on the document-term matrix; the fitted estimator is the cell's output.
LDA.fit(dtm)

LatentDirichletAllocation(n_components=7, random_state=42)

In [5]:
# Store words for topics: fetch the fitted vocabulary once and print a random
# sample of it as a sanity check.

import random

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# The result is materialised as a list once, rather than being rebuilt on
# every loop iteration as before.
feature_names = list(
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# Derive the vocabulary size from the data instead of hard-coding 54776, so
# the cell keeps working if the corpus or the vectorizer settings change.
vocab_size = len(feature_names)

# Print 20 randomly chosen vocabulary entries (the original printed 2 x 10).
for _ in range(20):
    random_word_id = random.randrange(vocab_size)
    print(feature_names[random_word_id])



odd
onset
sartorial
conservationists
infrequent
whey
1784
hertfordshire
notables
respirators
knitting
toed
trafficked
gaithersburg
tau
fuming
st
minimizing
darkroom
procuring


In [6]:
# Top words per topic
# Exploratory look at the fitted model; only the last expression in this cell
# is displayed as output.

# Number of rows in components_ (presumably one row per topic, since the
# model was built with n_components=7 — confirm).
len(LDA.components_)

# The raw components array of the fitted model.
LDA.components_

# Length of the first row (presumably one weight per vocabulary term — confirm).
len(LDA.components_[0])

# Keep the first row to inspect a single topic in the following cells.
single_topic = LDA.components_[0]

# Indices that would sort the row; the following cells treat the last
# positions as the highest-weighted words.
single_topic.argsort() # Return indices 

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [7]:
# Top 10 words for this topic

# The last 10 positions of the argsort result. This bare expression is not
# displayed, because the cell ends with an assignment.
single_topic.argsort()[-10:] # Return indices

# Keep those 10 indices so the next cell can look the words up.
top_word_indices = single_topic.argsort()[-10:]

In [8]:
# Print top words for the selected topic, one per line, in the order given by
# top_word_indices.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# The vocabulary is fetched once here instead of once per loop iteration.
feature_names = list(
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

for index in top_word_indices:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


In [9]:
# View the top 15 words for every topic of the fitted LDA model

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
# The vocabulary is fetched once here; the original rebuilt the full list for
# every single word it printed.
feature_names = list(
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # The last 15 argsort positions are the 15 largest weights in this row.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [10]:
# Linking the topics and documents.
# Exploratory cell; only the last expression is displayed as output.

# The document-term matrix built earlier.
dtm

# Apply the fitted LDA model to the matrix; the result is indexed per
# document below (presumably one topic weight per topic in each row — confirm).
topic_results = LDA.transform(dtm)

# Topic weights of the first document.
topic_results[0]

# The same weights rounded to two decimals for readability.
topic_results[0].round(2)

# Index of the largest weight, i.e. the dominant topic of the first document.
topic_results[0].argmax() # The first document belongs to topic 1.

1

In [11]:
# Topic retrieval for original data
# Exploratory cell; only the last expression is displayed as output.

# Reminder of what the original frame looks like.
npr.head()

# For every row of topic_results, the column index of its largest value.
topic_results.argmax(axis=1)

# Store that index as a new 'Topic' column. This modifies npr in place, so
# earlier previews of npr no longer show the frame's current columns.
npr['Topic'] = topic_results.argmax(axis=1)

# Show the first 10 rows together with their assigned topic.
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2


### Delivery 4: Natural Language Processing - Topic Modelling
Quora Questions Answers - Topic Modelling <br>
Perform Topic Modelling for Quora Questions Answers using NLTK and other required Python packages and provide the following information for Quora:

- How many questions are asked?
- What is the dimension of document term matrix(DTM)?
- How many topics are there?
- What are the 10 most common words for each topic?
- Map each question to the right topic.
- Which topic are people most interested in?
- Which topic are people least interested in?