<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Topic-Modeling-Sample-Projects/blob/main/NLP_Scikit_Learn_Topic_Modeling_LDA_NPR_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLP - Scikit Learn - Topic Modeling - LDA - NPR Data**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import classification_report, confusion_matrix

## 1. Loading Data

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSV
# stored there can be read with a normal file path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the NPR articles CSV from the mounted Drive folder, then preview
# the first rows to confirm the single 'Article' text column loaded.
npr_csv_path = '/content/drive/MyDrive/Colab Datasets/npr.csv'
df = pd.read_csv(npr_csv_path)
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


## 2. Data Overview

In [None]:
# (number of articles, number of columns) in the loaded frame.
df.shape

(11992, 1)

In [None]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [None]:
# Summary statistics, transposed so each column of df becomes one row;
# for the text column this reports count, unique, top and freq.
df.describe().transpose()

Unnamed: 0,count,unique,top,freq
Article,11992,11991,"Washington state has released an estimated 3, ...",2


## 3. Handling Missing Values

In [None]:
# Count missing values per column. The result is 0 for 'Article', so no
# rows need to be dropped or imputed before vectorizing.
df.isnull().sum()

Article    0
dtype: int64

## 4. Feature Extraction

In [None]:
# Bag-of-words vectorizer for the articles:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms that occur in fewer than 2 articles
#   max_df=0.9           -> ignore terms that occur in more than 90% of articles
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)

In [None]:
# Learn the vocabulary from the 'Article' column and build the document-term
# count matrix: one row per article, one column per vocabulary term.
dtm = cv.fit_transform(df['Article'])

In [None]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## 5. Define, Fit Model

In [None]:
# Configure a 7-topic LDA model; the integer random_state fixes the random
# number generator so repeated fits give the same topics.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=7,
)

In [None]:
# Learn the topics from the document-term count matrix.
LDA.fit(dtm)

## 6. Predict, Evaluate

In [None]:
# Print the highest-weighted vocabulary words for each topic.
# LDA.components_ has one row per topic holding a weight for every vocabulary
# term. argsort() is ascending, so the last n_top_words indices are the top
# terms, printed from lowest to highest weight.
n_top_words = 15
# Fetch the vocabulary once instead of rebuilding it for every printed word.
feature_names = cv.get_feature_names_out()
for i, topic in enumerate(LDA.components_):
  top_indices = topic.argsort()[-n_top_words:]
  print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{i}')
  print([feature_names[index] for index in top_indices])
  print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [None]:
# Per-article topic distribution from the fitted model: one row per article,
# one column per topic.
topic_results = LDA.transform(dtm)

In [None]:
# Topic mixture of the first article, rounded to 2 decimals for readability.
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [None]:
# Label every article with its dominant topic: the column index of the
# largest value in that article's row of topic_results.
df['Topic'] = np.argmax(topic_results, axis=1)

In [None]:
# Preview the first articles together with their assigned 'Topic' label.
df.head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
