In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
import warnings
warnings.filterwarnings('ignore')
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df = pd.read_csv('/content/drive/MyDrive/Notebooks/npr.csv')
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
df['Article'].isna().sum()

0

In [5]:
df['Article'][1]

'  Donald Trump has used Twitter  —   his preferred means of communication  —   to weigh in on a swath of foreign policy issues over the past few weeks. His comments give a glimpse into how his incoming administration will deal with pressing foreign matters  —   but also highlight how reactionary comments on social media can immediately spur international concern and attention. And his staff has indicated that taking to Twitter to air his concerns or, often, grievances, won’t end once he enters the Oval Office. On Wednesday, Trump blasted the U. S.’s abstention from the U. N. Security Council vote on Israeli settlements earlier this month. The tweets came just hours before Secretary of State John Kerry gave a speech defending the decision and calling the continued building of settlements on Palestinian territory in the West Bank a threat to the   solution in the region. Trump’s support for Israel and Prime Minister Benjamin Netanyahu  —   who has had a fraught relationship with Preside

In [21]:
import re
import spacy
import string

nlp = spacy.load('en_core_web_sm')

def clean_txt(text):
  text = text.lower()

  # remove any word inside []
  text = re.sub('\[.*?\]', '', text)

  # remove any punctuation
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

  # remove any word has digits
  text = re.sub('\w*\d\w*', '', text)

  # remove any non alphanumeric character
  text = re.sub(r'\W', ' ', text)

  # remove any white space
  text = re.sub(' +', ' ', text)

  # remove any single character
  text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)

  doc = nlp(text)
  text = ' '.join([token.lemma_ for token in doc if not token.is_stop])

  return text

In [22]:
df['Article'][0]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [23]:
text = clean_txt(df['Article'][0])
text

'washington policy bipartisan politic sense year show little sign end dec president obama move sanction russia allege interference s election conclude republicans long call similar severe measure scarcely bring approve house speaker paul ryan call obama measure appropriate overdue prime example administration ineffective foreign policy leave america weak eye world gop leader sound theme urge president obama year strong action deter russia worldwide aggression include operation write rep devin nune chairman house intelligence committee week leave office president suddenly decide strong measure warrant appear cnn frequent obama critic trent frank call tough action say time obama finally find tongue fox news spokesman trump say obama real target russians man poise white house week speak obama try tie trump hand box mean force sanction odd republican want tough moscow trump repeatedly call sanction close tie russia include cooperation fight isis russia battle isis syria behalf country emba

In [24]:
df['Article'] = df['Article'].apply(clean_txt)

In [25]:
df.head()

Unnamed: 0,Article
0,washington policy bipartisan politic sense yea...
1,donald trump twitter preferred mean communic...
2,donald trump unabashedly praise russian pres...
3,update m et russian president vladimir putin s...
4,photography illustration video data visualizat...


In [26]:
df.shape

(11992, 1)

#LDA

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df -> Words appearing in more than 95% of documents are ignored (too frequent).
# min_df -> Words appearing in fewer than 2 documents are ignored (too rare).
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = cv.fit_transform(df['Article'])

In [32]:
len(cv.get_feature_names_out())

41869

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=7, random_state=42)
LDA.fit(dtm)

In [34]:
for i in range(7):
  print(f'TOPIC #{i}')
  print([cv.get_feature_names_out()[index] for index in LDA.components_[i].argsort()[-15:]])
  print('\n')

TOPIC #0
['pay', 'woman', 'work', 'federal', 'program', 'public', 'new', 'education', 'law', 'report', 'year', 'state', 'student', 'school', 'say']


TOPIC #1
['play', 'way', 'album', 'day', 'know', 'work', 'food', 'come', 'time', 'song', 'music', 'new', 'year', 'like', 'say']


TOPIC #2
['campaign', 'state', 'people', 'white', 'law', 'clinton', 'tell', 'court', 'obama', 'report', 'police', 'house', 'president', 'say', 'trump']


TOPIC #3
['work', 'come', 'don', 'feel', 'life', 'want', 'way', 'woman', 'time', 'thing', 'know', 'think', 'say', 'people', 'like']


TOPIC #4
['country', 'come', 'change', 'big', 'china', 'time', 'report', 'like', 'world', 'water', 'new', 'company', 'people', 'year', 'say']


TOPIC #5
['hospital', 'doctor', 'work', 'medical', 'child', 'disease', 'drug', 'percent', 'patient', 'year', 'study', 'care', 'people', 'health', 'say']


TOPIC #6
['election', 'new', 'candidate', 'party', 'voter', 'campaign', 'win', 'year', 'people', 'country', 'vote', 'clinton', 'state

In [36]:
topic_res = LDA.transform(dtm)

In [37]:
topic_res.shape

(11992, 7)

In [39]:
topic_res[:3].round(2)

array([[0.  , 0.  , 0.82, 0.  , 0.  , 0.  , 0.18],
       [0.  , 0.  , 0.7 , 0.  , 0.09, 0.  , 0.21],
       [0.  , 0.  , 0.96, 0.  , 0.  , 0.  , 0.04]])

In [40]:
topic_res.argmax(axis=1)

array([2, 2, 2, ..., 5, 6, 6])

#NMF

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(df['Article'])

In [42]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=7, random_state=42)
nmf.fit(dtm)

In [47]:
len(tfidf.get_feature_names_out())

41869

In [48]:
for i in range(7):
  print(f'TOPIC #{i}')
  print([tfidf.get_feature_names_out()[index] for index in nmf.components_[i].argsort()[-15:]])
  print('\n')

TOPIC #0
['feel', 'year', 'book', 'way', 'work', 'want', 'time', 'life', 'thing', 'know', 'think', 'woman', 'like', 'people', 'say']


TOPIC #1
['business', 'presidential', 'comey', 'election', 'russia', 'administration', 'republican', 'obama', 'white', 'donald', 'say', 'house', 'campaign', 'president', 'trump']


TOPIC #2
['affordable', 'tax', 'percent', 'obamacare', 'cost', 'people', 'drug', 'plan', 'say', 'coverage', 'patient', 'medicaid', 'insurance', 'care', 'health']


TOPIC #3
['primary', 'republican', 'party', 'election', 'delegate', 'hillary', 'democratic', 'candidate', 'win', 'campaign', 'state', 'voter', 'vote', 'sander', 'clinton']


TOPIC #4
['official', 'isis', 'force', 'president', 'department', 'city', 'state', 'law', 'government', 'officer', 'court', 'attack', 'report', 'police', 'say']


TOPIC #5
['university', 'say', 'district', 'devos', 'high', 'state', 'parent', 'program', 'kid', 'child', 'college', 'teacher', 'education', 'school', 'student']


TOPIC #6
['desk', '

In [49]:
topic_res = nmf.transform(dtm)

In [50]:
topic_res.argmax(axis=1)

array([1, 1, 1, ..., 0, 3, 4])