In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the news-category dataset. The file is JSON Lines (one record per
# line), hence lines=True.
df = pd.read_json(
    'News_Category_Dataset_v3.json',
    lines=True,
)

In [3]:
# Preview the first five records to check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(209527, 6)

In [5]:
# Combine headline and short description into one text field per article;
# this is the document text fed to the vectorizer.
# A single vectorized str.cat replaces the row-by-row df.apply(..., axis=1):
# it avoids one Python-level lambda call per row and also behaves on an
# empty frame. na_rep='' makes a missing headline/description count as an
# empty string instead of raising a TypeError on concatenation.
df['BigString'] = df['headline'].str.cat(df['short_description'], sep=" ", na_rep='')

In [6]:
# Spot-check the combined text next to the article link.
df.loc[:, ['link', 'BigString']].head()

Unnamed: 0,link,BigString
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...


## Topic modeling with Latent Dirichlet Allocation (LDA)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Build a bag-of-words count matrix from the combined headline/description text.
# Vocabulary pruning (per the scikit-learn CountVectorizer documentation):
#   - min_df=0.011 drops terms found in fewer than 1.1% of documents,
#   - max_df=0.95 drops terms found in more than 95% of documents,
#   - max_features=100 then keeps only the 100 most frequent remaining terms.
# NOTE(review): the vocabulary shown in the next cell's output contains
# contraction fragments such as 'don', 'doesn', 'll' and 've'; consider
# extending the stop-word list if these pollute the topics.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    max_df=0.95,
    min_df=0.011,
    max_features=100,
)
doc_matrix = cv.fit_transform(df['BigString'])

In [11]:
# Inspect the vocabulary the vectorizer kept after pruning.
cv.get_feature_names_out()

array(['10', 'america', 'american', 'best', 'better', 'big', 'black',
       'care', 'change', 'child', 'children', 'city', 'clinton', 'come',
       'country', 'day', 'days', 'did', 'does', 'doesn', 'don', 'donald',
       'family', 'feel', 'food', 'getting', 'going', 'good', 'gop', 'got',
       'great', 'health', 'help', 'high', 'home', 'house', 'huffpost',
       'just', 'kids', 'know', 'let', 'life', 'like', 'little', 'live',
       'll', 'long', 'look', 'love', 'make', 'making', 'man', 'men',
       'need', 'new', 'news', 'night', 'obama', 'old', 'parents',
       'people', 'photo', 'photos', 'police', 'president', 'real',
       'really', 'right', 'said', 'say', 'says', 'school', 'star',
       'state', 'study', 'style', 'thing', 'things', 'think', 'time',
       'today', 'trump', 'twitter', 've', 'video', 'want', 'watch', 'way',
       'ways', 'wedding', 'week', 'white', 'woman', 'women', 'won',
       'work', 'world', 'year', 'years', 'york'], dtype=object)

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# Five-topic LDA model; random_state is pinned so repeated fits are reproducible.
lda_model = LatentDirichletAllocation(
    random_state=12345,
    n_components=5,
)

In [14]:
# Fit the topic model on the document-term count matrix.
lda_model.fit(doc_matrix)

In [15]:
# Vocabulary indices of the ten largest weights in the first topic.
# argsort is ascending, so the heaviest term comes last.
np.argsort(lda_model.components_[0])[-10:]

array([11, 30,  8,  6,  2, 91,  1, 51, 35, 54])

In [16]:
# Keep the vocabulary array so term indices from the model can be mapped back to words.
feature_names = cv.get_feature_names_out()

In [17]:
# Show the stored vocabulary (same array as the earlier get_feature_names_out() output).
feature_names

array(['10', 'america', 'american', 'best', 'better', 'big', 'black',
       'care', 'change', 'child', 'children', 'city', 'clinton', 'come',
       'country', 'day', 'days', 'did', 'does', 'doesn', 'don', 'donald',
       'family', 'feel', 'food', 'getting', 'going', 'good', 'gop', 'got',
       'great', 'health', 'help', 'high', 'home', 'house', 'huffpost',
       'just', 'kids', 'know', 'let', 'life', 'like', 'little', 'live',
       'll', 'long', 'look', 'love', 'make', 'making', 'man', 'men',
       'need', 'new', 'news', 'night', 'obama', 'old', 'parents',
       'people', 'photo', 'photos', 'police', 'president', 'real',
       'really', 'right', 'said', 'say', 'says', 'school', 'star',
       'state', 'study', 'style', 'thing', 'things', 'think', 'time',
       'today', 'trump', 'twitter', 've', 'video', 'want', 'watch', 'way',
       'ways', 'wedding', 'week', 'white', 'woman', 'women', 'won',
       'work', 'world', 'year', 'years', 'york'], dtype=object)

In [18]:
# Print the ten highest-weighted vocabulary terms for every fitted topic.
# Iterating over components_ directly keeps this cell in sync with whatever
# n_components the model was built with, instead of a hard-coded 5.
for topic_number, term_weights in enumerate(lda_model.components_, start=1):
    # argsort is ascending, so the last ten indices are the heaviest terms,
    # listed from lowest to highest weight (the strongest term is printed last).
    top_term_ids = term_weights.argsort()[-10:]
    print(f"Topic {topic_number}")
    # feature_names is a NumPy array, so it can be indexed with the id array directly.
    print(" ".join(feature_names[top_term_ids]))
    print("\n")

Topic 1
city great change black american white america man house new


Topic 2
study style wedding way help good ve week day photos


Topic 3
right life things need best don know world like people


Topic 4
care big school obama children make kids says health just


Topic 5
say old time president years love donald women year trump




In [21]:
# List the distinct category labels present in the dataset.
df['category'].unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [22]:
# Article count per category, most frequent first.
# NOTE(review): several labels look like variants of one another (e.g.
# 'ARTS' / 'ARTS & CULTURE' / 'CULTURE & ARTS', 'WORLDPOST' / 'THE WORLDPOST',
# 'PARENTING' / 'PARENTS'); confirm whether they should be merged before
# comparing topics against categories.
df['category'].value_counts()

POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATION       