In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk

In [2]:
# Load the News Category dataset. lines=True tells pandas the file is in
# JSON Lines format (one JSON record per line).
df = pd.read_json("dataset/News_Category_Dataset_v3.json", lines=True)
print(df.head())

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/funniest-tweets...   
3  https://www.huffpost.com/entry/funniest-parent...   
4  https://www.huffpost.com/entry/amy-cooper-lose...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog y

In [3]:
# Drop the columns that are not needed further on. errors="ignore" keeps the
# cell safe to re-run: without it a second execution raises KeyError because
# the columns have already been removed from df.
df = df.drop(columns=["link", "date", "authors"], errors="ignore")

In [4]:
# Show the frame after the column drop (Jupyter renders a truncated head/tail view).
df

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...
...,...,...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...
209523,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr..."
209524,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked..."
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...


In [5]:
# Number of missing values in each column (isnull is pandas' alias of isna).
df.isnull().sum()

headline             0
category             0
short_description    0
dtype: int64

In [6]:
# Quick sanity checks on the trimmed frame.
print(df.loc[0, "headline"])  # headline of the row labelled 0

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would add a stray "None" line.
df.info()

print(df["category"].value_counts())  # rows per category, most frequent first

# Last expression, shown as the cell result: number of non-null category values.
df["category"].count()

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   headline           209527 non-null  object
 1   category           209527 non-null  object
 2   short_description  209527 non-null  object
dtypes: object(3)
memory usage: 4.8+ MB
None
category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS     

209527

In [7]:
# Cap every category at its first 5000 rows, taken in the frame's existing
# order (not a random sample); categories with fewer rows are kept whole.
df_limited = df.groupby(by="category").head(n=5000)


In [8]:
# Rows per category after the per-category cap.
df_limited["category"].value_counts()
# (df_limited["category"].count() would give the total row count instead.)

category
QUEER VOICES      5000
ENTERTAINMENT     5000
BUSINESS          5000
WELLNESS          5000
COMEDY            5000
TRAVEL            5000
FOOD & DRINK      5000
POLITICS          5000
SPORTS            5000
HEALTHY LIVING    5000
PARENTING         5000
STYLE & BEAUTY    5000
BLACK VOICES      4583
HOME & LIVING     4320
PARENTS           3955
THE WORLDPOST     3664
WEDDINGS          3653
WOMEN             3572
CRIME             3562
IMPACT            3484
DIVORCE           3426
WORLD NEWS        3299
MEDIA             2944
WEIRD NEWS        2777
GREEN             2622
WORLDPOST         2579
RELIGION          2577
STYLE             2254
SCIENCE           2206
TECH              2104
TASTE             2096
MONEY             1756
ARTS              1509
ENVIRONMENT       1444
FIFTY             1401
GOOD NEWS         1398
U.S. NEWS         1377
ARTS & CULTURE    1339
COLLEGE           1144
LATINO VOICES     1130
CULTURE & ARTS    1074
EDUCATION         1014
Name: count, dtype: int64

In [9]:
# Categories to drop from the capped frame, one entry per name, sorted.
#
# Gotcha: every entry needs its own trailing comma. Python silently joins
# adjacent string literals, so a forgotten comma turns two names into one
# bogus entry (e.g. "ARTS" "WOMEN" -> "ARTSWOMEN") that matches nothing.
#
# NOTE(review): "SPORTS" is in this list and "EDUCATION" is not, so as written
# the filter keeps only the EDUCATION rows, while the output recorded for this
# notebook shows SPORTS rows. Confirm which category is intended and adjust
# the list accordingly.
categories_to_remove = [
    "ARTS",
    "ARTS & CULTURE",
    "BLACK VOICES",
    "BUSINESS",
    "COLLEGE",
    "COMEDY",
    "CRIME",
    "CULTURE & ARTS",
    "DIVORCE",
    "ENTERTAINMENT",
    "ENVIRONMENT",
    "FIFTY",
    "FOOD & DRINK",
    "GOOD NEWS",
    "GREEN",
    "HEALTHY LIVING",
    "HOME & LIVING",
    "IMPACT",
    "LATINO VOICES",
    "MEDIA",
    "MONEY",
    "PARENTING",
    "PARENTS",
    "POLITICS",
    "QUEER VOICES",
    "RELIGION",
    "SCIENCE",
    "SPORTS",
    "STYLE",
    "STYLE & BEAUTY",
    "TASTE",
    "TECH",
    "THE WORLDPOST",
    "TRAVEL",
    "U.S. NEWS",
    "WEDDINGS",
    "WEIRD NEWS",
    "WELLNESS",
    "WOMEN",
    "WORLD NEWS",
    "WORLDPOST",
]

# Keep only the rows whose category is not listed above. .copy() makes the
# result an independent frame, so it can be modified afterwards without
# pandas raising SettingWithCopyWarning.
df_limited = df_limited[~df_limited["category"].isin(categories_to_remove)].copy()

unique_categories = df_limited["category"].unique()
print(unique_categories)

['SPORTS']


In [10]:
# Load spaCy's small English pipeline once; cleaning_text() runs it on each text.
nlp = spacy.load('en_core_web_sm')

# NLTK Porter stemmer.
# NOTE(review): no active code in the visible part of this notebook uses it --
# confirm whether it is still needed.
stemmer = nltk.PorterStemmer()

def cleaning_text(text):
    """Normalize one piece of text into a space-joined string of lemmas.

    The text is lower-cased, run through the module-level spaCy pipeline
    ``nlp``, and every token flagged as a stop word, punctuation or
    whitespace is discarded. Only lemmatization is applied; no stemming.

    Args:
        text: The raw string to clean.

    Returns:
        The lemmas of the remaining tokens joined by single spaces
        (an empty string when no token survives the filter).
    """
    parsed = nlp(text.lower())
    kept_lemmas = [
        token.lemma_
        for token in parsed
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_lemmas)

In [11]:
# Replace the raw descriptions with their cleaned form. assign() returns a new
# frame instead of writing into df_limited in place: df_limited was produced
# by filtering another frame, and setting a column on such a slice triggers
# pandas' SettingWithCopyWarning.
df_limited = df_limited.assign(
    short_description=df_limited["short_description"].apply(cleaning_text)
)

In [12]:
# Preview the first five rows after cleaning (plain-text rendering).
print(df_limited.head(n=5))

                                             headline category  \
17  Maury Wills, Base-Stealing Shortstop For Dodge...   SPORTS   
26  Las Vegas Aces Win First WNBA Title, Chelsea G...   SPORTS   
61  Boston Marathon To Make Race More Inclusive Fo...   SPORTS   
62  Anthony Varvaro, MLB Pitcher Turned Transit Co...   SPORTS   
67  Carlos Alcaraz Wins U.S. Open For 1st Slam Tit...   SPORTS   

                                    short_description  
17  maury will help los angeles dodger win world s...  
26       las vegas professional sport champion sunday  
61  race organizer nonbinary athlete will register...  
62  varvaro pitch atlanta brave start law enforcem...  
67  carlos alcaraz defeat casper ruud u.s open fin...  


In [16]:
# Cleaned description of the row labelled 17 (a label-based lookup, not a position).
df_limited.loc[17, "short_description"]

'maury will help los angeles dodger win world series title base steal prowess die'