## `Step1 ---> Data Cleaning`

##### `Import Libraries`

In [80]:
import pandas as pd

#### `Read Data and convert it to CSV file`

In [81]:
# Load Data
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)  # Use lines=True for line-delimited JSON

# Save to df
df.to_csv('category.csv', index=False)


#### `Data Cleaning and preprocessing`

In [82]:
df=pd.read_csv('category.csv')
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [83]:
## drop cols 
df.drop(columns=['link','short_description','authors','date'],axis=1,inplace=True)

In [84]:
df.head()

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  209521 non-null  object
 1   category  209527 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [86]:
df.shape

(209527, 2)

In [87]:
## Check nulls 

df.isna().sum()

headline    6
category    0
dtype: int64

In [88]:
## Drop nulls
df.dropna(inplace=True)


In [89]:
## Check nulls after drop it
df.isna().sum()

headline    0
category    0
dtype: int64

In [90]:
## Check Duplicates
df.duplicated().sum()

np.int64(1419)

In [91]:
## Drop Duplicates
df.drop_duplicates(inplace=True)

In [92]:
## Check after drop
df.duplicated().sum()

np.int64(0)

In [93]:
# Identify duplicates in headline col 

duplicates=df[df['headline'].duplicated(keep=False)]
duplicates


Unnamed: 0,headline,category
7645,Here Are The Movies Coming To Netflix This Week,HOME & LIVING
8457,Here Are The Movies Coming To Netflix This Week,ENTERTAINMENT
18333,Gingerbread Dessert Recipes For The Best Holid...,TASTE
32378,Why We March,BLACK VOICES
33649,How Successful People Stay Calm,BUSINESS
...,...,...
207717,The Widower,DIVORCE
208182,"Kelly Rowland's Birthday: Singer Turns 31, Sty...",STYLE & BEAUTY
208217,"Kelly Rowland's Birthday: Singer Turns 31, Sty...",BLACK VOICES
208763,"NFL Homes: Amani Toomer, Muhammed Wilkerson, V...",STYLE & BEAUTY


In [94]:
# Drop all rows with duplicates
df = df[~df['headline'].duplicated(keep=False)]

In [95]:
## Check after drop it 
df[df['headline'].duplicated(keep=False)]

Unnamed: 0,headline,category


In [96]:
df['category'].unique()

array(['U.S. NEWS', 'COMEDY', 'PARENTING', 'WORLD NEWS', 'CULTURE & ARTS',
       'TECH', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WEIRD NEWS',
       'ENVIRONMENT', 'EDUCATION', 'CRIME', 'SCIENCE', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'MEDIA',
       'QUEER VOICES', 'HOME & LIVING', 'WOMEN', 'BLACK VOICES', 'TRAVEL',
       'MONEY', 'RELIGION', 'LATINO VOICES', 'IMPACT', 'WEDDINGS',
       'COLLEGE', 'PARENTS', 'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE',
       'HEALTHY LIVING', 'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST',
       'FIFTY', 'ARTS', 'DIVORCE'], dtype=object)

In [97]:
categories_to_keep = [
    "POLITICS", "WELLNESS", "ENTERTAINMENT", "TRAVEL",
    "STYLE & BEAUTY", "QUEER VOICES", "FOOD & DRINK",
    "BUSINESS", "SPORTS"
]


df['category'] = df['category'].apply(lambda x: x if x in categories_to_keep else "OTHER")

In [98]:
df['category'].value_counts()

category
OTHER             94350
POLITICS          35467
WELLNESS          17862
ENTERTAINMENT     17317
TRAVEL             9871
STYLE & BEAUTY     9313
QUEER VOICES       6335
FOOD & DRINK       6330
BUSINESS           5974
SPORTS             5074
Name: count, dtype: int64

In [99]:
df.head(10)

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,OTHER
1,"American Airlines Flyer Charged, Banned For Li...",OTHER
2,23 Of The Funniest Tweets About Cats And Dogs ...,OTHER
3,The Funniest Tweets From Parents This Week (Se...,OTHER
4,Woman Who Called Cops On Black Bird-Watcher Lo...,OTHER
5,Cleaner Was Dead In Belk Bathroom For 4 Days B...,OTHER
6,Reporter Gets Adorable Surprise From Her Boyfr...,OTHER
7,Puerto Ricans Desperate For Water After Hurric...,OTHER
8,How A New Documentary Captures The Complexity ...,OTHER
9,Biden At UN To Call Russian War An Affront To ...,OTHER


In [100]:
df.tail(10)

Unnamed: 0,headline,category
209516,Allard Van Hoorn's 'Urban Songline' Explores R...,OTHER
209517,Good Games -- Is It possible?,OTHER
209518,Google+ Now Open for Teens With Some Safeguards,OTHER
209519,Web Wars,OTHER
209520,"First White House Chief Technology Officer, An...",OTHER
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,OTHER
209523,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS
209524,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS
209526,Dwight Howard Rips Teammates After Magic Loss ...,SPORTS


##### `Cleaning the Text`

In [101]:

import re

def clean_text(text):
    text = re.sub(r'\b\w*\d\w*\b', '', text)  # Remove words with numbers
    text = re.sub(r'(?:@\S+|#\S+|http\S+)', '', text)  # Remove URLs, hashtags, mentions
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespaces
    text = re.sub(r'\[[^()]*\]', '', text)
    text = re.sub(r'\d+', '', text)# Remove digits
    return text

In [102]:
## Before
df['headline'].head(30).value_counts()

headline
Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters                             1
American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video                1
23 Of The Funniest Tweets About Cats And Dogs This Week (Sept. 17-23)                                    1
The Funniest Tweets From Parents This Week (Sept. 17-23)                                                 1
Woman Who Called Cops On Black Bird-Watcher Loses Lawsuit Against Ex-Employer                            1
Cleaner Was Dead In Belk Bathroom For 4 Days Before Body Found: Police                                   1
Reporter Gets Adorable Surprise From Her Boyfriend While Live On TV                                      1
Puerto Ricans Desperate For Water After Hurricane Fiona’s Rampage                                        1
How A New Documentary Captures The Complexity Of Being A Child Of Immigrants                             1
Biden At UN To Call Russian 

In [103]:
# Applr clean function
df['headline'] = df['headline'].apply(clean_text)

In [104]:
df['headline'].head(30).value_counts()

headline
Over Million Americans Roll Up Sleeves For OmicronTargeted COVID Boosters                             1
American Airlines Flyer Charged Banned For Life After Punching Flight Attendant On Video              1
Of The Funniest Tweets About Cats And Dogs This Week Sept                                             1
The Funniest Tweets From Parents This Week Sept                                                       1
Woman Who Called Cops On Black BirdWatcher Loses Lawsuit Against ExEmployer                           1
Cleaner Was Dead In Belk Bathroom For Days Before Body Found Police                                   1
Reporter Gets Adorable Surprise From Her Boyfriend While Live On TV                                   1
Puerto Ricans Desperate For Water After Hurricane Fionas Rampage                                      1
How A New Documentary Captures The Complexity Of Being A Child Of Immigrants                          1
Biden At UN To Call Russian War An Affront To Bodys Cha

In [105]:
df['category'].unique()

array(['OTHER', 'SPORTS', 'ENTERTAINMENT', 'POLITICS', 'WELLNESS',
       'BUSINESS', 'STYLE & BEAUTY', 'FOOD & DRINK', 'QUEER VOICES',
       'TRAVEL'], dtype=object)

In [106]:
df['category'].value_counts()

category
OTHER             94350
POLITICS          35467
WELLNESS          17862
ENTERTAINMENT     17317
TRAVEL             9871
STYLE & BEAUTY     9313
QUEER VOICES       6335
FOOD & DRINK       6330
BUSINESS           5974
SPORTS             5074
Name: count, dtype: int64

In [107]:
df_other=df[df['category']=='OTHER'].sample(n=10000,random_state=42)
df_politics=df[df['category']=='POLITICS'].sample(n=10000,random_state=42)
df_wellness=df[df['category']=='WELLNESS'].sample(n=10000,random_state=42)
df_entertainment=df[df['category']=='ENTERTAINMENT'].sample(n=10000,random_state=42)


In [108]:
categories_to_drop = ['OTHER', 'POLITICS', 'WELLNESS', 'ENTERTAINMENT']

indexes_to_drop = df[df['category'].isin(categories_to_drop)].index

df.drop(indexes_to_drop,inplace=True,axis=0)

df.shape

(42897, 2)

In [109]:
df_new= pd.concat([df, df_other, df_politics, df_wellness, df_entertainment])


df_new.shape

(82897, 2)

In [110]:
df_new['category'].value_counts()


category
WELLNESS          10000
ENTERTAINMENT     10000
POLITICS          10000
OTHER             10000
TRAVEL             9871
STYLE & BEAUTY     9313
QUEER VOICES       6335
FOOD & DRINK       6330
BUSINESS           5974
SPORTS             5074
Name: count, dtype: int64

##### `Split Data`

In [111]:
X=df_new['headline']
y=df_new['category']

In [112]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y= label_encoder.fit_transform(y)

In [113]:
labels=label_encoder.classes_

In [114]:
# Display the mapping
label_mapping = {label: index for index, label in enumerate(labels)}
print("Label Mapping:", label_mapping)

Label Mapping: {'BUSINESS': 0, 'ENTERTAINMENT': 1, 'FOOD & DRINK': 2, 'OTHER': 3, 'POLITICS': 4, 'QUEER VOICES': 5, 'SPORTS': 6, 'STYLE & BEAUTY': 7, 'TRAVEL': 8, 'WELLNESS': 9}


In [115]:
from sklearn.model_selection import train_test_split

# Split the data into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


train_df = pd.DataFrame(X_train)
train_df['category'] = y_train

test_df = pd.DataFrame(X_test)
test_df['category'] = y_test



In [116]:
# Save the DataFrames as CSV files
train_df.to_csv('train_data.csv', index=False)
test_df.to_csv('test_data.csv', index=False)

In [117]:
## Check shapes

print('x_train shape :',X_train.shape)
print('y_train shape :',y_train.shape)
print('*'*20)

print('x_test shape :',X_test.shape)
print('y_test shape :',y_test.shape)

x_train shape : (66317,)
y_train shape : (66317,)
********************
x_test shape : (16580,)
y_test shape : (16580,)


In [118]:
### Check Duplicates after split 
print('Num of duplicates after splitting for Train Data :', train_df.duplicated().sum())


print('Num of duplicates after splitting for Test Data :', test_df.duplicated().sum())



Num of duplicates after splitting for Train Data : 156
Num of duplicates after splitting for Test Data : 17


In [119]:
## Drop This Duplicates
train_df.drop_duplicates(inplace=True)
test_df.drop_duplicates(inplace=True)

In [120]:
X_train

197576              Our Most Fearless Tweet Finalist PHOTOS
203877                                         Hunger Hurts
25400     Perfect Tweets About Bachelor In Paradise Seas...
142270                        Nuh Linga Get Down to Jamaica
153990    Airplane Boneyards Look Even Cooler In Instagr...
                                ...                        
120446    Osteochondral Ankle Surgery Is This What ShinS...
131153                         Americas Most Damaged Brands
191838    Grief and Loss Tips on How We Can Help Those A...
83359     Beyonce Taylor Swift And Other Celebrities Sen...
111909       Hillary President The Elephant in the RoomBill
Name: headline, Length: 66317, dtype: object