In [76]:
import pandas as pd
from sklearn.utils import resample
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ayamohammed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ayamohammed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ayamohammed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ayamohammed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [78]:
# Reading the JSON file containing news articles data
df = pd.read_json("data/News_Category_Dataset_v3.json", lines=True)

# Displaying the first few rows of the loaded dataframe(understanding the structure and content of the data)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [79]:
# Selecting only the 'headline' and 'category' columns for our text classification task
df = df[["headline", "category"]]

# Displaying the filtered dataframe to verify the changes
df

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS
...,...,...
209522,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH
209523,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS
209524,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS
209525,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS


In [80]:
print("Dataset shape is : ",df.shape)
print("The number of rows is : ",df.shape[0])
print("The number of columns is : ",df.shape[1])
print("The unique categories are : ",df['category'].unique())

Dataset shape is :  (209527, 2)
The number of rows is :  209527
The number of columns is :  2
The unique categories are :  ['U.S. NEWS' 'COMEDY' 'PARENTING' 'WORLD NEWS' 'CULTURE & ARTS' 'TECH'
 'SPORTS' 'ENTERTAINMENT' 'POLITICS' 'WEIRD NEWS' 'ENVIRONMENT'
 'EDUCATION' 'CRIME' 'SCIENCE' 'WELLNESS' 'BUSINESS' 'STYLE & BEAUTY'
 'FOOD & DRINK' 'MEDIA' 'QUEER VOICES' 'HOME & LIVING' 'WOMEN'
 'BLACK VOICES' 'TRAVEL' 'MONEY' 'RELIGION' 'LATINO VOICES' 'IMPACT'
 'WEDDINGS' 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'DIVORCE']


In [81]:
# There is Imbalance Impact
df['category'].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [82]:
# Get the count of samples in the majority class
max_samples = df['category'].value_counts().max()

# Create an empty list to store upsampled dataframes
df_upsampled = pd.DataFrame()

# Loop through each category and upsample
for category in df['category'].unique():
    # Get the samples for the current category
    category_samples = df[df['category'] == category]
    
    # Calculate how many samples to generate (upsample to match the majority class)
    n_samples = max_samples - len(category_samples)
    
    if n_samples > 0:  # Only upsample if the category has fewer samples than the majority
        # Perform upsampling with replacement
        upsampled = resample(category_samples,
                           replace=True,      # Sample with replacement
                           n_samples=n_samples,  # Number of samples to generate
                           random_state=42) 
        
        # Combine the original and upsampled data
        df_upsampled = pd.concat([df_upsampled, category_samples, upsampled])
    else:
        # If the category already has the maximum number of samples, just add it
        df_upsampled = pd.concat([df_upsampled, category_samples])


# Shuffle the dataset to mix up the order of samples
df_balanced = df_upsampled.sample(frac=1, random_state=42).reset_index(drop=True)


In [83]:
# Verify the new class distribution
print("Balanced dataset class distribution:")
print(df_balanced['category'].value_counts())

# Check the new shape of the dataset
print("\nOriginal dataset shape:", df.shape)
print("Balanced dataset shape:", df_balanced.shape)

Balanced dataset class distribution:
category
STYLE             35602
TRAVEL            35602
COLLEGE           35602
WOMEN             35602
POLITICS          35602
QUEER VOICES      35602
PARENTING         35602
HEALTHY LIVING    35602
BUSINESS          35602
EDUCATION         35602
FOOD & DRINK      35602
WEDDINGS          35602
WORLDPOST         35602
ARTS & CULTURE    35602
ENTERTAINMENT     35602
WELLNESS          35602
RELIGION          35602
DIVORCE           35602
LATINO VOICES     35602
BLACK VOICES      35602
THE WORLDPOST     35602
STYLE & BEAUTY    35602
SPORTS            35602
CRIME             35602
PARENTS           35602
FIFTY             35602
SCIENCE           35602
WORLD NEWS        35602
GREEN             35602
GOOD NEWS         35602
TASTE             35602
MONEY             35602
CULTURE & ARTS    35602
HOME & LIVING     35602
ARTS              35602
ENVIRONMENT       35602
TECH              35602
MEDIA             35602
COMEDY            35602
WEIRD NEWS        

In [84]:
def preprocess_text(text):
    """
    Preprocesses a single text string by performing the following steps:
    1. Convert to lowercase
    2. Remove special characters and numbers
    3. Tokenize the text
    4. Remove stopwords
    5. Lemmatize words
    6. Join tokens back into a single string
    """
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Remove single character tokens
    tokens = [word for word in tokens if len(word) > 1]
    
    # Join tokens back to string
    return ' '.join(tokens)

In [85]:
# Apply preprocessing to the 'headline' column
print("Starting text preprocessing...")
df_balanced['processed_headline'] = df_balanced['headline'].apply(preprocess_text)

Starting text preprocessing...


In [86]:
# Show before and after examples
print("\nOriginal vs Processed Text Examples:")
for i in range(3):
    print(f"\nOriginal: {df_balanced['headline'].iloc[i]}")
    print(f"Processed: {df_balanced['processed_headline'].iloc[i]}")
    print("-" * 80)


Original vs Processed Text Examples:

Original: Bad Beard Days Happen, So Here's How To Handle Them
Processed: bad beard day happen here handle
--------------------------------------------------------------------------------

Original: Michelle Kwan Wedding Dress: Olympic Skater Wants Vera Wang To Make Her Wedding Gown
Processed: michelle kwan wedding dress olympic skater want vera wang make wedding gown
--------------------------------------------------------------------------------

Original: Dad Life Is Totally Metal
Processed: dad life totally metal
--------------------------------------------------------------------------------


In [87]:
# Drop the 'headline' column
df_balanced.drop(columns=['headline'], inplace=True)

In [88]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Create a new column with numerical labels
df_balanced['category_encoded'] = label_encoder.fit_transform(df_balanced['category'])

# Create a mapping dictionary for reference
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display the mapping
print("Category to Number Mapping:")
for category, num in sorted(category_mapping.items(), key=lambda x: x[1]):
    print(f"{category}: {num}")


Category to Number Mapping:
ARTS: 0
ARTS & CULTURE: 1
BLACK VOICES: 2
BUSINESS: 3
COLLEGE: 4
COMEDY: 5
CRIME: 6
CULTURE & ARTS: 7
DIVORCE: 8
EDUCATION: 9
ENTERTAINMENT: 10
ENVIRONMENT: 11
FIFTY: 12
FOOD & DRINK: 13
GOOD NEWS: 14
GREEN: 15
HEALTHY LIVING: 16
HOME & LIVING: 17
IMPACT: 18
LATINO VOICES: 19
MEDIA: 20
MONEY: 21
PARENTING: 22
PARENTS: 23
POLITICS: 24
QUEER VOICES: 25
RELIGION: 26
SCIENCE: 27
SPORTS: 28
STYLE: 29
STYLE & BEAUTY: 30
TASTE: 31
TECH: 32
THE WORLDPOST: 33
TRAVEL: 34
U.S. NEWS: 35
WEDDINGS: 36
WEIRD NEWS: 37
WELLNESS: 38
WOMEN: 39
WORLD NEWS: 40
WORLDPOST: 41


In [89]:
print("\nFirst few rows with encoded categories:")
print(df_balanced[['category', 'category_encoded']].head())


First few rows with encoded categories:
   category  category_encoded
0     STYLE                29
1  WEDDINGS                36
2   PARENTS                23
3     FIFTY                12
4   SCIENCE                27


When you use **stratify**, you're telling train_test_split to:

Look at the original proportions of each category in your data

Maintain those exact same proportions when creating both the training and test sets

In [90]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_balanced['processed_headline'], df_balanced['category_encoded'], test_size=0.2, random_state=42,stratify=df_balanced['category_encoded'])

In [91]:
print("Original distribution:\n", df_balanced['category_encoded'].value_counts(normalize=True))
print("\nTrain distribution:\n", y_train.value_counts(normalize=True))
print("\nTest distribution:\n", y_test.value_counts(normalize=True))

Original distribution:
 category_encoded
29    0.02381
34    0.02381
4     0.02381
39    0.02381
24    0.02381
25    0.02381
22    0.02381
16    0.02381
3     0.02381
9     0.02381
13    0.02381
36    0.02381
41    0.02381
1     0.02381
10    0.02381
38    0.02381
26    0.02381
8     0.02381
19    0.02381
2     0.02381
33    0.02381
30    0.02381
28    0.02381
6     0.02381
23    0.02381
12    0.02381
27    0.02381
40    0.02381
15    0.02381
14    0.02381
31    0.02381
21    0.02381
7     0.02381
17    0.02381
0     0.02381
11    0.02381
32    0.02381
20    0.02381
5     0.02381
37    0.02381
18    0.02381
35    0.02381
Name: proportion, dtype: float64

Train distribution:
 category_encoded
19    0.023810
15    0.023810
16    0.023810
35    0.023810
33    0.023810
8     0.023810
17    0.023810
12    0.023810
39    0.023810
5     0.023810
40    0.023810
24    0.023810
30    0.023810
11    0.023810
27    0.023810
13    0.023810
36    0.023810
29    0.023810
25    0.023810
6     0.023810

In [92]:
clf_BOW = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 1))),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

clf_BOW.fit(X_train, y_train)
y_pred = clf_BOW.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      7120
           1       0.97      0.99      0.98      7120
           2       0.89      0.89      0.89      7121
           3       0.84      0.79      0.81      7121
           4       0.94      0.98      0.96      7120
           5       0.88      0.83      0.85      7120
           6       0.94      0.96      0.95      7120
           7       0.97      0.99      0.98      7121
           8       0.92      0.92      0.92      7120
           9       0.93      0.98      0.95      7120
          10       0.88      0.69      0.77      7121
          11       0.96      0.99      0.97      7120
          12       0.82      0.91      0.86      7120
          13       0.87      0.87      0.87      7120
          14       0.94      0.98      0.96      7121
          15       0.92      0.95      0.93      7120
          16       0.69      0.66      0.67      7120
          17       0.93    

In [None]:
clf_BI = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

clf_BI.fit(X_train, y_train)
y_pred = clf_BI.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
clf_TRI = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

clf_TRI.fit(X_train, y_train)
y_pred = clf_TRI.predict(X_test)
print(classification_report(y_test, y_pred))

KeyboardInterrupt: 