In [1]:
import spacy

# Load spaCy's small English model once at module level; `preprocess` below
# reuses this `nlp` object for every text it processes.
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    """Return `text` reduced to the lemmas of its informative tokens.

    The text is run through the module-level `nlp` pipeline. Tokens flagged
    as stop words or punctuation are dropped; the lemma of every remaining
    token is kept, in order, and the lemmas are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [7]:
import os

import pandas as pd

# Location of the News Category Dataset (JSON-lines file). The original local
# path is kept as the default so existing runs behave the same; set the
# NEWS_DATASET_PATH environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    "NEWS_DATASET_PATH",
    'C:/Users/TopTechnology/Desktop/NLP Project/News_Category_Dataset_v3.json',
)

# Only the headline text and its category label are used downstream.
df = pd.read_json(DATA_PATH, lines=True)[['headline', 'category']]

print(df.shape)

(209527, 2)


In [14]:
# Preview the first rows of the loaded headline/category frame.
df.head()

Unnamed: 0,headline,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [15]:
# Number of headlines per category, most frequent first.
df["category"].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [16]:
# Restrict the dataset to the four categories the classifier will distinguish.
desired_categories = ['CRIME', 'COMEDY', 'EDUCATION', 'SPORTS']
df_new = df.loc[df["category"].isin(desired_categories)]
df_new.head()

Unnamed: 0,headline,category
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
17,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS
26,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS
61,Boston Marathon To Make Race More Inclusive Fo...,SPORTS
62,"Anthony Varvaro, MLB Pitcher Turned Transit Co...",SPORTS


In [17]:
# Both columns of the filtered frame should have the same number of rows.
print(df_new.headline.shape)
print(df_new.category.shape)

(15053,)
(15053,)


In [18]:
# Class sizes after filtering; the smallest class bounds the balanced sample size.
df_new["category"].value_counts()

category
COMEDY       5400
SPORTS       5077
CRIME        3562
EDUCATION    1014
Name: count, dtype: int64

In [19]:
# Size of the smallest class among the selected categories. Every category is
# down-sampled to this count so the classes end up balanced. It is derived from
# the data instead of being hard-coded (it evaluates to 1014, the EDUCATION
# count, on this dataset), so the cell keeps working if the dataset changes.
min_samples = int(df_new.category.value_counts().min())

# The same fixed seed is used for every category so the sample is reproducible.
df_comedy = df_new[df_new.category=="COMEDY"].sample(min_samples, random_state=2022)
df_sports = df_new[df_new.category=="SPORTS"].sample(min_samples, random_state=2022)
df_crime = df_new[df_new.category=="CRIME"].sample(min_samples, random_state=2022)
df_education = df_new[df_new.category=="EDUCATION"].sample(min_samples, random_state=2022)

In [21]:
# Stack the four equally sized per-category samples row-wise into one frame,
# then confirm each category contributes the same number of rows.
df_balanced = pd.concat(
    [
        df_comedy,
        df_sports,
        df_crime,
        df_education,
    ]
)
df_balanced["category"].value_counts()

category
COMEDY       1014
SPORTS       1014
CRIME        1014
EDUCATION    1014
Name: count, dtype: int64

In [22]:
# Encode each category name as the integer label used as the model target.
df_balanced["category_num"] = df_balanced["category"].map(
    dict(COMEDY=0, SPORTS=1, CRIME=2, EDUCATION=3)
)

In [23]:
# First rows of the balanced frame, now including the numeric label column.
df_balanced.head()

Unnamed: 0,headline,category,category_num
15312,"Trevor Noah Calls Out Hillary Clinton, And He'...",COMEDY,0
126777,What Sorcery Is This?,COMEDY,0
47238,"Hi, Death? I'd Like To Speak To Your Manager, ...",COMEDY,0
119663,...New Again?,COMEDY,0
30724,"There Was No Audio, So We Captioned The Trump ...",COMEDY,0


In [24]:
# Last rows of the balanced frame, to check the label of the final category block.
df_balanced.tail()

Unnamed: 0,headline,category,category_num
111169,The Global Search for Education: Our Global To...,EDUCATION,3
106643,Reflections of an Octogenarian IV: Whatever Ha...,EDUCATION,3
32087,"For The Future Of Education, Answer The Callin...",EDUCATION,3
37277,America's Charter Schools Have A Commitment Pr...,EDUCATION,3
125885,3 Tips for Using Video Interviews to Hire Grea...,EDUCATION,3


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced headlines for testing. Stratifying on the label
# keeps the four classes equally represented in both splits; the fixed seed
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["headline"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=2023,
    stratify=df_balanced["category_num"],
)

In [26]:
# Size of the training split, followed by a peek at its first raw headlines.
print(X_train.shape)
X_train.head()

(3244,)


1506      Prosecution Rests In Sex-abuse Trial Of Ghisla...
92574     Jury Moves Closer To Death Penalty In James Ho...
47816     In A Crappy Year, These Sports Moments Brought...
92386     Shots Reported For 2nd Day At Mississippi Mili...
113680                              Stop Lying to Yourself!
Name: headline, dtype: object

In [27]:
# Check that the stratified split left the training labels balanced.
y_train.value_counts()

category_num
2    811
1    811
3    811
0    811
Name: count, dtype: int64

In [28]:
# Check that the stratified split left the test labels balanced.
y_test.value_counts()

category_num
1    203
3    203
2    203
0    203
Name: count, dtype: int64

In [29]:
# CountVectorizer was used below without ever being imported in the notebook,
# which raises NameError on a fresh kernel; import it with the other sklearn names.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object: unigram bag-of-words counts fed to multinomial Naive Bayes
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))), #using the ngram_range parameter
    ('Multi NB', MultinomialNB())
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.81      0.84       203
           1       0.82      0.79      0.80       203
           2       0.87      0.91      0.89       203
           3       0.84      0.89      0.86       203

    accuracy                           0.85       812
   macro avg       0.85      0.85      0.85       812
weighted avg       0.85      0.85      0.85       812



In [30]:
# First eight test headlines, for a manual comparison against their predictions.
X_test[:8]

65762    11 Law Professors Say Tom Brady Is Right And T...
19832                  Florida's Teacher Gap Is No Mystery
85480    Richard Sherman Explains A Weekly Contradictio...
20592    Teacher Seniority: The Seat Belts Of The Educa...
95561    What to Watch for in the FIFA Case, Part 5: Va...
80283    Shoplifter Shot And Killed After Running Over ...
94008    F1 Driver Jules Bianchi Dies 9 Months After Su...
93093    Jen Welter To Become First Female Coach In NFL...
Name: headline, dtype: object

In [31]:
# Predicted label ids for the first eight test rows
# (0=COMEDY, 1=SPORTS, 2=CRIME, 3=EDUCATION per the category_num mapping).
y_pred[:8]

array([1, 3, 1, 3, 3, 2, 2, 1], dtype=int64)

In [32]:
# Same experiment with unigram + bigram counts as features.
clf = Pipeline(steps=[
    ("vectorizer_1_2_gram", CountVectorizer(ngram_range=(1, 2))),
    ("Multi NB", MultinomialNB()),
])

# Train on the training split, then predict the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.81      0.83       203
           1       0.83      0.77      0.80       203
           2       0.87      0.92      0.89       203
           3       0.84      0.87      0.86       203

    accuracy                           0.84       812
   macro avg       0.84      0.84      0.84       812
weighted avg       0.84      0.84      0.84       812



In [33]:
# Same experiment with unigram, bigram and trigram counts as features.
clf = Pipeline(steps=[
    ("vectorizer_1_3_grams", CountVectorizer(ngram_range=(1, 3))),
    ("Multi NB", MultinomialNB()),
])

# Train on the training split, then predict the held-out test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.82      0.83       203
           1       0.83      0.77      0.80       203
           2       0.86      0.92      0.89       203
           3       0.85      0.87      0.86       203

    accuracy                           0.84       812
   macro avg       0.84      0.84      0.84       812
weighted avg       0.84      0.84      0.84       812



In [34]:
# Add a column holding each headline after `preprocess` has been applied to it
# (stop words and punctuation removed, remaining tokens lemmatized).
df_balanced['preprocessed_txt'] = df_balanced['headline'].apply(preprocess) 

In [35]:
# Compare the original headline with its preprocessed_txt counterpart.
df_balanced.head()

Unnamed: 0,headline,category,category_num,preprocessed_txt
15312,"Trevor Noah Calls Out Hillary Clinton, And He'...",COMEDY,0,Trevor Noah call Hillary Clinton joke
126777,What Sorcery Is This?,COMEDY,0,Sorcery
47238,"Hi, Death? I'd Like To Speak To Your Manager, ...",COMEDY,0,Hi Death like speak Manager
119663,...New Again?,COMEDY,0,New
30724,"There Was No Audio, So We Captioned The Trump ...",COMEDY,0,audio caption Trump Putin Meeting


In [36]:
# Redo the split on the preprocessed text. The test size, seed and stratification
# match the split on raw headlines; X_train/X_test/y_train/y_test are overwritten.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_txt"],
    df_balanced["category_num"],
    test_size=0.2,
    random_state=2023,
    stratify=df_balanced["category_num"],
)

In [37]:
# Size of the new training split, followed by a peek at its preprocessed text.
print(X_train.shape)
X_train.head()

(3244,)


1506      prosecution Rests sex abuse Trial Ghislaine Ma...
92574      jury move close Death Penalty James Holmes Trial
47816                  Crappy Year Sports Moments bring Joy
92386         shot report 2nd Day Mississippi Military Site
113680                                             stop lie
Name: preprocessed_txt, dtype: object

In [38]:
# Check that the training labels are still balanced after re-splitting.
y_train.value_counts()

category_num
2    811
1    811
3    811
0    811
Name: count, dtype: int64