In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Learn a unigram vocabulary from one sentence and show the resulting
# term -> column-index mapping.
v = CountVectorizer().fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 5, 'hathodawala': 1, 'is': 2, 'looking': 4, 'for': 0, 'job': 3}

In [9]:
# Same sentence, but collect unigrams, bigrams and trigrams (n = 1..3).
v = CountVectorizer(ngram_range=(1, 3)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [10]:
# Tiny three-sentence corpus used for the n-gram demonstration.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
]

In [12]:
import spacy

# Load spaCy's small English model; `nlp` is the pipeline object that the
# preprocessing code calls on raw text.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its kept tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    contributes its `lemma_`. If no token survives, the result is "".
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return " ".join(kept_lemmas)

# Quick sanity check of preprocess on a short sentence.
preprocess("Thor ate pizzas")

'thor eat pizza'

In [13]:
# Run every sentence of the demo corpus through preprocess.
corpus_processed = list(map(preprocess, corpus))

corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [18]:
# Build a unigram + bigram vocabulary from the preprocessed corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [19]:
# Encode a sentence whose unigrams and bigrams all appear in the fitted vocabulary.
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]], dtype=int64)

In [20]:
# 'hulk' was never seen during fit, so only 'eat', 'pizza' and 'eat pizza'
# are counted in the resulting vector.
v.transform(['Hulk eat pizza']).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [21]:
import pandas as pd 

In [23]:
# Read the news-category dataset; lines=True parses the file as one JSON
# record per line.
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [24]:
# Show the dataset's dimensions and preview its first three rows.
print(df.shape)
df.head(3)

(209527, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23


In [29]:
# Count rows per category to see how unevenly the labels are distributed.
df.category.value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [33]:
# Down-sample each of the four chosen categories to the same number of rows
# so the classes are balanced; the fixed random_state makes the draw repeatable.
# NOTE(review): the X_train.value_counts() output later in this notebook shows
# 746 training rows with a blank short_description -- consider filtering blank
# descriptions out before sampling (and confirm each category still has at
# least `min_samples` rows afterwards).
min_samples = 1381

df_business, df_sports, df_crime, df_science = (
    df[df.category == category_name].sample(min_samples, random_state=2022)
    for category_name in ("BUSINESS", "SPORTS", "CRIME", "SCIENCE")
)

In [38]:
# Stack the four equally sized samples into a single frame.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
)

# Every category should now appear the same number of times.
df_balanced["category"].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [40]:
# Encode the category names as integers 0..3 (in the order listed) and store
# the numeric label in a new column.
target = {
    name: code
    for code, name in enumerate(["BUSINESS", "SPORTS", "CRIME", "SCIENCE"])
}
df_balanced["category_number"] = df_balanced["category"].map(target)

In [43]:
# Confirm the new category_number column is present.
df_balanced.head(2)

Unnamed: 0,link,headline,category,short_description,authors,date,category_number
181516,https://www.huffingtonpost.com/entry/entrepren...,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,"I loved my years in corporate America, which I...","Liz Ryan, Contributor\nSpeaker, writer, sopran...",2012-11-25,0
58552,https://www.huffingtonpost.com/entry/tesla-fas...,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,A new battery upgrade extends the range of the...,"Alexandria Sage, Reuters",2016-08-23,0


In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the numeric label
# keeps the four classes (almost exactly) equally represented in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["short_description"],
    df_balanced["category_number"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_number"],
)

In [48]:
# Number of training examples.
print(X_train.shape)

(4419,)


In [49]:
# Peek at a few raw training descriptions.
X_train.head()

196434    The X-37B spacecraft touched down at Californi...
178136    "When we stepped into our first cave, Yangzi c...
102479    O'Connor held the puck in his glove momentaril...
37965     Joseph Jakubowski, 32, was taken into custody ...
191964    42-year-old teacher Derek McGlone was well kno...
Name: short_description, dtype: object

In [51]:
# Verify the stratified split left the training labels balanced.
y_train.value_counts()

category_number
3    1105
2    1105
0    1105
1    1104
Name: count, dtype: int64

In [52]:
# Look for repeated descriptions in the training text.
# NOTE(review): the most frequent value in the output is a blank description
# (746 rows). Such rows give the vectorizer no tokens to work with; consider
# dropping them before training.
X_train.value_counts()

short_description
                                                                                                                                                                                                                                                                                                                                                                                      746
Follow Mike Wall on Twitter @michaeldwall and Google+. Follow us @Spacedotcom, Facebook or Google+. Originally published                                                                                                                                                                                                                                                                3
The X-37B spacecraft touched down at California's Vandenberg Air Force Base at 5:48 a.m. local time Saturday (8:48 a.m. EDT                                                                                                       

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [55]:
# Baseline model: bag-of-words counts fed into multinomial Naive Bayes.
classifier = Pipeline(
    steps=[
        ("vectorizer_bow", CountVectorizer()),
        ("Multi NB", MultinomialNB()),
    ]
)

# Train on the raw descriptions, then score the held-out split.
y_pred = classifier.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.89      0.60       276
           1       0.83      0.50      0.62       277
           2       0.78      0.59      0.67       276
           3       0.77      0.54      0.63       276

    accuracy                           0.63      1105
   macro avg       0.71      0.63      0.63      1105
weighted avg       0.71      0.63      0.63      1105



In [57]:
# Re-fits the same pipeline on the same training data as the previous cell;
# y_pred is not recomputed here.
classifier.fit(X_train,y_train)

In [58]:
# y_pred has not been recomputed since the earlier predict call, so this
# prints the same report again.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.46      0.89      0.60       276
           1       0.83      0.50      0.62       277
           2       0.78      0.59      0.67       276
           3       0.77      0.54      0.63       276

    accuracy                           0.63      1105
   macro avg       0.71      0.63      0.63      1105
weighted avg       0.71      0.63      0.63      1105



In [68]:
# Same pipeline with the n-gram range spelled out as unigrams only.
# NOTE(review): the report printed by this cell is identical to the one from
# the pipeline built with a plain CountVectorizer(), so (1, 1) adds nothing
# new -- try e.g. ngram_range=(1, 2) if the goal is to evaluate bigrams.
classifier = Pipeline(
    steps=[
        ("vectorizer_bow", CountVectorizer(ngram_range=(1, 1))),
        ("Multi NB", MultinomialNB()),
    ]
)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.89      0.60       276
           1       0.83      0.50      0.62       277
           2       0.78      0.59      0.67       276
           3       0.77      0.54      0.63       276

    accuracy                           0.63      1105
   macro avg       0.71      0.63      0.63      1105
weighted avg       0.71      0.63      0.63      1105



In [69]:
# First five test descriptions; note that two of them are blank in the output.
X_test[:5]

79832     Predominantly African American congregations m...
106908                                                     
166971    As I celebrate my birthday, I'm cognizant of a...
140653    Recently I was talking to a new friend about s...
111157                                                     
Name: short_description, dtype: object

In [70]:
# True labels for the same five test rows.
y_test[:5]

79832     0
106908    3
166971    3
140653    0
111157    2
Name: category_number, dtype: int64

In [71]:
# Model predictions for the first five test rows, for comparison with y_test[:5].
y_pred[:5]

array([3, 0, 0, 0, 0], dtype=int64)

In [72]:
# Run preprocess over every description (stop words and punctuation removed,
# remaining tokens lemmatized) and keep the result in a new column.
df_balanced["preprocessed_text"] = df_balanced["short_description"].apply(preprocess)

In [73]:
# Compare the raw short_description with its preprocessed_text counterpart.
df_balanced.head(3)

Unnamed: 0,link,headline,category,short_description,authors,date,category_number,preprocessed_text
181516,https://www.huffingtonpost.com/entry/entrepren...,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,"I loved my years in corporate America, which I...","Liz Ryan, Contributor\nSpeaker, writer, sopran...",2012-11-25,0,love year corporate America view preparation r...
58552,https://www.huffingtonpost.com/entry/tesla-fas...,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,A new battery upgrade extends the range of the...,"Alexandria Sage, Reuters",2016-08-23,0,new battery upgrade extend range car
155102,https://www.huffingtonpost.com/entry/workers-p...,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS,Caterpillar will not break any profit records ...,"Reuters, Reuters",2013-09-01,0,Caterpillar break profit record 2013 cancellat...


In [74]:

# Redo the split on the preprocessed text. The test size, seed and
# stratification are the same as for the split on the raw descriptions.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["preprocessed_text"],
    df_balanced["category_number"],
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced["category_number"],
)


# Unigram bag-of-words + multinomial Naive Bayes, now trained on the
# preprocessed text.
classifier = Pipeline(
    steps=[
        ("vectorizer_bow", CountVectorizer(ngram_range=(1, 1))),
        ("Multi NB", MultinomialNB()),
    ]
)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.89      0.65       276
           1       0.82      0.55      0.66       277
           2       0.77      0.64      0.70       276
           3       0.78      0.59      0.67       276

    accuracy                           0.67      1105
   macro avg       0.72      0.67      0.67      1105
weighted avg       0.72      0.67      0.67      1105

