In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
v = CountVectorizer(ngram_range = (3,3))
v.fit(["Thor the god of lightining is looking for the SDE role"])

v.vocabulary_

{'thor the god': 8,
 'the god of': 6,
 'god of lightining': 1,
 'of lightining is': 5,
 'lightining is looking': 3,
 'is looking for': 2,
 'looking for the': 4,
 'for the sde': 0,
 'the sde role': 7}

In [21]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]


In [25]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preProcess(text):
    doc = nlp(text)
    
    filtered_token = []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)
    
    return " ".join(filtered_token)


preProcess("Thor ate pizza")
preProcess("Thor is eating pizza")

'thor eat pizza'

In [28]:
corpus_processed = [preProcess(text) for text in corpus]

print(corpus_processed)

['thor eat pizza', 'Loki tall', 'Loki eat pizza']


In [30]:
v = CountVectorizer(ngram_range = (1,2))

v.fit(corpus_processed)

v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [39]:
# vector space model

# converting the text to its vector using the vocublary defined above
v.transform(["thor eat pizza"]).toarray()


array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [46]:
v.transform(["Hulk eat pizza"]).toarray()

# only "eat", "eat pizza" and "pizza" is in vocublary so only 3 1s

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

In [67]:
import pandas as pd

df = pd.read_json('news_dataset.json',lines=True)
print(df.shape)

df.head()

(209527, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [71]:
df.category.value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [73]:
min_samples = 1014

df_business = df[df.category=="BUSINESS"].sample(min_samples,random_state=2022)
df_business


Unnamed: 0,link,headline,category,short_description,authors,date
181516,https://www.huffingtonpost.com/entry/entrepren...,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,"I loved my years in corporate America, which I...","Liz Ryan, Contributor\nSpeaker, writer, sopran...",2012-11-25
58552,https://www.huffingtonpost.com/entry/tesla-fas...,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,A new battery upgrade extends the range of the...,"Alexandria Sage, Reuters",2016-08-23
155102,https://www.huffingtonpost.com/entry/workers-p...,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS,Caterpillar will not break any profit records ...,"Reuters, Reuters",2013-09-01
71725,https://www.huffingtonpost.com/entry/us-tax-ha...,One Of Ben Carson's Craziest Ideas Is Coming True,BUSINESS,The U.S. is the world's hottest new tax haven.,Ben Walsh,2016-03-26
80455,https://www.huffingtonpost.comhttp://www.bloom...,CEO Who Price Gouged HIV Drug Arrested For Sec...,BUSINESS,"A boyish drug company entrepreneur, who rocket...",,2015-12-17
...,...,...,...,...,...,...
98347,https://www.huffingtonpost.com/entry/rewarding...,Rewarding Brand Switchers at the Expense of Lo...,BUSINESS,Mobile service providers are doing it. Banks a...,"Ira Kalb, ContributorAssistant Professor of Cl...",2015-05-30
204261,https://www.huffingtonpost.com/entry/paul-volc...,Paul Volcker: Obama Socialist Comments Have 'N...,BUSINESS,Paul Volcker sounded off on critics of Preside...,Alana Horowitz Satlin,2012-03-25
178928,https://www.huffingtonpost.com/entry/bill-ackm...,Bill Ackman Is Right About Herbalife: It's Rip...,BUSINESS,Bill Ackman claims Herbalife is a pyramid sche...,"Janet Tavakoli, Contributor\nPresident, Tavako...",2012-12-23
93451,https://www.huffingtonpost.com/entry/mega-merg...,MEGA-MERGER: Anthem To Buy Cigna For $54 Billion,BUSINESS,NEW YORK (AP) — Anthem is buying rival Cigna f...,"Tom Murphy and Michelle Chapman, AP",2015-07-24


In [74]:
df_sports = df[df.category=="SPORTS"].sample(min_samples,random_state=2022)
df_crime = df[df.category=="CRIME"].sample(min_samples,random_state=2022)
df_science = df[df.category=="SCIENCE"].sample(min_samples,random_state=2022)

In [81]:
df_balanced = pd.concat([df_business,df_sports,df_crime,df_science],axis = 0)
df_balanced.category.value_counts()

df_balanced['category_num'] = df_balanced.category.map(
    {
        'BUSINESS':0,
        'SPORTS' : 1,
        'CRIME' : 2,
        'SCIENCE': 3
    }
)
df_balanced


Unnamed: 0,link,headline,category,short_description,authors,date,category_num
181516,https://www.huffingtonpost.com/entry/entrepren...,Entrepreneurism: Lots of Little Traumas and No...,BUSINESS,"I loved my years in corporate America, which I...","Liz Ryan, Contributor\nSpeaker, writer, sopran...",2012-11-25,0
58552,https://www.huffingtonpost.com/entry/tesla-fas...,Tesla Just Unveiled The Quickest Car You Can A...,BUSINESS,A new battery upgrade extends the range of the...,"Alexandria Sage, Reuters",2016-08-23,0
155102,https://www.huffingtonpost.com/entry/workers-p...,90 Percent Of Employers Tie Workers' Pay To Co...,BUSINESS,Caterpillar will not break any profit records ...,"Reuters, Reuters",2013-09-01,0
71725,https://www.huffingtonpost.com/entry/us-tax-ha...,One Of Ben Carson's Craziest Ideas Is Coming True,BUSINESS,The U.S. is the world's hottest new tax haven.,Ben Walsh,2016-03-26,0
80455,https://www.huffingtonpost.comhttp://www.bloom...,CEO Who Price Gouged HIV Drug Arrested For Sec...,BUSINESS,"A boyish drug company entrepreneur, who rocket...",,2015-12-17,0
...,...,...,...,...,...,...,...
71737,https://www.huffingtonpost.com/entry/contagiou...,"7 Behaviors That Are Actually Contagious, Acco...",SCIENCE,"Yawns aren't the only things you can ""catch"" f...",Carolyn Gregoire,2016-03-26,3
63545,https://www.huffingtonpost.com/entry/climate-c...,Congress Gets Another Reminder From Scientists...,SCIENCE,"""We owe it to our children and to our children...",Kim Bellware,2016-06-28,3
127184,https://www.huffingtonpost.com/entry/alone-wit...,Here's Why People Actually Hate Having Nothing...,SCIENCE,,Macrina Cooper-White,2014-07-03,3
209428,https://www.huffingtonpost.com/entry/treating-...,Treating a World Without Antibiotics?,SCIENCE,"Because of the overuse of antibiotics, antibio...","Stanley M. Bergman, Contributor\nStanley N. Be...",2012-01-29,3


In [129]:
from sklearn.model_selection import train_test_split

# df_balanced['new_text'] = df_

df_balanced['new_text'] = df_balanced.short_description.apply(preProcess)
print(df_balanced)

X_train,X_test,y_train,y_test = train_test_split(
    df_balanced.new_text,
    df_balanced.category_num,
    test_size=0.2,
    random_state=2022,
    stratify=df_balanced.category_num
)

                                                     link   
181516  https://www.huffingtonpost.com/entry/entrepren...  \
58552   https://www.huffingtonpost.com/entry/tesla-fas...   
155102  https://www.huffingtonpost.com/entry/workers-p...   
71725   https://www.huffingtonpost.com/entry/us-tax-ha...   
80455   https://www.huffingtonpost.comhttp://www.bloom...   
...                                                   ...   
71737   https://www.huffingtonpost.com/entry/contagiou...   
63545   https://www.huffingtonpost.com/entry/climate-c...   
127184  https://www.huffingtonpost.com/entry/alone-wit...   
209428  https://www.huffingtonpost.com/entry/treating-...   
179558  https://www.huffingtonpost.com/entry/space-tra...   

                                                 headline  category   
181516  Entrepreneurism: Lots of Little Traumas and No...  BUSINESS  \
58552   Tesla Just Unveiled The Quickest Car You Can A...  BUSINESS   
155102  90 Percent Of Employers Tie Workers' Pay To Co

In [130]:
print(X_train)

68242     Los Angeles police investigator obtain method ...
117948    plan dream damn dream think adult accept plann...
119766    intriguingly people lie cheat indiscriminately...
90981     solar telescope Maui face opposition similar T...
176430    hard turn tv read editorial page day encounter...
                                ...                        
166989    California home good university large technolo...
206186    similarly electrically charge vortex gun add e...
109463                                                     
59131                             ask coach handle man team
85480                                                 point
Name: new_text, Length: 3244, dtype: object


In [131]:
y_train

68242     2
117948    0
119766    3
90981     3
176430    0
         ..
166989    0
206186    3
109463    1
59131     1
85480     1
Name: category_num, Length: 3244, dtype: int64

In [133]:
print(X_test)

72788     late Friday night Wichita State University lat...
74338          marvel maturity school district official say
113194    midst series technical glitch delay planned la...
115740    baseball writer discriminate talented player m...
41732             individual try afraid say Rabbi Gary Mazo
                                ...                        
124821                                                     
11614     Colin Kaepernick retweete picture apparent pro...
179558    future boom space tourism increasingly expose ...
38943                          death tragic accident police
75077     winter Wednesday night late 1970 Burlington Ve...
Name: new_text, Length: 812, dtype: object


In [134]:
y_test.value_counts()

category_num
2    203
3    203
1    203
0    203
Name: count, dtype: int64

In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 3))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

NameError: name 'CountVectorizer' is not defined

                                                     link   
181516  https://www.huffingtonpost.com/entry/entrepren...  \
58552   https://www.huffingtonpost.com/entry/tesla-fas...   
155102  https://www.huffingtonpost.com/entry/workers-p...   
71725   https://www.huffingtonpost.com/entry/us-tax-ha...   
80455   https://www.huffingtonpost.comhttp://www.bloom...   
...                                                   ...   
71737   https://www.huffingtonpost.com/entry/contagiou...   
63545   https://www.huffingtonpost.com/entry/climate-c...   
127184  https://www.huffingtonpost.com/entry/alone-wit...   
209428  https://www.huffingtonpost.com/entry/treating-...   
179558  https://www.huffingtonpost.com/entry/space-tra...   

                                                 headline  category   
181516  Entrepreneurism: Lots of Little Traumas and No...  BUSINESS  \
58552   Tesla Just Unveiled The Quickest Car You Can A...  BUSINESS   
155102  90 Percent Of Employers Tie Workers' Pay To Co