In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only vocabulary (ngram_range=(2, 2)) learned from a single sentence.
# The displayed vocabulary contains 'for job' rather than 'for a': the
# one-letter word "a" does not survive tokenisation.
v = CountVectorizer(ngram_range=(2, 2)).fit(['Arya Stark is looking for a job'])
v.vocabulary_

{'arya stark': 0,
 'stark is': 4,
 'is looking': 2,
 'looking for': 3,
 'for job': 1}

In [4]:
# Toy corpus: three short sentences used to demonstrate n-gram vocabularies.
corpus = [
    "Cersei ate pizza",
    "Brienne is tall",
    "Brienne is eating pizza",
]

In [8]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline once; preprocess() below reuses this
# module-level `nlp` object for every call.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize the rest.

    The text is run through the module-level `nlp` pipeline. Tokens flagged
    as stop words or punctuation are discarded; every remaining token is
    replaced by its lemma.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token is kept).
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

# Sanity check on one sentence: in the displayed result "is" is dropped and
# "eating" is reduced to "eat".
preprocess('Brienne is eating pizza')

'Brienne eat pizza'

In [9]:
# Run every sentence of the toy corpus through preprocess().
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['Cersei eat pizza', 'Brienne tall', 'Brienne eat pizza']

In [10]:
# Unigram + bigram vocabulary (ngram_range=(1, 2)) learned from the
# preprocessed toy corpus.
v = CountVectorizer(ngram_range=(1, 2)).fit(corpus_processed)
v.vocabulary_

{'cersei': 3,
 'eat': 5,
 'pizza': 7,
 'cersei eat': 4,
 'eat pizza': 6,
 'brienne': 0,
 'tall': 8,
 'brienne tall': 2,
 'brienne eat': 1}

In [15]:
# The vectorizer was fitted on preprocessed text, so new text must go through
# the same preprocess() step before transform(). Without it, words such as
# "is" and "eating" never match the vocabulary and only the 'brienne' and
# 'pizza' unigrams are counted.
v.transform([preprocess('Brienne is eating pizza')]).toarray()

array([[1, 1, 0, 0, 0, 1, 1, 1, 0]])

In [16]:
import pandas as pd

# Read the news dataset from a JSON file given by a relative path; the
# displayed frame has a `text` column and a `category` label column.
df = pd.read_json('news_dataset.json')
df

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS


In [23]:
# Class distribution of the loaded data: the displayed counts are imbalanced,
# with SCIENCE the smallest class at 1381 rows.
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [19]:
# Undersample every category down to the size of the rarest one so the classes
# end up balanced. The size is derived from the data instead of being copied
# by hand from an earlier output, so the cell stays correct if the dataset
# changes. int() keeps `min_sample` a plain Python integer.
min_sample = int(df.category.value_counts().min())

# A fixed random_state keeps each sample reproducible across runs.
df_business = df[df.category=='BUSINESS'].sample(min_sample, random_state=42)
df_sports = df[df.category=='SPORTS'].sample(min_sample, random_state=42)
df_crime = df[df.category=='CRIME'].sample(min_sample, random_state=42)
df_science = df[df.category=='SCIENCE'].sample(min_sample, random_state=42)

In [22]:
# Stack the four per-category samples into one frame and show the class counts.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)
df_balanced["category"].value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [25]:
# Integer label assigned to each category name.
target = {
    'BUSINESS': 0,
    'SPORTS': 1,
    'CRIME': 2,
    'SCIENCE': 3,
}

# Numeric label column derived from the category names.
df_balanced["category_num"] = df_balanced["category"].map(target)

In [26]:
# Inspect the balanced frame; the displayed output includes the category_num column.
df_balanced

Unnamed: 0,text,category,category_num
594,How to Develop the Next Generation of Innovato...,BUSINESS,0
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0
...,...,...,...
9924,A Call for Data Literacy If we care about youn...,SCIENCE,3
10219,Here's What Happens When Someone Sneezes On An...,SCIENCE,3
11884,Most People Don't See How Climate Change Is Af...,SCIENCE,3
7854,"Watch Octopuses Meet for Blind Date, Tricky Se...",SCIENCE,3


In [27]:
# Features are the raw text column; labels are the numeric category ids.
X, y = df_balanced.text, df_balanced.category_num

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Two-step pipeline: unigram + bigram counts feeding a multinomial Naive Bayes
# classifier, fitted on the current training split.
clf = Pipeline(
    steps=[
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
        ('MultiNB', MultinomialNB()),
    ]
)
clf.fit(X_train, y_train)

In [32]:
# Predict labels for the held-out split and print per-class metrics.
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.91      0.81       298
           1       0.93      0.79      0.85       270
           2       0.84      0.91      0.87       250
           3       0.92      0.76      0.83       287

    accuracy                           0.84      1105
   macro avg       0.86      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [33]:
# Run preprocess() on every row's text and store the result in a new column.
df_balanced["preprocessed_text"] = df_balanced["text"].map(preprocess)

In [34]:
# Inspect the frame again; the displayed output now includes preprocessed_text.
df_balanced

Unnamed: 0,text,category,category_num,preprocessed_text
594,How to Develop the Next Generation of Innovato...,BUSINESS,0,develop Generation Innovators stop treat way g...
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0,Madoff Victims Payout near $ 7.2 billion Trust...
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0,Bay Area Floats Sanctuary Transit Policy prote...
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0,Microsoft agree acquire linkedin $ 26.2 billio...
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0,inside Legal Multibillion Dollar Weed Market
...,...,...,...,...
9924,A Call for Data Literacy If we care about youn...,SCIENCE,3,Data Literacy care young people privacy need t...
10219,Here's What Happens When Someone Sneezes On An...,SCIENCE,3,happen sneeze Airplane
11884,Most People Don't See How Climate Change Is Af...,SCIENCE,3,People climate change affect life Problem Heid...
7854,"Watch Octopuses Meet for Blind Date, Tricky Se...",SCIENCE,3,watch Octopuses meet Blind Date tricky Sex Feb...


In [35]:
# Features are now the preprocessed text column; labels are unchanged.
X, y = df_balanced.preprocessed_text, df_balanced.category_num

from sklearn.model_selection import train_test_split

# Same test fraction and seed as the earlier split on the raw text column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Two-step pipeline: unigram + bigram counts feeding a multinomial Naive Bayes
# classifier, fitted on the current training split.
clf = Pipeline(
    steps=[
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
        ('MultiNB', MultinomialNB()),
    ]
)
clf.fit(X_train, y_train)

In [43]:
# Predict labels for the held-out split and print per-class metrics.
y_pred = clf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.89      0.86       298
           1       0.90      0.82      0.86       270
           2       0.83      0.94      0.88       250
           3       0.91      0.84      0.87       287

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105

