In [42]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only bag of words: ngram_range=(2, 2) restricts the vocabulary to
# two-word sequences.
sample_docs = ["I like to eat chicken for dinner"]
v = CountVectorizer(ngram_range=(2, 2))
v.fit(sample_docs)

In [43]:
# Show the learned vocabulary: each bigram mapped to its feature index.
# No bigram contains "I": CountVectorizer's default token pattern only keeps
# tokens of two or more characters.
v.vocabulary_

{'like to': 3,
 'to eat': 4,
 'eat chicken': 1,
 'chicken for': 0,
 'for dinner': 2}

In [44]:
# Widen the range: ngram_range=(1, 3) learns unigrams, bigrams and trigrams
# from the same sentence.
sample_docs = ["I like to eat chicken for dinner"]
v = CountVectorizer(ngram_range=(1, 3))
v.fit(sample_docs)

In [45]:
# Show the learned vocabulary: every 1-, 2- and 3-word sequence mapped to its
# feature index.
v.vocabulary_

{'like': 9,
 'to': 12,
 'eat': 4,
 'chicken': 0,
 'for': 7,
 'dinner': 3,
 'like to': 10,
 'to eat': 13,
 'eat chicken': 5,
 'chicken for': 1,
 'for dinner': 8,
 'like to eat': 11,
 'to eat chicken': 14,
 'eat chicken for': 6,
 'chicken for dinner': 2}

In [46]:
# Three toy sentences used to demonstrate preprocessing before vectorizing.
corpus = ["I like cheese", "Don't trust the cake", "He is living there"]

In [47]:
import spacy

In [48]:
# Load the spaCy "en_core_web_sm" pipeline once; preprocess() below relies on
# it for tokens, stop-word / punctuation flags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [49]:
def preprocess(text):
    """Strip stop words and punctuation from `text` and lemmatize what remains.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are discarded, and the lemmas of the
    surviving tokens are returned joined by single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)
    

In [50]:
# Sanity check: everything except the lemmas of "trust" and "cake" is
# filtered out as a stop word or punctuation.
preprocess("Don't trust the cake")

'trust cake'

In [51]:
# Clean every sentence of the toy corpus with preprocess().
corpus_processed = list(map(preprocess, corpus))

In [52]:
# Inspect the cleaned corpus; in the recorded output "He is living there"
# collapses to the single lemma "live".
corpus_processed

['like cheese', 'trust cake', 'live']

In [53]:
# Fit on the cleaned corpus; ngram_range=(1,2) learns both unigrams and bigrams.
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)

In [54]:
# Vocabulary learned from the cleaned corpus: unigrams and bigrams mapped to
# their feature index.
v.vocabulary_

{'like': 2,
 'cheese': 1,
 'like cheese': 3,
 'trust': 5,
 'cake': 0,
 'trust cake': 6,
 'live': 4}

In [56]:
# The vectorizer was fitted on preprocess()-ed text, so new text must go
# through the same cleaning before being vectorized; otherwise inflected or
# stop-word-laden input would not line up with the learned vocabulary.
# Only terms present in the fitted vocabulary are counted.
v.transform([preprocess("The cake is a lie")]).toarray()

array([[1, 0, 0, 0, 0, 0, 0]])

In [57]:
import pandas as pd 

# Load the news dataset from a JSON file in the working directory; the cells
# below use its `text` and `category` columns.
df = pd.read_json("news_dataset.json")

In [59]:
# (number of rows, number of columns) of the raw dataset.
df.shape

(12695, 2)

In [60]:
# Peek at the first rows to see what the text and category labels look like.
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [61]:
# Class distribution; the recorded output shows the categories are imbalanced,
# which motivates the undersampling done next.
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [66]:
# Undersample every category to the size of the rarest one so that the classes
# are balanced. The size is derived from the data rather than hard-coded; for
# the dataset loaded above it evaluates to 1381 (the SCIENCE count).
min_samples = int(df.category.value_counts().min())


def _sample_category(name):
    """Return `min_samples` rows of `df` whose category is `name`.

    A fixed `random_state` keeps the draw reproducible across runs.
    """
    return df[df.category == name].sample(min_samples, random_state=0)


df_business = _sample_category("BUSINESS")
df_sports = _sample_category("SPORTS")
df_crime = _sample_category("CRIME")
df_science = _sample_category("SCIENCE")

In [67]:
# Stack the four equally sized per-category samples into one balanced frame,
# then confirm the class counts.
balanced_parts = [df_business, df_sports, df_crime, df_science]
df_balanced = pd.concat(balanced_parts, axis=0)
df_balanced["category"].value_counts()

category
BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: count, dtype: int64

In [70]:
# Integer code for each news category, assigned in listing order.
category_names = ["BUSINESS", "SPORTS", "CRIME", "SCIENCE"]
target = {name: code for code, name in enumerate(category_names)}

In [71]:
# Encode the string category of each row as its integer code from `target`.
category_codes = df_balanced["category"].map(target)
df_balanced["category_num"] = category_codes

In [72]:
# Check that the new `category_num` column sits alongside `text` and `category`.
df_balanced.head()

Unnamed: 0,text,category,category_num
5790,Recycling Opens the Door to a Circular Economy...,BUSINESS,0
6168,Beyond Silicon Valley: Using a MOOC to Build a...,BUSINESS,0
2965,Obstacles for Women in Business: The Comfort P...,BUSINESS,0
9320,Tesla Investigating Second Reported Crash In A...,BUSINESS,0
3743,The Places That Most Desperately Need A Higher...,BUSINESS,0


In [68]:
from sklearn.model_selection import train_test_split

In [75]:
# 80/20 split on the raw text, stratified on the label so every class keeps
# the same share in train and test; the seed makes the split reproducible.
texts = df_balanced["text"]
labels = df_balanced["category_num"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

In [80]:
# Verify the stratified split: the test labels should be (almost) evenly
# spread across the four classes.
y_test.value_counts()

category_num
0    277
3    276
1    276
2    276
Name: count, dtype: int64

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [84]:
# Baseline model: default (unigram) bag-of-words counts fed into a
# multinomial Naive Bayes classifier.
bow_nb_steps = [
    ("vectorizer_bow", CountVectorizer()),
    ("Multi_NB", MultinomialNB()),
]
clf = Pipeline(steps=bow_nb_steps)

clf.fit(X_train, y_train)

In [85]:
# Score the fitted pipeline on the held-out split and print per-class metrics.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)

print(report)

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       277
           1       0.90      0.83      0.86       276
           2       0.88      0.90      0.89       276
           3       0.86      0.82      0.84       276

    accuracy                           0.86      1105
   macro avg       0.86      0.86      0.86      1105
weighted avg       0.86      0.86      0.86      1105



In [87]:
# Same classifier, but the vectorizer now emits unigrams and bigrams
# (ngram_range=(1, 2)); still trained on the raw, un-preprocessed text.
ngram_nb_steps = [
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi_NB", MultinomialNB()),
]
clf = Pipeline(steps=ngram_nb_steps)
clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.71      0.91      0.80       277
           1       0.92      0.78      0.84       276
           2       0.89      0.90      0.89       276
           3       0.87      0.76      0.81       276

    accuracy                           0.84      1105
   macro avg       0.85      0.84      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [88]:
# Clean every article with preprocess() (stop words and punctuation removed,
# tokens lemmatized). This runs the spaCy pipeline once per row.
cleaned_text = df_balanced["text"].map(preprocess)
df_balanced["preprocessed_txt"] = cleaned_text

In [89]:
# Redo the 80/20 stratified split, this time on the preprocessed text. The
# seed and stratification column are unchanged from the raw-text split.
texts = df_balanced["preprocessed_txt"]
labels = df_balanced["category_num"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=0,
    stratify=labels,
)

In [90]:
# Unigram + bigram bag-of-words with multinomial Naive Bayes, now trained on
# the preprocessed text produced by preprocess().
ngram_nb_steps = [
    ("vectorizer_bow", CountVectorizer(ngram_range=(1, 2))),
    ("Multi_NB", MultinomialNB()),
]
clf = Pipeline(steps=ngram_nb_steps)
clf.fit(X_train, y_train)

# Evaluate on the held-out (also preprocessed) split.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       277
           1       0.92      0.87      0.89       276
           2       0.86      0.93      0.90       276
           3       0.90      0.83      0.86       276

    accuracy                           0.87      1105
   macro avg       0.88      0.87      0.87      1105
weighted avg       0.88      0.87      0.87      1105

