In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE 

In [2]:
# Load the news dataset from the JSON file that sits next to the notebook.
df = pd.read_json(path_or_buf="news_dataset.json")

In [3]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [4]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [5]:
# Encode each category label as an integer class id.
# The column name keeps its original spelling because later cells read it.
df['category_nuber'] = df['category'].map(
    dict(BUSINESS=0, SPORTS=1, CRIME=2, SCIENCE=3)
)

In [6]:
df.head()

Unnamed: 0,text,category,category_nuber
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2


In [7]:
df.category_nuber.value_counts()

category_nuber
0    4254
1    4167
2    2893
3    1381
Name: count, dtype: int64

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


In [64]:
df.head()

Unnamed: 0,text,category,category_nuber,preprocessed_text
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,WATCH Freaky vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,Civilian Guard Fires Gun protect Recruiting Ce...


In [65]:
# Features are the raw `text` column (not `preprocessed_text`);
# targets are the integer class ids.
x, y = df["text"], df["category_nuber"]

In [66]:
# Learn the bag-of-words vocabulary from the documents in `x`.
vectorizer.fit(x)


In [67]:
# Turn every document into its token-count vector.
# The result is deliberately left as the sparse matrix returned by
# `transform`: a dense copy of a documents x vocabulary count matrix takes
# gigabytes of memory, and the consumers below (SMOTE.fit_resample,
# train_test_split, MultinomialNB) all accept sparse input.
xv = vectorizer.transform(x)


In [68]:
# Oversample the smaller classes with SMOTE so all categories end up with the
# same number of rows. The sampler is seeded so the synthetic rows, and every
# metric computed from them, are reproducible across runs.
# NOTE(review): this resamples the whole dataset before the train/test split
# further down, so synthetic rows interpolated from rows that later land in the
# test set can end up in the training set and inflate the reported scores.
# Consider splitting first and resampling only the training portion.
x_resample, y_resampled = SMOTE(random_state=42).fit_resample(xv, y)

In [69]:
x_resample.shape

(17016, 22661)

In [70]:
y_resampled.shape

(17016,)

In [71]:
y_resampled.value_counts()

category_nuber
3    4254
0    4254
2    4254
1    4254
Name: count, dtype: int64

In [72]:
from sklearn.model_selection import train_test_split

In [73]:
# Hold out a test set from the resampled data; stratifying on the labels
# keeps the class proportions identical in both parts, and the fixed seed
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_resample,
    y_resampled,
    stratify=y_resampled,
    random_state=42,
)

In [74]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [75]:
# A Pipeline is built from a list of (name, step) tuples. Here it holds a
# single step: the multinomial Naive Bayes classifier, registered under the
# name 'Multi NB'.
clf = Pipeline(steps=[('Multi NB', MultinomialNB())])

In [76]:
clf.fit(x_train,y_train)

In [77]:
y_pred = clf.predict(x_test)

In [78]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.93      0.83      1064
           1       0.80      0.89      0.84      1063
           2       0.90      0.83      0.86      1064
           3       0.92      0.66      0.77      1063

    accuracy                           0.83      4254
   macro avg       0.84      0.83      0.83      4254
weighted avg       0.84      0.83      0.83      4254



In [79]:
import spacy

In [80]:
nlp = spacy.load("en_core_web_sm")

In [81]:
def preprocess(text):
    """Return `text` reduced to its lemmas, without stop words or punctuation.

    The text is run through the module-level spaCy pipeline `nlp`. Every token
    that is neither a stop word nor punctuation contributes its lemma, and the
    lemmas are joined with single spaces in document order.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)


In [82]:
# Store the cleaned version of every `text` value in a new column.
# `preprocess` is called once per row, so this cell can be slow on large frames.
df["preprocessed_text"] = df["text"].map(preprocess)

In [83]:
df.head()

Unnamed: 0,text,category,category_nuber,preprocessed_text
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,WATCH Freaky vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,Civilian Guard Fires Gun protect Recruiting Ce...
