In [1]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_lg")

In [4]:
import pandas as pd

In [8]:
df = pd.read_json("news_dataset.json")

In [9]:
df.shape

(12695, 2)

In [10]:
df.head()

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [11]:
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [12]:
df['label_num'] = df['category'].map({'BUSINESS':0, 'SPORTS':1, 'CRIME':2, 'SCIENCE':3})

In [13]:
df.head()

Unnamed: 0,text,category,label_num
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2


In [14]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

In [15]:
df['preprocessed_text'] = df.text.apply(preprocess)

In [16]:
df.head()

Unnamed: 0,text,category,label_num,preprocessed_text
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...


In [17]:
df['vector'] = df.preprocessed_text.apply(lambda x: nlp(x).vector)

In [18]:
df.head()

Unnamed: 0,text,category,label_num,preprocessed_text,vector
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3,watch Schrödinger Cat Die University Californi...,"[-0.85190785, 1.0438694, -0.9148885, -1.395817..."
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3,watch freaky Vortex open Flooded Lake,"[0.60747343, 1.9251899, -0.16949336, -0.573053..."
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0,entrepreneur today need Big Budget start year ...,"[0.088981755, 0.5882564, -1.2281352, -0.320762..."
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0,road recharge Electric Car drive high tech hig...,"[-1.0280653, 4.349204, -1.06896, -1.045683, 1...."
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,civilian Guard Fires Gun protect Recruiting Ce...,"[-1.4220493, 0.9367255, -1.8070079, 3.1870718,..."


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df.vector.values, df.label_num,
                                                   test_size=0.2, random_state=0,
                                                   stratify=df.label_num)

In [22]:
import numpy as np

In [23]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

clf = GradientBoostingClassifier()
clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88       851
           1       0.88      0.88      0.88       833
           2       0.86      0.88      0.87       579
           3       0.84      0.76      0.79       276

    accuracy                           0.87      2539
   macro avg       0.86      0.85      0.86      2539
weighted avg       0.87      0.87      0.87      2539



In [25]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = MultinomialNB()
clf.fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.86      0.72       851
           1       0.69      0.81      0.75       833
           2       0.93      0.62      0.75       579
           3       0.00      0.00      0.00       276

    accuracy                           0.70      2539
   macro avg       0.56      0.57      0.55      2539
weighted avg       0.65      0.70      0.66      2539



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
from sklearn.neighbors import KNeighborsClassifier

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = KNeighborsClassifier()
clf.fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.92      0.80       851
           1       0.89      0.78      0.83       833
           2       0.80      0.83      0.82       579
           3       0.93      0.39      0.55       276

    accuracy                           0.79      2539
   macro avg       0.83      0.73      0.75      2539
weighted avg       0.81      0.79      0.79      2539



In [28]:
from sklearn.ensemble import RandomForestClassifier

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.92      0.79       851
           1       0.82      0.81      0.82       833
           2       0.84      0.74      0.78       579
           3       0.94      0.29      0.44       276

    accuracy                           0.77      2539
   macro avg       0.82      0.69      0.71      2539
weighted avg       0.80      0.77      0.76      2539



In [29]:
from sklearn.ensemble import GradientBoostingClassifier

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.fit_transform(X_test_2d)

clf = GradientBoostingClassifier()
clf.fit(scaled_train_embed, y_train)
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.92      0.82       851
           1       0.93      0.78      0.85       833
           2       0.86      0.79      0.82       579
           3       0.77      0.71      0.74       276

    accuracy                           0.82      2539
   macro avg       0.83      0.80      0.81      2539
weighted avg       0.83      0.82      0.82      2539

