<a href="https://colab.research.google.com/github/Abhisek-Tiwari/News_Classification/blob/main/News_Category.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

In [3]:
# Path of the JSON file that holds the news data.
NEWS_JSON_PATH = '/content/drive/MyDrive/news_dataset.json'
df = pd.read_json(NEWS_JSON_PATH)

# Print the (rows, columns) shape, then preview the first rows.
print(df.shape)

df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [5]:
# Number of rows in each news category.
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,4254
SPORTS,4167
CRIME,2893
SCIENCE,1381


In [10]:
# Using Min Sampling: down-sample every category to the size of the smallest
# one. The size is derived from the data instead of being hard-coded, so the
# cell keeps working if the dataset changes.
min_samples = int(df.category.value_counts().min())


def _downsample(category):
  """Return `min_samples` rows of `category`, sampling only when it has more."""
  rows = df[df.category == category]
  if len(rows) == min_samples:
    # Already the smallest class: keep every row in its original order.
    return rows
  return rows.sample(min_samples, random_state=42)


df_business = _downsample('BUSINESS')
df_crime = _downsample('CRIME')
df_sport = _downsample('SPORTS')
df_science = _downsample('SCIENCE')

In [13]:
# Stack the per-category frames into a single frame.
balanced_parts = [df_business, df_crime, df_sport, df_science]
df_balanced = pd.concat(balanced_parts)

# Show how many rows each category contributes.
df_balanced['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,1381
CRIME,1381
SPORTS,1381
SCIENCE,1381


In [14]:
# Integer label assigned to each category name.
target = {
    'BUSINESS': 0,
    'CRIME': 1,
    'SPORTS': 2,
    'SCIENCE': 3,
}

# Store the numeric label in a new column next to the text category.
df_balanced['category_num'] = df_balanced['category'].map(target)

df_balanced.head()

Unnamed: 0,text,category,category_num
594,How to Develop the Next Generation of Innovato...,BUSINESS,0
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the numeric label.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    df_balanced['category_num'],
    stratify=df_balanced['category_num'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Class counts in the training labels (the split above was stratified on category_num).
y_train.value_counts()

Unnamed: 0_level_0,count
category_num,Unnamed: 1_level_1
0,1105
3,1105
1,1105
2,1104


In [22]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [23]:
# Candidate classifiers to train, keyed by the display name used in the reports.
# The random forest is seeded so that repeated runs produce the same scores;
# the other three estimators are left at their default settings.
model_dict = {
    'Random Forest' : RandomForestClassifier(random_state=42),
    'Naive Bayes' : MultinomialNB(),
    'Logistic Regression' : LogisticRegression(),
    'KNN' : KNeighborsClassifier()
}

In [24]:
def fit_and_score(model_dict, X_train, X_test, y_train, y_test):
  """Train every model on CountVectorizer features and report test metrics.

  Each estimator is placed in a Pipeline behind a default CountVectorizer,
  fitted on the training split and used to predict the test split.

  Args:
    model_dict: Mapping of display name -> scikit-learn estimator.
    X_train: Training documents (text).
    X_test: Test documents (text).
    y_train: Labels for X_train.
    y_test: Labels for X_test.

  Returns:
    Dict mapping each model name to the output of classification_report
    for that model's test-set predictions.
  """
  model_scores = {}
  for name, model in model_dict.items():
    clf = Pipeline([
        ('Vectorizer', CountVectorizer()),
        # Fit a fresh copy: Pipeline fits its steps in place, which would
        # otherwise leave the shared estimators in `model_dict` fitted to
        # whichever experiment happened to run last.
        (f'{name}', clone(model)),
    ])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    model_scores[name] = classification_report(y_test, y_pred)
  return model_scores

In [31]:
def fit_and_score_ngram(model_dict, X_train, X_test, y_train, y_test, gram):
  """Train every model on n-gram count features and report test metrics.

  Each estimator is placed in a Pipeline behind a CountVectorizer configured
  with ngram_range=(1, gram), fitted on the training split and used to
  predict the test split.

  Args:
    model_dict: Mapping of display name -> scikit-learn estimator.
    X_train: Training documents (text).
    X_test: Test documents (text).
    y_train: Labels for X_train.
    y_test: Labels for X_test.
    gram: Upper bound of the n-gram range; the lower bound is always 1.

  Returns:
    Dict mapping each model name to the output of classification_report
    for that model's test-set predictions.
  """
  model_scores = {}
  for name, model in model_dict.items():
    clf = Pipeline([
        ('Vectorizer', CountVectorizer(ngram_range=(1, gram))),
        # Fit a fresh copy: Pipeline fits its steps in place, which would
        # otherwise leave the shared estimators in `model_dict` fitted to
        # whichever experiment happened to run last.
        (f'{name}', clone(model)),
    ])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    model_scores[name] = classification_report(y_test, y_pred)
  return model_scores

In [25]:
# Score all models using the default CountVectorizer features.
model_scores = fit_and_score(
    model_dict=model_dict,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [32]:
# Score all models using 1- to 3-gram count features.
model_scores_ngram = fit_and_score_ngram(
    model_dict=model_dict,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    gram=3,
)

In [27]:
# Print each model's name followed by its classification report.
for model_name, report in model_scores.items():
  print(f'{model_name} \n {report}')

Random Forest 
               precision    recall  f1-score   support

           0       0.70      0.83      0.76       276
           1       0.85      0.89      0.87       276
           2       0.91      0.71      0.80       277
           3       0.78      0.76      0.77       276

    accuracy                           0.80      1105
   macro avg       0.81      0.80      0.80      1105
weighted avg       0.81      0.80      0.80      1105

Naive Bayes 
               precision    recall  f1-score   support

           0       0.80      0.91      0.85       276
           1       0.88      0.92      0.90       276
           2       0.91      0.83      0.87       277
           3       0.89      0.81      0.85       276

    accuracy                           0.87      1105
   macro avg       0.87      0.87      0.87      1105
weighted avg       0.87      0.87      0.87      1105

Logistic Regression 
               precision    recall  f1-score   support

           0       0.84

In [33]:
# Print each model's name followed by its n-gram classification report.
for model_name, report in model_scores_ngram.items():
  print(f'{model_name} \n {report}')

Random Forest 
               precision    recall  f1-score   support

           0       0.72      0.81      0.76       276
           1       0.88      0.88      0.88       276
           2       0.91      0.68      0.78       277
           3       0.70      0.79      0.74       276

    accuracy                           0.79      1105
   macro avg       0.80      0.79      0.79      1105
weighted avg       0.80      0.79      0.79      1105

Naive Bayes 
               precision    recall  f1-score   support

           0       0.72      0.94      0.82       276
           1       0.89      0.90      0.89       276
           2       0.94      0.83      0.88       277
           3       0.92      0.73      0.81       276

    accuracy                           0.85      1105
   macro avg       0.87      0.85      0.85      1105
weighted avg       0.87      0.85      0.85      1105

Logistic Regression 
               precision    recall  f1-score   support

           0       0.84

In [34]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline; preprocess() calls `nlp` on every text.
# NOTE(review): preprocess() only reads is_stop / is_punct / lemma_, so unused
# pipeline components could presumably be disabled for speed — confirm first.
nlp = spacy.load('en_core_web_sm')

In [35]:
# Now we do it again after preprocessing the text
def preprocess(text):
  """Return the lemmas of `text`, space-joined, skipping stop words and punctuation."""
  kept_lemmas = [
      tok.lemma_
      for tok in nlp(text)
      if not (tok.is_stop or tok.is_punct)
  ]
  return ' '.join(kept_lemmas)

In [36]:
# Run preprocess() on every document and store the result in a new column.
df_balanced['preprocessed_text'] = df_balanced['text'].apply(preprocess)

In [37]:
# Preview the frame, which now includes the preprocessed_text column.
df_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
594,How to Develop the Next Generation of Innovato...,BUSINESS,0,develop Generation Innovators stop treat way g...
3093,"Madoff Victims' Payout Nears $7.2 Billion, Tru...",BUSINESS,0,Madoff Victims Payout near $ 7.2 billion Trust...
7447,Bay Area Floats 'Sanctuary In Transit Policy' ...,BUSINESS,0,Bay Area Floats Sanctuary Transit Policy prote...
10388,Microsoft Agrees To Acquire LinkedIn For $26.2...,BUSINESS,0,Microsoft agree acquire linkedin $ 26.2 billio...
1782,"Inside A Legal, Multibillion Dollar Weed Market",BUSINESS,0,inside Legal Multibillion Dollar Weed Market


In [39]:
# Split again with the same settings, this time on the preprocessed text.
# This overwrites X_train / X_test / y_train / y_test from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['category_num'],
    stratify=df_balanced['category_num'],
    test_size=0.2,
    random_state=42,
)

In [40]:
# Score all models using the default CountVectorizer features.
model_scores_preprocessed = fit_and_score(
    model_dict=model_dict,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [44]:
# Score all models using 1- to 3-gram count features.
model_scores_ngram_preprocessed = fit_and_score_ngram(
    model_dict=model_dict,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    gram=3,
)

In [42]:
# Print each model's name followed by its report on the preprocessed text.
for model_name, report in model_scores_preprocessed.items():
  print(f'{model_name} \n {report}')

Random Forest 
               precision    recall  f1-score   support

           0       0.72      0.89      0.80       276
           1       0.85      0.92      0.88       276
           2       0.94      0.71      0.81       277
           3       0.83      0.76      0.80       276

    accuracy                           0.82      1105
   macro avg       0.84      0.82      0.82      1105
weighted avg       0.84      0.82      0.82      1105

Naive Bayes 
               precision    recall  f1-score   support

           0       0.86      0.89      0.87       276
           1       0.87      0.95      0.91       276
           2       0.91      0.87      0.89       277
           3       0.91      0.83      0.87       276

    accuracy                           0.88      1105
   macro avg       0.89      0.88      0.88      1105
weighted avg       0.89      0.88      0.88      1105

Logistic Regression 
               precision    recall  f1-score   support

           0       0.84

In [45]:
# Print each model's name followed by its n-gram report on the preprocessed text.
for model_name, report in model_scores_ngram_preprocessed.items():
  print(f'{model_name} \n {report}')

Random Forest 
               precision    recall  f1-score   support

           0       0.70      0.88      0.78       276
           1       0.87      0.91      0.89       276
           2       0.95      0.70      0.80       277
           3       0.82      0.77      0.79       276

    accuracy                           0.82      1105
   macro avg       0.83      0.82      0.82      1105
weighted avg       0.83      0.82      0.82      1105

Naive Bayes 
               precision    recall  f1-score   support

           0       0.84      0.89      0.87       276
           1       0.86      0.94      0.90       276
           2       0.92      0.87      0.89       277
           3       0.91      0.82      0.86       276

    accuracy                           0.88      1105
   macro avg       0.88      0.88      0.88      1105
weighted avg       0.88      0.88      0.88      1105

Logistic Regression 
               precision    recall  f1-score   support

           0       0.85

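CONCLUSION: Of the four models compared here, Naive Bayes performs best on this news-category dataset (about 0.85–0.88 accuracy across the experiments above).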