# News Category Classification

## Importing required datasets

In [17]:
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords

## Getting dataset

In [45]:
df = pd.read_json("data/proj79/News_Category_Dataset_v3.json", lines=True)
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
2,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
3,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
4,https://www.huffpost.com/entry/belk-worker-fou...,Cleaner Was Dead In Belk Bathroom For 4 Days B...,U.S. NEWS,The 63-year-old woman was seen working at the ...,,2022-09-22


## Cleaning dataset

In [46]:
df.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

## Pre-Processing dataset

In [47]:
def preprocess_text(text):
    text = text.lower()
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

headline = []
for head in df["headline"]:
    preprocessed_head = preprocess_text(head)
    headline.append(preprocessed_head)

df["headline"] = headline

short_desc = []
for desc in df["short_description"]:
    preprocessed = preprocess_text(desc)
    short_desc.append(preprocessed)

df["short_description"] = short_desc

In [48]:
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,4 million americans roll sleeves omicrontarget...,U.S. NEWS,health experts said early predict whether dema...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/funniest-tweets...,23 funniest tweets cats dogs week sept 1723,COMEDY,dog dont understand could eaten,Elyse Wanshel,2022-09-23
2,https://www.huffpost.com/entry/funniest-parent...,funniest tweets parents week sept 1723,PARENTING,accidentally put grownup toothpaste toddler’s ...,Caroline Bologna,2022-09-23
3,https://www.huffpost.com/entry/amy-cooper-lose...,woman called cops black birdwatcher loses laws...,U.S. NEWS,amy cooper accused investment firm franklin te...,Nina Golgowski,2022-09-22
4,https://www.huffpost.com/entry/belk-worker-fou...,cleaner dead belk bathroom 4 days body found p...,U.S. NEWS,63yearold woman seen working south carolina st...,,2022-09-22


In [49]:
df.drop(["link", "authors", "date"], axis=1, inplace=True)
df.head()

Unnamed: 0,headline,category,short_description
0,4 million americans roll sleeves omicrontarget...,U.S. NEWS,health experts said early predict whether dema...
1,23 funniest tweets cats dogs week sept 1723,COMEDY,dog dont understand could eaten
2,funniest tweets parents week sept 1723,PARENTING,accidentally put grownup toothpaste toddler’s ...
3,woman called cops black birdwatcher loses laws...,U.S. NEWS,amy cooper accused investment firm franklin te...
4,cleaner dead belk bathroom 4 days body found p...,U.S. NEWS,63yearold woman seen working south carolina st...


In [50]:
X = df.drop("category", axis=1)
y = df["category"]

le = LabelEncoder()
y = le.fit_transform(y)
le.classes_

array(['ARTS', 'ARTS & CULTURE', 'BLACK VOICES', 'BUSINESS', 'COLLEGE',
       'COMEDY', 'CRIME', 'CULTURE & ARTS', 'DIVORCE', 'EDUCATION',
       'ENTERTAINMENT', 'ENVIRONMENT', 'FIFTY', 'FOOD & DRINK',
       'GOOD NEWS', 'GREEN', 'HEALTHY LIVING', 'HOME & LIVING', 'IMPACT',
       'LATINO VOICES', 'MEDIA', 'MONEY', 'PARENTING', 'PARENTS',
       'POLITICS', 'QUEER VOICES', 'RELIGION', 'SCIENCE', 'SPORTS',
       'STYLE', 'STYLE & BEAUTY', 'TASTE', 'TECH', 'THE WORLDPOST',
       'TRAVEL', 'U.S. NEWS', 'WEDDINGS', 'WEIRD NEWS', 'WELLNESS',
       'WOMEN', 'WORLD NEWS', 'WORLDPOST'], dtype=object)

In [51]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [52]:
cv = CountVectorizer()


In [53]:
X_train = cv.fit_transform(X_train['headline'] + X_train['short_description'])
X_test = cv.transform(X_test['headline'] + X_test['short_description'])

## Training the model

In [55]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [56]:
params = {
    'max_depth': 6,
    'eta': 0.1,
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y_train)),
    'eval_metric': 'merror'
}

In [57]:
model = xgb.train(params, dtrain, num_boost_round=100)


In [58]:
y_pred = model.predict(dtest)

In [59]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.49441607407053884

## Prediction

In [96]:
text = "The war between russia and ukraine was never stopping"

text = text.lower()
text = text.translate(str.maketrans('', '', string.punctuation))
words = text.split()
stop_words = set(stopwords.words("english"))
words = [word for word in words if word not in stop_words]
text = ' '.join(words)

text = cv.transform([text])

In [97]:
text = xgb.DMatrix(text)

In [98]:
model.predict(text)

array([41.], dtype=float32)

In [100]:
le.classes_[41]

'WORLDPOST'