# Predicting the Category of a News Article from its Headline & Description

## Imports

In [None]:
# Classic imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Huggingface
import transformers

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Embedding, Flatten, Dense, Dropout
from keras.layers import Conv1D, SimpleRNN, Bidirectional, MaxPooling1D, GlobalMaxPool1D, LSTM, GRU
from keras.models import Sequential
from keras.regularizers import L1L2

# Sklearn
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

# More NLP imports
import nltk
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Other
import os
import re

## Loading Data, Exploratory Data Analysis (EDA), and Data Preprocessing

In [None]:
# Load the news dataset; lines=True makes pandas parse the file as
# newline-delimited JSON (one record per line).
df = pd.read_json('News_Category_Dataset_v3.json', lines = True)
# Preview the first rows to check the parsed columns.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape

In [None]:
# Fold the duplicate "THE WORLDPOST" label into "WORLDPOST"
df['category'] = df['category'].replace('THE WORLDPOST', 'WORLDPOST')

# Label-encode the category (one integer id per class, not one-hot)
encoder = LabelEncoder()
df['category_encoded'] = encoder.fit_transform(df['category'])

# Stringify and lowercase both text columns
for text_col in ('headline', 'short_description'):
    df[text_col] = df[text_col].map(str).str.lower()

# Word counts per row, used by the EDA cells below
df['len_h'] = df['headline'].str.split().str.len()
df['len_sd'] = df['short_description'].str.split().str.len()

In [None]:
# Summary statistics for the numeric columns (label ids and word counts)
df.describe() # Full stats

In [None]:
# For stats figure
# The description word-count column is created as 'len_sd' in the
# preprocessing cell, so select it under that name ('len_d' does not exist).
length_stats = df.describe()[['len_sd', 'len_h']]
length_stats = length_stats.rename(
    columns={'len_sd': 'Description Length', 'len_h': 'Headline Length'}
)
# Keep only the rows shown in the report figure
length_stats.loc[['count', 'mean', 'std', 'min', '50%', 'max']]

In [None]:
# Histogram for EDA figures
# Distribution of headline lengths in words; the x-axis is clipped to -2..32.
ax = sns.histplot(data=df, x='len_h', bins=45)
ax.set_xlabel('Number of Words in Headline Text')
ax.set_xlim(-2, 32)

In [None]:
# Histogram
# Distribution of short-description lengths in words. The column is named
# 'len_sd' where it is created; 'descr_len' is never defined.
ax = sns.histplot(df['len_sd'], bins=80)
ax.set_xlabel('Number of Words in Short Description')
plt.xlim(-5, 100)

In [None]:
# Top categories figure
# Articles per category, most frequent first. Naming the index and the
# value column explicitly avoids depending on the default column names that
# reset_index() produces, which differ between pandas versions.
categories = (
    df['category']
    .value_counts()
    .rename_axis('categories')
    .reset_index(name='n_c')
)

# Visualize top 10 categories and their article counts
top_n = 10
top_categories = categories.head(top_n)
plt.figure(figsize=(10,6))
# x/y are passed by keyword: recent seaborn releases reject positional data.
ax = sns.barplot(
    x=top_categories['categories'].to_numpy(),
    y=top_categories['n_c'].to_numpy(),
)
plt.xlabel("News Categories")
plt.xticks(rotation=45) # For better visibility
plt.ylabel("Count per Category")
plt.show()

In [None]:
# Total words (descriptions + headlines); the description word-count
# column is 'len_sd', not 'len_d'.
df['len_sd'].sum() + df['len_h'].sum()

In [None]:
# Desc. words: total word count over all short descriptions
# (the column is 'len_sd', not 'len_d').
df['len_sd'].sum()

In [None]:
# Headline words: total word count over all headlines
df['len_h'].sum()

In [None]:
# Word Cloud Figure
# Join every headline into one string and render its word cloud.
all_headline_text = ' '.join(df['headline'].dropna())
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_headline_text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # a word cloud has no meaningful axes
plt.show()

In [None]:
# For convenience, merge both text columns into 'short_description'.
# Both columns were already passed through str() in the lowercase step, so
# the concatenation never produces NaN (there's always a headline though).
# A space separator keeps the last headline word and the first description
# word from being fused into a single token.
df['short_description'] = df['headline'] + ' ' + df['short_description']

In [None]:
# Dataset split
seed = 77 # Lucky number

# First cut: 80% train, 20% held out
X_train, X_temp, y_train, y_temp = train_test_split(
    df['short_description'],
    df['category_encoded'],
    test_size=0.2,
    random_state=seed,
)

# Second cut: halve the held-out part into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Get tokenizer
# Loads the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize Features and One-hot Encode Targets

def _tokenize(text, tokenizer, max_length=512):
    batch_encoded = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=max_length,
        truncation=True
    )
    return np.array(batch_encoded['input_ids'])

max_length = 80
# Each split is encoded from its OWN features and labels. The validation
# arrays must come from X_val / y_val; building them from the test split
# would leak test data into model selection.
X_train_encoded = _tokenize(X_train.astype('str'), tokenizer, max_length=max_length)
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=41,dtype = 'int32')
X_val_encoded = _tokenize(X_val.astype('str'), tokenizer, max_length=max_length)
y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=41,dtype = 'int32')
X_test_encoded = _tokenize(X_test.astype('str'), tokenizer, max_length=max_length)
y_test_encoded = tf.keras.utils.to_categorical(y_test, num_classes=41,dtype = 'int32')

## Models - BERT

In [None]:
### Reference code: https://huggingface.co/google-bert/bert-base-uncased

from transformers import BertTokenizer, TFBertModel


# BERT pre-trained model
bert_layer = TFBertModel.from_pretrained("bert-base-uncased")


loss='categorical_crossentropy'
max_length=80
num_classes=41
drop_rate=0.2
# NOTE(review): 1e-3 is applied to the whole network, including the
# pretrained encoder - confirm this is the intended fine-tuning rate.
learning_rate=1e-3

input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
# First element of the BERT output; the slice below takes its position 0.
sequence_output = bert_layer(input_ids)[0]

# Classification head on the vector at sequence position 0
cls = sequence_output[:, 0, :]
x = tf.keras.layers.Dropout(drop_rate)(cls)
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

model = tf.keras.Model(inputs=input_ids, outputs=outputs)
model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=loss,
              metrics=['accuracy'])

In [None]:
# Datasets
batch_size = 32
# Training data repeats indefinitely (the epoch length is set through
# steps_per_epoch at fit time) and is shuffled with a 2048-element buffer.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_encoded, y_train_encoded))
    .repeat()
    .shuffle(2048)
    .batch(batch_size)
)

# from_tensor_slices takes ONE structure argument, so features and labels
# must be passed together as a tuple; a second positional argument would be
# interpreted as the op name instead of the labels.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val_encoded, y_val_encoded))
    .batch(batch_size)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test_encoded, y_test_encoded))
    .batch(batch_size)
)

In [None]:
# Training
num_epochs = 10
# The training dataset repeats forever, so one epoch is defined here as the
# number of full batches in the training set.
n_steps = len(X_train_encoded) // batch_size
model.fit(
    train_dataset,
    epochs=num_epochs,
    steps_per_epoch=n_steps,
    validation_data=val_dataset,
)

In [None]:
# Evaluation
# Predict class probabilities on the test set and take the arg-max class id.
preds = model.predict(test_dataset, verbose=1)
pred_classes = preds.argmax(axis=1)

# Map integer ids back to category names via the fitted label encoder.
id_to_category = encoder.classes_
y_test_pred = list(id_to_category[pred_classes])
y_test_true = list(id_to_category[np.asarray(y_test)])

acc = sklearn.metrics.accuracy_score(y_test_true, y_test_pred)
acc

## Models - DistilBERT

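This section follows the same pipeline as the BERT section above (same hyperparameters, datasets, and classification head), swapping in the pretrained DistilBERT encoder.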

In [None]:
## REFERENCE CODE: https://huggingface.co/distilbert/distilbert-base-uncased

# DistilBERT pre-trained model
from transformers import DistilBertTokenizer, TFDistilBertModel
# NOTE(review): the tokenizer is reloaded here, but the datasets used below
# were encoded earlier with the BERT tokenizer - confirm the two
# vocabularies match, or re-encode the splits with this tokenizer.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_layer = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

loss='categorical_crossentropy'
max_length=80
num_classes=41
drop_rate=0.2
learning_rate=1e-3

input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
# First element of the DistilBERT output; the slice below takes position 0.
sequence_output = bert_layer(input_ids)[0]

# Classification head on the vector at sequence position 0
cls = sequence_output[:, 0, :]
x = tf.keras.layers.Dropout(drop_rate)(cls)
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

model = tf.keras.Model(inputs=input_ids, outputs=outputs)
model.compile(tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss=loss,
              metrics=['accuracy'])

# Training (reuses train_dataset / val_dataset / batch_size built earlier)
num_epochs = 10
n_steps = X_train_encoded.shape[0] // batch_size
model.fit(train_dataset, steps_per_epoch=n_steps, epochs=num_epochs, validation_data=val_dataset)

# Evaluation: arg-max class ids mapped back to category names
preds = model.predict(test_dataset, verbose=1)
pred_classes = np.argmax(preds, axis=1)
id_to_category = encoder.classes_
y_test_pred = [id_to_category[p] for p in pred_classes]
y_test_true = [id_to_category[t] for t in y_test]
acc = sklearn.metrics.accuracy_score(y_test_true, y_test_pred)
acc

## Data Pre-Processing specifically for Baselines

In [None]:
# Reload the raw dataset so the baseline preprocessing starts from the
# original columns rather than the ones modified for the transformer models.
df = pd.read_json('News_Category_Dataset_v3.json', lines = True)
df.head()

In [None]:
# (number of rows, number of columns) of the reloaded dataset
df.shape

In [None]:
# Merge duplicate category
df['category'] = df['category'].map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)

# Label-encode the category (one integer id per class, not one-hot)
encoder = LabelEncoder()
df['category_encoded'] = encoder.fit_transform(df['category'])

# Convert to lowercase
df['headline'] = df['headline'].apply(lambda x: str(x).lower())
df['short_description'] = df['short_description'].apply(lambda x: str(x).lower())

# Drop duplicates
df.drop_duplicates(keep='last', inplace=True)

# Like before, merge headline and short description into one text column.
# The space keeps the last headline word and the first description word
# from being fused into a single token.
df['short_description'] = df['headline'].astype(str) + ' ' + df['short_description'].astype(str)
# Only 'headline' is redundant now; 'short_description' holds the merged
# text and is read by the dataset split that follows, so it must be kept.
df.drop(columns=['headline'], inplace=True)
df.head()

In [None]:
# Dataset split
seed = 77 # Lucky number

# First cut: 80% train, 20% held out (targets are the category names)
X_train, X_temp, y_train, y_temp = train_test_split(
    df['short_description'],
    df['category'],
    test_size=0.2,
    random_state=seed,
)

# Second cut: halve the held-out part into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=seed,
)

In [None]:
# Padding and Tokenization

vocab_size = 200000
max_length = 100
trunc_type='post'
padding_type='post'
oov = "<OOV>"

# Fit the vocabulary on the training text only
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size, oov_token=oov)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

# Text -> integer sequences -> fixed-length arrays (pad_sequences returns ndarrays)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,maxlen= max_length,padding=padding_type, truncating=trunc_type)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,maxlen= max_length,padding=padding_type, truncating=trunc_type)

# Targets: use the label encoder fitted on the full category column so every
# split gets the same class-id mapping and the same one-hot width. Building
# dummies per split would silently drop the column of any category missing
# from that split and misalign the remaining ones.
n_categories = len(encoder.classes_)
y_train = to_categorical(encoder.transform(np.asarray(y_train)), num_classes=n_categories)
y_val = to_categorical(encoder.transform(np.asarray(y_val)), num_classes=n_categories)

# Test labels stay as integer class ids for comparison with arg-max predictions
y_test = encoder.transform(np.asarray(y_test))

In [None]:
# Get GloVe embeddings
# Notebook shell commands: download the GloVe 6B archive and unpack it
# quietly (-q) into the working directory.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Setting up GloVe embeddings
glove_path =  './glove.6B.100d.txt'
# Rows in the embedding matrix: vocabulary size plus two spare rows
num_tokens = len(tokenizer.word_index) + 2
embed_dim = 100
embeddings_index = {}

# Each line is "<word> <v1> <v2> ...". Read as UTF-8 explicitly so parsing
# does not depend on the platform's default encoding.
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

# Row i holds the GloVe vector of the word with tokenizer index i; words
# without a pretrained vector keep an all-zero row.
embedding_matrix = np.zeros((num_tokens, embed_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Baselines - LSTM

In [None]:
# Hyperparameters
num_neurons = 128
embed_dim = 100
drop_rate = 0.2
learning_rate = 1e-3
num_epochs = 20
batch_size = 32
num_classes = 41

# Frozen embedding layer initialised from the GloVe matrix; index 0 is masked
glove_embedding = Embedding(
    num_tokens,
    embed_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    mask_zero=True,
    input_shape=[None],
    trainable=False,
)
bi_lstm = keras.layers.Bidirectional(
    keras.layers.LSTM(num_neurons, dropout=drop_rate)
)
classifier = keras.layers.Dense(num_classes, activation="softmax")
model = keras.models.Sequential([glove_embedding, bi_lstm, classifier])

# Training
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    steps_per_epoch=len(X_train) // batch_size,
    validation_data=(X_val, y_val),
    validation_steps=len(X_val) // batch_size,
    epochs=num_epochs,
)

# Evaluation
# Category names ordered by frequency; the DataFrame in this notebook is
# `df` (no `dataset` variable is ever defined).
categories = df['category'].value_counts().index

def prediction(inference_data):
    """Return the predicted class id for each raw text in *inference_data*.

    The texts go through the fitted tokenizer and the same padding settings
    used for training, then the current model's arg-max output is returned
    as a NumPy array.
    """
    sequences = tokenizer.texts_to_sequences(inference_data)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    probabilities = model.predict(padded)
    return tf.argmax(probabilities, axis=1).numpy()

y_test_pred = prediction(X_test)
# Compare against y_test, the integer class ids of the baseline test split.
# y_test_true is a leftover from the transformer section: it holds category
# names for a different split, so it cannot be scored against these ids.
acc = sklearn.metrics.accuracy_score(y_test, y_test_pred)
acc

## Baselines - RNN

In [None]:
vocab_size = 200000
# Must equal the width of embedding_matrix (built from the 100-d GloVe
# file); any other value makes the Constant initializer shape-mismatch.
embed_dim = 100
num_neurons = 64
drop_rate = 0.2
# Defined here so the cell does not rely on a value left over from an
# earlier cell (same rate as the LSTM baseline).
learning_rate = 1e-3
num_classes = 41
batch_size = 32
num_epochs = 20

# Frozen GloVe embedding -> two bidirectional SimpleRNN layers that return
# full sequences -> a final SimpleRNN -> dropout -> softmax
model = keras.models.Sequential([
    Embedding(num_tokens,
              embed_dim,
              embeddings_initializer=keras.initializers.Constant(embedding_matrix),
              mask_zero=True,
              input_shape=[None],
              trainable=False),
    Bidirectional(SimpleRNN(num_neurons, dropout=drop_rate, recurrent_dropout=drop_rate, activation='tanh', return_sequences=True)),
    Bidirectional(SimpleRNN(num_neurons, dropout=drop_rate, recurrent_dropout=drop_rate, activation='tanh', return_sequences=True)),
    SimpleRNN(num_neurons // 2, activation='tanh'),
    Dropout(drop_rate),
    Dense(num_classes, activation='softmax')
])

# Training
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
history = model.fit(X_train,
                    y_train,
                    batch_size=batch_size,
                    steps_per_epoch=len(X_train) // batch_size,
                    validation_data = (X_val, y_val),
                    validation_steps = len(X_val) // batch_size,
                    epochs=num_epochs)

# Evaluation
# Category names ordered by frequency; the DataFrame in this notebook is
# `df` (no `dataset` variable is ever defined).
categories = df['category'].value_counts().index

def prediction(inference_data):
    """Return the predicted class id for each raw text in *inference_data*.

    The texts go through the fitted tokenizer and the same padding settings
    used for training, then the current model's arg-max output is returned
    as a NumPy array.
    """
    sequences = tokenizer.texts_to_sequences(inference_data)
    padded = pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    probabilities = model.predict(padded)
    return tf.argmax(probabilities, axis=1).numpy()

y_test_pred = prediction(X_test)
# Compare against y_test, the integer class ids of the baseline test split.
# y_test_true is a leftover from the transformer section: it holds category
# names for a different split, so it cannot be scored against these ids.
acc = sklearn.metrics.accuracy_score(y_test, y_test_pred)
acc