# Multi-class News Classification Notebook

This notebook trains a multi-class text classification model on `news_balanced_categories.csv`.
It uses the `headline` and `short_description` columns combined as input, and `category` as the label.

**Features included**
- Data loading & inspection
- Basic text cleaning (fill missing descriptions, strip whitespace, drop empty rows)
- Combining `headline` + `short_description`
- Train / validation split
- Tokenization and padding
- Label encoding
- Keras model (Embedding + Bidirectional LSTM)
- Training, evaluation, and plotting metrics
- Sample predictions

Run this notebook cell by cell, in order from top to bottom. Adjust the hyperparameters (`vocab_size`, `max_len`, `embedding_dim`, `epochs`) for better results.


In [None]:
# Imports and load dataset
import pandas as pd
import numpy as np
import os

# Location of the input CSV. The default is the original sandbox path; set the
# NEWS_CSV_PATH environment variable to point the notebook at a different copy
# without editing this cell.
file_path = os.environ.get("NEWS_CSV_PATH", "/mnt/data/news_balanced_categories.csv")
df = pd.read_csv(file_path)

# inspect
df.head()


In [None]:
# Columns, null counts and class distribution
# Quick overview of the frame: column names, missing values per column,
# and how many rows each category has.
column_names = df.columns.tolist()
print("Columns:", column_names)
print("\nNull counts:", df.isna().sum(), sep="\n")
print("\nClass distribution:", df['category'].value_counts(), sep="\n")


In [None]:
# Prepare text input (combine headline and short_description) and labels
# Missing values become empty strings *before* the string conversion;
# astype(str) on a NaN would otherwise put the literal word "nan" into the text.
df['short_description'] = df['short_description'].fillna('')
headline = df['headline'].fillna('').astype(str)
description = df['short_description'].astype(str)

# Model input: "<headline> . <short_description>"
df['text'] = (headline + " . " + description).str.strip()

# Drop rows with empty text or missing category.
# The " . " separator makes `text` non-empty for every row, so emptiness is
# judged on the two source columns rather than on the combined string.
has_content = (headline.str.strip() != '') | (description.str.strip() != '')
# .copy() detaches the result from the original frame, so re-running this cell
# does not assign into a slice.
df = df[has_content].dropna(subset=['category']).copy()

print("Prepared dataset shape:", df.shape)
df[['text','category']].head()


In [None]:
# Train / validation split and label encoding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Model inputs (combined text) and raw string labels
X = df['text'].values
y = df['category'].values

# Encode category names as integer ids; class_names keeps the names in id order
label_encoder = LabelEncoder()
y_enc = label_encoder.fit(y).transform(y)
class_names = label_encoder.classes_.tolist()
n_classes_found = len(class_names)
print("Classes ({}):".format(n_classes_found), class_names)

# Hold out 20% of the rows for validation, stratified on the encoded label,
# with a fixed random_state so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)

print("Train size:", len(X_train), "Validation size:", len(X_val))


In [None]:
# Tokenization and sequence padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer / padding settings
vocab_size = 20000
oov_token = "<OOV>"
max_len = 120

# The vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word ids
X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val)
)

# Pad and truncate at the end so every row has max_len ids
X_train_pad, X_val_pad = (
    pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    for seq in (X_train_seq, X_val_seq)
)

used_vocab = min(vocab_size, len(tokenizer.word_index) + 1)
first_seq_len = len(X_train_seq[0])
print("Vocabulary size (used):", used_vocab)
print("Example sequence length:", first_seq_len, "padded shape:", X_train_pad.shape)


In [None]:
# Build Keras model (Embedding + Bidirectional LSTM)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

embedding_dim = 100
num_classes = len(class_names)

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable (same seed as the train/validation split).
tf.keras.utils.set_random_seed(42)

# The input shape is declared with an explicit Input layer instead of
# Embedding(input_length=...): that argument is deprecated and ignored by
# Keras 3, which would leave the model unbuilt when summary() is called.
model = Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),              # padded word-id sequences
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),    # word id -> dense vector
    Bidirectional(LSTM(128, return_sequences=False)),             # one summary vector per text
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')                      # one probability per category
])

# Labels are integer ids (not one-hot), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [None]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Training schedule
batch_size = 64
epochs = 6

# Both callbacks watch val_loss: early stopping uses patience=2 and restores
# the best weights, the checkpoint writes only the best model to best_model.h5.
stop_early = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
save_best = ModelCheckpoint("best_model.h5", save_best_only=True, monitor='val_loss')
callbacks = [stop_early, save_best]

history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_pad, y_val),
    callbacks=callbacks,
)


In [None]:
# Plot training history (accuracy and loss)
import matplotlib.pyplot as plt

# One figure per metric, drawn by the same code path.
# Each entry: (train key, validation key, train label, validation label, title and y-axis label)
curves = [
    ('accuracy', 'val_accuracy', 'train_acc', 'val_acc', 'Accuracy'),
    ('loss', 'val_loss', 'train_loss', 'val_loss', 'Loss'),
]

for train_key, val_key, train_label, val_label, title in curves:
    plt.figure()
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(title)
    plt.legend()
    plt.show()


In [None]:
# Evaluate and show classification report
from sklearn.metrics import classification_report, accuracy_score

# Per-class probabilities for the validation rows, reduced to the
# highest-probability class id for each row
y_pred_probs = model.predict(X_val_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

val_accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", val_accuracy)

# Per-class precision / recall / F1, labelled with the category names
report = classification_report(y_val, y_pred, target_names=class_names)
print("\nClassification Report:", report, sep="\n")


In [None]:
# Save tokenizer and label encoder for later use
import pickle

# Persist the fitted preprocessing objects, one pickle file each
artifacts = (
    ("tokenizer.pkl", tokenizer),
    ("label_encoder.pkl", label_encoder),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Saved tokenizer.pkl and label_encoder.pkl")


In [None]:
# Sample prediction function
def predict_text(texts, top_k=1):
    """Classify raw texts with the trained model.

    Uses the notebook-level `tokenizer`, `max_len`, `model` and `label_encoder`.

    Args:
        texts: An iterable of strings, or a single string (treated as one text
            rather than being split into characters).
        top_k: How many of the most probable categories to report per text.
            Values above the number of classes are capped.

    Returns:
        A list with one tuple per input text.
        With top_k == 1: (text, label, confidence).
        With top_k > 1: (text, labels, confidences), where both are lists
        ordered from most to least probable.
        An empty input yields an empty list.

    Raises:
        ValueError: If top_k is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be >= 1")

    # A bare string is one document; iterating it would tokenise each character.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to predict; avoid calling the model on an empty batch.
        return []

    seq = tokenizer.texts_to_sequences(texts)
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    probs = model.predict(pad)

    if top_k == 1:
        preds = probs.argmax(axis=1)
        labels = label_encoder.inverse_transform(preds)
        return list(zip(texts, labels, probs.max(axis=1)))

    # Class ids sorted by descending probability, truncated to the k best per row.
    k = min(top_k, probs.shape[1])
    top_ids = probs.argsort(axis=1)[:, ::-1][:, :k]
    results = []
    for text, ids, row in zip(texts, top_ids, probs):
        results.append((text, list(label_encoder.inverse_transform(ids)), list(row[ids])))
    return results

# Classify the first eight validation texts and print one block per result
# (text truncated to its first 200 characters)
samples = X_val[:8].tolist()
preds = predict_text(samples)
for (txt, lbl, conf) in preds:
    message = f"Label: {lbl} (conf={conf:.3f})\nText: {txt[:200]}\n---\n"
    print(message)


**Next steps**

1. If your classes are imbalanced, consider class weights or oversampling.
2. Try experimenting with `vocab_size`, `max_len`, `embedding_dim`, and model architecture (CNN, transformers).
3. For better performance, try transfer learning with pretrained embeddings (GloVe) or transformer models (BERT).
4. If a GPU is available, increase `batch_size` and `epochs` for more stable training.
