# BBC News Text Classification with Neural Networks

Multi-class text classification using the BBC News dataset with 5 categories:
- Business (510 articles)
- Entertainment (386 articles)
- Politics (417 articles)
- Sport (511 articles)
- Tech (401 articles)

We will train and compare four neural network architectures: a Dense (feed-forward) network, a CNN, an LSTM, and a Bidirectional LSTM.

In [None]:
# Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import string
import re

## Load BBC News Dataset

The BBC dataset contains 2,225 news articles in 5 categories.

**File format:** Tab-separated (\t) with columns: category, filename, title, content

In [None]:
# Make bbc-news-data.csv available in the working directory.
# In Google Colab this opens the file-upload widget. Outside Colab (local
# Jupyter, plain scripts) the Colab-only module does not exist, so the upload
# step is skipped and the file is expected to be present already.
try:
    from google.colab import files
except ImportError:
    files = None
    uploaded = {}
    print("google.colab not available - expecting bbc-news-data.csv in the working directory")
else:
    uploaded = files.upload()  # Upload bbc-news-data.csv

In [None]:
# Load the tab-delimited dataset into a DataFrame and show a quick preview
df = pd.read_csv('bbc-news-data.csv', delimiter='\t')

column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")
df.head()

## Data Exploration

In [None]:
# Dimensions of the loaded DataFrame as a (rows, columns) tuple
df.shape

In [None]:
# Print a per-column summary of the DataFrame (dtypes and non-null counts)
df.info()

In [None]:
# Absolute and relative frequency of each news category
category_counts = df['category'].value_counts()
category_share = df['category'].value_counts(normalize=True).mul(100).round(2)

print("Class Distribution:")
print(category_counts)
print("\nPercentage:")
print(category_share)

In [None]:
# Bar chart of how many articles fall into each category
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']
articles_per_category = df['category'].value_counts()

plt.figure(figsize=(10, 5))
articles_per_category.plot(kind='bar', color=colors)
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('BBC News Category Distribution')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Assign each category an integer id, in alphabetical order of category name
sorted_categories = sorted(df['category'].unique())
label_mapping = dict(zip(sorted_categories, range(len(sorted_categories))))
reverse_mapping = dict(enumerate(sorted_categories))

print("Label Mapping:")
for idx, label in reverse_mapping.items():
    print(f"  {idx}: {label}")

# Integer target column plus the class count used to size the output layers
df['label'] = df['category'].map(label_mapping)
num_classes = len(label_mapping)
print(f"\nNumber of classes: {num_classes}")

In [None]:
# Print the title and the first 300 characters of the first article per category
banner = '=' * 60
for category in df['category'].unique():
    first_article = df.loc[df['category'] == category].iloc[0]
    title = first_article['title']
    sample = first_article['content'][:300]
    print(f"\n{banner}")
    print(f"{category.upper()}")
    print(f"Title: {title}")
    print(f"{banner}")
    print(f"{sample}...")

## Data Preprocessing

In [None]:
# Text cleaning function
# English stopwords from NLTK, stored as a set so the per-word membership
# test inside clean_text is a fast hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize raw article text into lowercase, space-separated content words.

    Steps, in order: lowercase, replace HTML tags and punctuation with
    spaces, delete digits, collapse whitespace, drop stopwords.

    Tags and punctuation become spaces instead of being deleted so that
    neighbouring words are not fused together ("well-known" -> "well known")
    and contractions split into fragments ("don't" -> "don", "t"). Those
    fragments are entries in NLTK's English stopword list, so they can be
    removed; the fused form "dont" would slip through.

    Args:
        text: Raw text. ``None`` and NaN (pandas' missing value) are treated
            as empty text; any other non-string is converted with ``str()``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Convert to lowercase
    text = str(text).lower()
    # Replace HTML tags with a space so adjacent words stay separate
    text = re.sub(r'<[^>]+>', ' ', text)
    # Replace every punctuation character with a space
    text = text.translate(
        str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    )
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # split() collapses all whitespace runs; stopwords are dropped on rejoin
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Clean every article body, then preview the result next to its metadata
df['clean_text'] = df['content'].map(clean_text)
preview_columns = ['category', 'title', 'clean_text']
df[preview_columns].head()

In [None]:
# Word count of each cleaned article, with summary statistics and a histogram
df['text_length'] = df['clean_text'].str.split().str.len()

length_summary = df['text_length'].describe()
print("Text Length Statistics (words):")
print(length_summary)

mean_length = df['text_length'].mean()

plt.figure(figsize=(10, 4))
plt.hist(df['text_length'], bins=50, edgecolor='black', alpha=0.7)
# Dashed vertical line marks the average article length
plt.axvline(mean_length, color='red', linestyle='--', label=f"Mean: {mean_length:.0f}")
plt.title('Distribution of Text Length (words)')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.legend()
plt.show()

## Modeling

Steps:
1. Split data into train and test (75/25)
2. Vectorize text using CountVectorizer
3. Train neural network models
4. Evaluate performance

In [None]:
# Step 1: Train-test split
X, y = df['clean_text'], df['label']
print(f"X shape: {X.shape}, y shape: {y.shape}")

# stratify=y keeps the category proportions the same in both splits;
# the fixed random_state makes the 75/25 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Per-class article counts in the training split, ordered by label id
print("\nTraining class distribution:")
for idx, count in y_train.value_counts().sort_index().items():
    print(f"  {reverse_mapping[idx]}: {count}")

In [None]:
# Step 2-3: Bag-of-words features.
# The vocabulary (at most 10,000 terms) is learned from the training split
# only; the test split is transformed with that same vocabulary.
vect = CountVectorizer(preprocessor=clean_text, max_features=10000)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

dtm_report = (
    ("Vocabulary size", len(vect.vocabulary_)),
    ("Training DTM shape", X_train_dtm.shape),
    ("Test DTM shape", X_test_dtm.shape),
)
for caption, value in dtm_report:
    print(f"{caption}: {value}")

## Neural Network Setup

**GPU Setup:** Runtime > Change runtime type > T4 GPU

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import pad_sequences, to_categorical

# Report the runtime so version and CPU/GPU differences show up in the output
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {gpu_devices}")

In [None]:
# Turn each cleaned article into a fixed-length sequence of integer word ids.
#
# Embedding / Conv1D / LSTM layers expect ordered token ids. The
# CountVectorizer document-term matrix holds per-word *counts* in vocabulary
# order, so running pad_sequences over it would only truncate columns and
# hand counts to the Embedding as if they were word ids. The text is
# therefore tokenized directly here.
max_len = 5000  # Maximum sequence length; also used as the vocabulary cap

# max_tokens=max_len keeps every id inside [0, max_len), so the ids remain
# valid indices for an Embedding whose input_dim equals the padded width
# (X_train_dense.shape[1]). Id 0 is padding, id 1 is "out of vocabulary".
# standardize=None because the text was already normalized by clean_text.
sequence_vectorizer = keras.layers.TextVectorization(
    max_tokens=max_len,
    standardize=None,
    split='whitespace',
    output_mode='int',
    output_sequence_length=max_len,  # pad / truncate to exactly max_len ids
)

# Learn the vocabulary from the training split only (no test-set leakage)
train_texts = tf.constant(X_train.tolist())
test_texts = tf.constant(X_test.tolist())
sequence_vectorizer.adapt(train_texts)

X_train_dense = sequence_vectorizer(train_texts).numpy().astype('int32')
X_test_dense = sequence_vectorizer(test_texts).numpy().astype('int32')

print(f"X_train_dense shape: {X_train_dense.shape}")
print(f"X_test_dense shape: {X_test_dense.shape}")

In [None]:
# One-hot encode the integer labels; categorical_crossentropy expects one
# column per class.
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes=num_classes)
    for labels in (y_train, y_test)
)

print(f"y_train_cat shape: {y_train_cat.shape}")
print(f"y_test_cat shape: {y_test_cat.shape}")

# Round-trip the first training label back to its category name as a sanity check
first_label = y_train_cat[0]
print(f"\nSample one-hot label: {first_label} -> {reverse_mapping[np.argmax(first_label)]}")

## Model 1: Classic Neural Network (Dense Layers)

In [None]:
# Define the neural network model for multi-class classification.
#
# Embedding.input_dim must be the number of distinct integer ids
# (largest id + 1). It is derived from the ids actually present in the
# train and test arrays rather than from the padded sequence length, which
# is unrelated to vocabulary size.
vocab_size = int(max(X_train_dense.max(), X_test_dense.max())) + 1

model_dense = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
    # Average the embeddings over the sequence axis -> one vector per article
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(num_classes, activation='softmax')  # Softmax for multi-class
])

# Compile with categorical crossentropy for multi-class (one-hot targets)
model_dense.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model_dense.summary()

In [None]:
# Fit the dense model.
# validation_data is the held-out test split, so the per-epoch val_* metrics
# recorded in the history are test-set metrics.
dense_fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_test_dense, y_test_cat),
    verbose=1,
)
history_dense = model_dense.fit(X_train_dense, y_train_cat, **dense_fit_options)

In [None]:
# Plot training history: accuracy on the left panel, loss on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# (train key, validation key, panel title, y-axis label) for each panel
panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)
for ax, (train_key, val_key, panel_title, y_label) in zip(axes, panels):
    ax.plot(history_dense.history[train_key], label='Train')
    ax.plot(history_dense.history[val_key], label='Validation')
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Evaluate Model 1 on the test split
loss, accuracy = model_dense.evaluate(X_test_dense, y_test_cat)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Class probabilities -> index of the most probable class for each article
predictions_dense = model_dense.predict(X_test_dense)
predicted_classes_dense = predictions_dense.argmax(axis=1)

# Per-class precision / recall / F1, with rows named after the categories
rule = "=" * 50
class_names = sorted(label_mapping.keys())
print("\n" + rule)
print("Model 1: Dense Neural Network")
print(rule)
print(classification_report(y_test, predicted_classes_dense, target_names=class_names))

In [None]:
# Confusion matrix heatmap for the dense model (rows: actual, columns: predicted)
import seaborn as sns

class_names = sorted(label_mapping.keys())
cm = confusion_matrix(y_test, predicted_classes_dense)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Dense NN')
plt.tight_layout()
plt.show()

## Model 2: CNN (Convolutional Neural Network)

In [None]:
# CNN Model for text classification.
#
# Embedding.input_dim must be the number of distinct integer ids
# (largest id + 1). It is derived from the ids actually present in the
# train and test arrays rather than from the padded sequence length, which
# is unrelated to vocabulary size.
vocab_size = int(max(X_train_dense.max(), X_test_dense.max())) + 1

model_cnn = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    # 128 filters, each spanning a window of 5 consecutive positions
    keras.layers.Conv1D(128, 5, activation='relu'),
    # Keep each filter's strongest response across the whole sequence
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(num_classes, activation='softmax')
])

model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model_cnn.summary()

In [None]:
# Fit the CNN.
# validation_data is the held-out test split, so the per-epoch val_* metrics
# recorded in the history are test-set metrics.
cnn_fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_test_dense, y_test_cat),
    verbose=1,
)
history_cnn = model_cnn.fit(X_train_dense, y_train_cat, **cnn_fit_options)

In [None]:
# Evaluate CNN: class probabilities -> most probable class per article
predictions_cnn = model_cnn.predict(X_test_dense)
predicted_classes_cnn = predictions_cnn.argmax(axis=1)

rule = "=" * 50
class_names = sorted(label_mapping.keys())
print(rule)
print("Model 2: CNN")
print(rule)
print(classification_report(y_test, predicted_classes_cnn, target_names=class_names))

## Model 3: LSTM (Long Short-Term Memory)

In [None]:
# LSTM Model.
#
# Embedding.input_dim must be the number of distinct integer ids
# (largest id + 1). It is derived from the ids actually present in the
# train and test arrays rather than from the padded sequence length, which
# is unrelated to vocabulary size.
vocab_size = int(max(X_train_dense.max(), X_test_dense.max())) + 1

model_lstm = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    # return_sequences=False: only the final hidden state feeds the classifier
    keras.layers.LSTM(64, return_sequences=False),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model_lstm.summary()

In [None]:
# Fit the LSTM.
# validation_data is the held-out test split, so the per-epoch val_* metrics
# recorded in the history are test-set metrics.
lstm_fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_test_dense, y_test_cat),
    verbose=1,
)
history_lstm = model_lstm.fit(X_train_dense, y_train_cat, **lstm_fit_options)

In [None]:
# Evaluate LSTM: class probabilities -> most probable class per article
predictions_lstm = model_lstm.predict(X_test_dense)
predicted_classes_lstm = predictions_lstm.argmax(axis=1)

rule = "=" * 50
class_names = sorted(label_mapping.keys())
print(rule)
print("Model 3: LSTM")
print(rule)
print(classification_report(y_test, predicted_classes_lstm, target_names=class_names))

## Model 4: Bidirectional LSTM

In [None]:
# Bidirectional LSTM Model.
#
# Embedding.input_dim must be the number of distinct integer ids
# (largest id + 1). It is derived from the ids actually present in the
# train and test arrays rather than from the padded sequence length, which
# is unrelated to vocabulary size.
vocab_size = int(max(X_train_dense.max(), X_test_dense.max())) + 1

model_bilstm = keras.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=128),
    # Wraps the LSTM so the sequence is read in both directions
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax')
])

model_bilstm.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model_bilstm.summary()

In [None]:
# Fit the Bidirectional LSTM.
# validation_data is the held-out test split, so the per-epoch val_* metrics
# recorded in the history are test-set metrics.
bilstm_fit_options = dict(
    epochs=10,
    batch_size=32,
    validation_data=(X_test_dense, y_test_cat),
    verbose=1,
)
history_bilstm = model_bilstm.fit(X_train_dense, y_train_cat, **bilstm_fit_options)

In [None]:
# Evaluate Bidirectional LSTM: class probabilities -> most probable class per article
predictions_bilstm = model_bilstm.predict(X_test_dense)
predicted_classes_bilstm = predictions_bilstm.argmax(axis=1)

rule = "=" * 50
class_names = sorted(label_mapping.keys())
print(rule)
print("Model 4: Bidirectional LSTM")
print(rule)
print(classification_report(y_test, predicted_classes_bilstm, target_names=class_names))

## Model Comparison

In [None]:
# Compare all models by test-set accuracy
predictions_by_model = {
    'Dense NN': predicted_classes_dense,
    'CNN': predicted_classes_cnn,
    'LSTM': predicted_classes_lstm,
    'Bi-LSTM': predicted_classes_bilstm,
}
models = list(predictions_by_model)
predictions_all = list(predictions_by_model.values())
accuracies = [accuracy_score(y_test, pred) for pred in predictions_all]

# Table of results, best model first
comparison_df = pd.DataFrame({'Model': models, 'Accuracy': accuracies})
comparison_df = comparison_df.sort_values('Accuracy', ascending=False)

print("Model Comparison:")
print(comparison_df.to_string(index=False))

# Bar chart in the order the models were trained
colors = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6']
plt.figure(figsize=(10, 5))
bars = plt.bar(models, accuracies, color=colors)
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Write each accuracy just above the top centre of its bar
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.show()

## Key Takeaways

| Model | Best For |
|-------|----------|
| **Dense NN** | Fast training, baseline performance |
| **CNN** | Capturing local patterns in text |
| **LSTM** | Sequential dependencies |
| **Bi-LSTM** | Context from both directions |

### Multi-class Classification Notes:
- Use `softmax` activation (not sigmoid) for output layer
- Use `categorical_crossentropy` loss (not binary)
- Convert labels to one-hot encoding with `to_categorical()`
- Use `np.argmax()` to get predicted classes from probabilities
- Evaluate with macro F1 for balanced assessment across classes

### BBC Dataset Summary:
- **Total articles:** 2,225
- **Categories:** business, entertainment, politics, sport, tech
- **File format:** Tab-separated CSV with columns: category, filename, title, content