# 📰 BBC News Text Classification with BiLSTM

This notebook builds a text classification model to predict the category of BBC news articles using deep learning.  
Key steps include:
- Data preprocessing
- Text cleaning and stemming
- GloVe word embedding
- Tokenization and padding
- BiLSTM model training
- Evaluation and prediction

In [1]:
import numpy as np
import pandas as pd
import re

## 📥 Load Dataset

In [2]:
# Read the competition's train and test CSV files from the Kaggle input directory.
# Per the .info() outputs recorded in this notebook, only the train file has a `Category` column.
train = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Train.csv")
test = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Test.csv")

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


### 📊 Dataset Info

In [4]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [5]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  735 non-null    int64 
 1   Text       735 non-null    object
dtypes: int64(1), object(1)
memory usage: 11.6+ KB


### 📈 Category Distribution

In [6]:
# Number of training articles per category (checks class balance).
train['Category'].value_counts()

Category
sport            346
business         336
politics         274
entertainment    273
tech             261
Name: count, dtype: int64

## 🧹 Text Cleaning

In [7]:
import html
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# One-element cache holding (tokenizer, stop_words); filled on first use so the
# stop-word corpus is read once instead of once per article.
_CLEAN_TEXT_RESOURCES = []


def _get_clean_text_resources():
    """Return the shared (TweetTokenizer, stop-word set) pair, building it on first call."""
    if not _CLEAN_TEXT_RESOURCES:
        # Build both pieces before storing, so a failure (e.g. the stop-word
        # corpus is not available) never leaves a half-initialised cache behind.
        stop_words = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES.append((TweetTokenizer(), stop_words))
    return _CLEAN_TEXT_RESOURCES[0]


def clean_text(text):
    """Normalise one raw article string for downstream tokenization.

    Steps, in order:
      1. Decode HTML entities.
      2. Delete URL-like tokens (http..., www...), @mentions and #hashtags.
      3. Delete every character that is not an ASCII letter or whitespace.
         Characters are removed, not replaced by a space, so "ex-boss"
         becomes "exboss".
      4. Collapse whitespace runs to a single space and lowercase.
      5. Tokenize and drop English stop words.

    Args:
        text: The raw text (str).

    Returns:
        The remaining tokens joined by single spaces (str); may be empty.
    """
    tknzr, stop_words = _get_clean_text_resources()
    text = html.unescape(text)
    text = re.sub(r"http\S+|www\S+|https\S+|@\S+|#\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.lower()
    tokens = [word for word in tknzr.tokenize(text) if word not in stop_words]
    return " ".join(tokens).strip()

In [8]:
# Overwrite the Text column of both frames with the cleaned text.
# The raw article text is not kept, so later cells only see the cleaned form.
train["Text"] = train["Text"].apply(clean_text)
test["Text"] = test["Text"].apply(clean_text)

## 🔁 Stemming

In [9]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance shared by every stem_text call.
stemmer = PorterStemmer()


def stem_text(text):
    """Return `text` with each token replaced by its Porter stem.

    The string is tokenized with a TweetTokenizer, every token is passed
    through the module-level `stemmer`, and the stems are joined back
    together with single spaces.
    """
    pieces = TweetTokenizer().tokenize(text)
    return " ".join(map(stemmer.stem, pieces))

In [10]:
# Overwrite the cleaned Text column of both frames with its Porter-stemmed form.
# NOTE(review): the embedding matrix built further down looks these stemmed tokens
# up in GloVe by exact string. Porter stems are presumably often not real words,
# so many embedding rows may stay all-zero — TODO confirm GloVe coverage of the
# stemmed vocabulary.
train["Text"] = train["Text"].apply(stem_text)
test["Text"] = test["Text"].apply(stem_text)

## 🔤 Encode Labels

In [11]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training categories and convert them to encoded labels.
# `label_encoder` is kept because a later cell uses inverse_transform to turn
# predicted labels back into category names.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['Category'].values)

## 📚 Load GloVe Embeddings

In [12]:
def load_glove(embedding_path):
    """Load a GloVe-style text embedding file into a dictionary.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated float components.

    Args:
        embedding_path: Path of the UTF-8 encoded embedding text file.

    Returns:
        dict mapping each token (str) to a 1-D float32 numpy array.

    Raises:
        ValueError: If a component on a line cannot be parsed as a float.
    """
    embeddings = {}
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Blank or whitespace-only lines (e.g. a stray newline at the end
            # of the file) hold no entry; indexing values[0] on them would
            # raise IndexError, so skip them.
            if not values:
                continue
            embeddings[values[0]] = np.array(values[1:], dtype='float32')
    return embeddings

In [13]:
# Path of the GloVe text file in the Kaggle input directory.
# The file is parsed fully into an in-memory dict of word -> vector.
glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'
glove_embeddings = load_glove(glove_path)

## 🔁 Average Word Embedding for Initial Vector Representation

In [14]:
def text_to_vector(tokens, dim=100, embeddings=None):
    """Average the embedding vectors of the words in one document.

    Args:
        tokens: An iterable of word strings, or a single string. A string is
            split on whitespace first; iterating it directly would look up
            individual characters instead of words.
        dim: Length of the zero vector used for words without an embedding
            and for empty input.
        embeddings: Optional word -> vector mapping. When omitted, the
            module-level `glove_embeddings` is used.

    Returns:
        1-D numpy array holding the element-wise mean of the per-word vectors
        (words without an embedding contribute a zero vector), or a zero
        vector of length `dim` when there are no words.
    """
    if embeddings is None:
        embeddings = glove_embeddings
    if isinstance(tokens, str):
        tokens = tokens.split()
    unknown = np.zeros(dim)
    vectors = [embeddings.get(word, unknown) for word in tokens]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

In [15]:
# Each Text entry is a single space-joined string (that is what clean_text and
# stem_text return), so split it into words before averaging. Iterating the raw
# string would feed single characters to text_to_vector.
# Note: X_train / X_test are not referenced by the later cells visible in this notebook.
X_train = np.array([text_to_vector(text.split()) for text in train['Text']])
X_test = np.array([text_to_vector(text.split()) for text in test['Text']])

## 🧾 Tokenization & Padding

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer id vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['Text'])
# One more than the number of known words, so every id in word_index is a valid
# row of the embedding matrix sized with vocab_size below.
# Presumably ids start at 1 and 0 is left for padding — confirm against the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert each text into a list of integer word ids using the fitted tokenizer.
X_train_seq = tokenizer.texts_to_sequences(train['Text'])
X_test_seq = tokenizer.texts_to_sequences(test['Text'])

# Force every sequence to exactly 150 ids so the model gets fixed-size input.
# NOTE(review): no padding/truncating arguments are passed, so the pad_sequences
# defaults apply — confirm which end of long articles is dropped.
max_sequence_length = 150
X_train_seq = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_seq = pad_sequences(X_test_seq, maxlen=max_sequence_length)

## 💾 Embedding Matrix

In [18]:
embedding_dim = 100
# Row i receives the GloVe vector of the word whose tokenizer id is i.
# Rows for words that have no GloVe entry are left as zeros.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    vector = glove_embeddings.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

## 🧠 Build BiLSTM Model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, GlobalAveragePooling1D, Bidirectional

# Architecture: frozen pre-filled embedding -> bidirectional LSTM returning the
# full sequence -> average over time steps -> dense ReLU layer -> softmax with
# one unit per label class.
model = Sequential()
model.add(Input((max_sequence_length,)))
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )
)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# The sparse categorical loss is used with the label-encoded targets in y_train.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## 🏋️ Train Model

In [20]:
# Train for 10 epochs, holding out 20% of the training rows for validation.
# Per the Keras `fit` docs the validation rows are taken from the END of the
# arrays, before shuffling. Keep the returned History so per-epoch
# loss/accuracy can be inspected later; binding it to a name also stops its
# repr from being echoed as the cell output.
history = model.fit(
    X_train_seq,
    y_train,
    validation_split=0.2,
    epochs=10,
)

Epoch 1/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 211ms/step - accuracy: 0.5684 - loss: 1.1997 - val_accuracy: 0.8691 - val_loss: 0.4153
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 200ms/step - accuracy: 0.8973 - loss: 0.3493 - val_accuracy: 0.8490 - val_loss: 0.4350
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 217ms/step - accuracy: 0.9199 - loss: 0.2635 - val_accuracy: 0.9295 - val_loss: 0.2392
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 202ms/step - accuracy: 0.9414 - loss: 0.1895 - val_accuracy: 0.8993 - val_loss: 0.2888
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 218ms/step - accuracy: 0.9330 - loss: 0.2025 - val_accuracy: 0.8960 - val_loss: 0.3221
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 201ms/step - accuracy: 0.9343 - loss: 0.1873 - val_accuracy: 0.9430 - val_loss: 0.1983
Epoch 7/10
[1m38/38[0m [

<keras.src.callbacks.history.History at 0x7e7b0df3a980>

## 🧪 Evaluate Model (on the Training Data)

In [21]:
# Predict on X_train_seq, the same array that was passed to model.fit,
# then keep the index of the highest-scoring class for each row.
y_pred = model.predict(X_train_seq)
y_pred = np.argmax(y_pred, axis=1)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 78ms/step


In [22]:
from sklearn.metrics import accuracy_score, f1_score

# Metrics over every row of the training frame. Most of these rows were used
# to fit the model, so these numbers overstate how well it generalises.
accuracy = accuracy_score(y_train, y_pred)
f1 = f1_score(y_train, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

# Metrics restricted to the rows the model did not train on: fit() was called
# with validation_split=0.2, which holds out the last 20% of the arrays.
# NOTE(review): 0.2 must stay in sync with the validation_split passed to
# model.fit, and the slice start mirrors the split point documented for Keras —
# confirm if either changes.
val_start = int(len(y_train) * (1 - 0.2))
val_accuracy = accuracy_score(y_train[val_start:], y_pred[val_start:])
val_f1 = f1_score(y_train[val_start:], y_pred[val_start:], average="weighted")

print("Validation Accuracy:", val_accuracy)
print("Validation F1 Score:", val_f1)

Accuracy: 0.9577181208053691
F1 Score: 0.9577583006941214


## 📤 Predict on Test Set

In [23]:
# Predict on the padded test sequences, keep the index of the highest-scoring
# class per article, then map those indices back to category names with the
# fitted label encoder.
y_pred_test = model.predict(X_test_seq)
y_pred_test = np.argmax(y_pred_test, axis=1)
y_pred_test = label_encoder.inverse_transform(y_pred_test)

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step


## 💾 Save Submission

In [24]:
# Pair each test ArticleId with its predicted category name and write the
# result to submission.csv without the DataFrame index column.
final_data = {'ArticleId': test["ArticleId"], 'Category': y_pred_test}
submission = pd.DataFrame(data=final_data)
submission.to_csv('submission.csv', index=False)