# 📰 BBC News Text Classification with BiLSTM

This notebook builds a text classification model to predict the category of BBC news articles using deep learning.  
Key steps include:
- Data preprocessing
- Text cleaning and stemming
- GloVe word embedding
- Tokenization and padding
- BiLSTM model training
- Evaluation and prediction

In [None]:
import numpy as np
import pandas as pd
import re

## 📥 Load Dataset

In [None]:
# Read the training and test splits of the BBC news dataset from the Kaggle
# input directory into DataFrames.
train = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Train.csv")
test = pd.read_csv("/kaggle/input/learn-ai-bbc/BBC News Test.csv")

In [None]:
# Preview the first rows of the training frame to check the columns loaded as expected.
train.head()

### 📊 Dataset Info

In [None]:
# Print column names, dtypes and non-null counts for the training split.
train.info()

In [None]:
# Print column names, dtypes and non-null counts for the test split.
test.info()

### 📈 Category Distribution

In [None]:
# Count how many training articles fall into each category (class balance check).
train['Category'].value_counts()

## 🧹 Text Cleaning

In [None]:
import html
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# Patterns are compiled once here rather than being looked up on every call.
_URL_MENTION_HASHTAG_RE = re.compile(
    r"http\S+|www\S+|https\S+|@\S+|#\S+", flags=re.MULTILINE
)
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")

# Filled on first use so that the NLTK stop-word corpus is read from disk only
# once instead of once per article.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the cached ``(stop_words, tokenizer)`` pair, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _CLEAN_TEXT_RESOURCES["tokenizer"] = TweetTokenizer()
    return _CLEAN_TEXT_RESOURCES["stop_words"], _CLEAN_TEXT_RESOURCES["tokenizer"]


def clean_text(text):
    """Normalise one raw article into a space-joined string of lowercase words.

    Steps, in order: unescape HTML entities; drop URLs, @mentions and
    #hashtags; delete every character that is neither an ASCII letter nor
    whitespace; collapse whitespace runs; lowercase; tokenize; remove English
    stop words.

    Args:
        text: The raw article text. ``None`` and NaN (what pandas yields for
            empty cells) are treated as empty text.

    Returns:
        The cleaned text as a single string with tokens separated by one space.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = html.unescape(text)
    text = _URL_MENTION_HASHTAG_RE.sub("", text)
    text = _NON_LETTER_RE.sub("", text)
    text = _WHITESPACE_RUN_RE.sub(" ", text).strip()
    text = text.lower()

    stop_words, tknzr = _clean_text_resources()
    tokens = [word for word in tknzr.tokenize(text) if word not in stop_words]
    return " ".join(tokens).strip()

In [None]:
# Overwrite the raw "Text" column of both splits with its cleaned version.
# The replacement is in place, so re-running this cell cleans already-cleaned text.
train["Text"] = train["Text"].apply(clean_text)
test["Text"] = test["Text"].apply(clean_text)

## 🔁 Stemming

In [None]:
from nltk.stem import PorterStemmer

# Single Porter stemmer instance shared by every stem_text call.
stemmer = PorterStemmer()

def stem_text(text):
    """Return *text* with every token replaced by its Porter stem.

    The text is split with NLTK's TweetTokenizer, each token is passed through
    the module-level ``stemmer``, and the stems are joined back together with
    single spaces.
    """
    words = TweetTokenizer().tokenize(text)
    return " ".join(stemmer.stem(w) for w in words)

In [None]:
# Replace the cleaned "Text" column of both splits with its stemmed version.
# From here on the column holds space-joined stems, not the original words.
train["Text"] = train["Text"].apply(stem_text)
test["Text"] = test["Text"].apply(stem_text)

## 🔤 Encode Labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the category names to integer class ids. The fitted encoder is kept so
# predictions can be turned back into names with inverse_transform later.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['Category'].values)

## 📚 Load GloVe Embeddings

In [None]:
def load_glove(embedding_path):
    """Load whitespace-separated word vectors from a GloVe text file.

    Each non-empty line is expected to hold a word followed by its vector
    components. Blank or whitespace-only lines (for example a trailing empty
    line at the end of the file) are skipped instead of raising IndexError.

    Args:
        embedding_path: Path to the UTF-8 encoded embeddings file.

    Returns:
        A dict mapping each word to a 1-D ``float32`` NumPy array.
    """
    embeddings = {}
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

In [None]:
# Location of the pre-trained GloVe vectors on Kaggle. The file name indicates
# 100-dimensional vectors, which is the size the rest of the notebook uses.
glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'
# Word -> vector lookup table used by the cells below.
glove_embeddings = load_glove(glove_path)

## 🔁 Average Word Embedding for Initial Vector Representation

In [None]:
def text_to_vector(tokens, dim=100, embeddings=None):
    """Average the embedding vectors of *tokens* into one fixed-size vector.

    Args:
        tokens: An iterable of words, or a single string. A string is split on
            whitespace first; iterating it directly would look up individual
            characters instead of words.
        dim: Dimensionality of the vectors. Used for the zero vector that
            stands in for out-of-vocabulary words and for empty input.
        embeddings: Mapping from word to vector. Defaults to the module-level
            ``glove_embeddings`` table.

    Returns:
        A 1-D array of length ``dim``: the mean of the word vectors (unknown
        words count as zero vectors), or all zeros when there are no tokens.
    """
    if embeddings is None:
        embeddings = glove_embeddings
    if isinstance(tokens, str):
        tokens = tokens.split()

    unknown = np.zeros(dim)
    vectors = [embeddings.get(word, unknown) for word in tokens]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

In [None]:
# Each "Text" entry is one space-joined string, so split it into words before
# averaging; iterating the string itself would look up single characters.
X_train = np.array([text_to_vector(text.split()) for text in train['Text']])
X_test = np.array([text_to_vector(text.split()) for text in test['Text']])

## 🧾 Tokenization & Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['Text'])

# One extra slot on top of the learned words: Keras' Tokenizer does not assign
# index 0 to any word, so the embedding table needs len(word_index) + 1 rows.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every article is represented by exactly this many word ids.
max_sequence_length = 150

# Convert each text to its list of word ids, then pad or truncate to the fixed
# length. NOTE(review): pad_sequences defaults to padding and truncating at the
# front, so long articles keep their last 150 tokens - confirm this is intended.
X_train_seq = pad_sequences(
    tokenizer.texts_to_sequences(train['Text']),
    maxlen=max_sequence_length,
)
X_test_seq = pad_sequences(
    tokenizer.texts_to_sequences(test['Text']),
    maxlen=max_sequence_length,
)

## 💾 Embedding Matrix

In [None]:
embedding_dim = 100

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Words without a GloVe entry keep their all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for token, row in tokenizer.word_index.items():
    glove_vector = glove_embeddings.get(token)
    if glove_vector is not None:
        embedding_matrix[row] = glove_vector

## 🧠 Build BiLSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, LSTM, GlobalAveragePooling1D, Bidirectional

# Assemble the classifier one layer at a time.
model = Sequential()
model.add(Input((max_sequence_length,)))
# Frozen embedding layer initialised from the GloVe-based matrix.
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )
)
# The BiLSTM emits one output per time step, which is then averaged over time.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(64, activation='relu'))
# One softmax output per category known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Sparse cross-entropy because y_train holds integer class ids, not one-hot rows.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

## 🏋️ Train Model

In [None]:
# Train for 10 epochs, letting Keras set aside 20% of the supplied arrays as validation data.
model.fit(X_train_seq, y_train, epochs=10, validation_split=0.2)

## 🧪 Evaluate Model (on the Training Data)

In [None]:
# Predicted class id for each training article: the index of the largest
# softmax probability in that article's row.
y_pred = np.argmax(model.predict(X_train_seq), axis=1)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Compare the predictions against the true training labels.
# NOTE(review): y_pred is produced from X_train_seq, the same array passed to
# model.fit, so these are training-data scores rather than held-out performance.
accuracy = accuracy_score(y_train, y_pred)
# "weighted" averages the per-class F1 scores by each class's support.
f1 = f1_score(y_train, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)

## 📤 Predict on Test Set

In [None]:
# Predict class probabilities for the test articles, take the most likely
# class id per row, and map the ids back to category names.
y_pred_test = label_encoder.inverse_transform(
    np.argmax(model.predict(X_test_seq), axis=1)
)

## 💾 Save Submission

In [None]:
# Pair each test article id with its predicted category and write the result
# to submission.csv without the DataFrame index.
final_data = dict(ArticleId=test["ArticleId"], Category=y_pred_test)
submission = pd.DataFrame(final_data)
submission.to_csv('submission.csv', index=False)