In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
from google.colab import files
uploaded = files.upload()

df = pd.read_csv(list(uploaded.keys())[0])
print(df.shape)
df.head()


Saving news_balanced_categories.csv to news_balanced_categories.csv
(52746, 5)


Unnamed: 0,headline,category,short_description,authors,date
0,Kanye West's 'Erratic' Medication Dosage Repor...,Entertainment,The rapper has been open about his experiences...,Cole Delbyck,03-12-2016
1,Regina King Reveals Her Plan To Push Diversity...,Entertainment,The director and Oscar-winning actor discussed...,Ron Dicker,16-07-2021
2,A 13-Year-Old Girl Was Largely Responsible For...,Entertainment,,Lauren Duca,01-06-2015
3,YouTuber Jake Paul Charged With Trespassing In...,Entertainment,The YouTube star was charged with two misdemea...,David Moye,04-06-2020
4,A Woman Proposed To Her Girlfriend During Ryan...,Entertainment,,E. Oliver Whitney,15-03-2015


In [3]:
df['short_description'] = df['short_description'].fillna('')
df['text'] = (df['headline'].astype(str) + ' ' + df['short_description'].astype(str)).str.strip()
df = df.dropna(subset=['category'])
df = df[df['text'].str.strip() != '']

print("Cleaned shape:", df.shape)
print(df['category'].value_counts().head())

Cleaned shape: (52746, 6)
category
Entertainment     8791
Parenting         8791
Politics          8791
Style & Beauty    8791
Travel            8791
Name: count, dtype: int64


In [4]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_text)
df[['clean_text','category']].head()

Unnamed: 0,clean_text,category
0,kanye west erratic medication dosage reportedl...,Entertainment
1,regina king reveals plan push diversity cannes...,Entertainment
2,yearold girl largely responsible starting john...,Entertainment
3,youtuber jake paul charged trespassing looted ...,Entertainment
4,woman proposed girlfriend ryan gosling directo...,Entertainment


In [5]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['category'])
class_names = label_encoder.classes_
print("Classes:", len(class_names), class_names)

Classes: 6 ['Entertainment' 'Parenting' 'Politics' 'Style & Beauty' 'Travel'
 'Wellness']


In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42)

print("Train size:", len(X_train), "Validation size:", len(X_val))

Train size: 42196 Validation size: 10550


In [7]:
vocab_size = 20000
max_len = 120
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post', truncating='post')

print(X_train_pad.shape, X_val_pad.shape)

(42196, 120) (10550, 120)


In [17]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional

embedding_dim = 128
num_classes = len(class_names)

model = Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])


In [18]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy']
)

model.build(input_shape=(None, max_len))
model.summary()

In [19]:
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=6,
    batch_size=64,
    callbacks=[es]
)

Epoch 1/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 780ms/step - accuracy: 0.2668 - loss: 1.6780 - val_accuracy: 0.6803 - val_loss: 0.9279
Epoch 2/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m529s[0m 730ms/step - accuracy: 0.7092 - loss: 0.8211 - val_accuracy: 0.8122 - val_loss: 0.5712
Epoch 3/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 720ms/step - accuracy: 0.8376 - loss: 0.5013 - val_accuracy: 0.8437 - val_loss: 0.4827
Epoch 4/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 731ms/step - accuracy: 0.8880 - loss: 0.3676 - val_accuracy: 0.8555 - val_loss: 0.4465
Epoch 5/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m477s[0m 723ms/step - accuracy: 0.9112 - loss: 0.2951 - val_accuracy: 0.8590 - val_loss: 0.4339
Epoch 6/6
[1m660/660[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m511s[0m 774ms/step - accuracy: 0.9249 - loss: 0.2484 - val_accuracy: 0.8656 - val_loss: 0.4354


In [21]:
y_pred = model.predict(X_val_pad)
y_pred_classes = np.argmax(y_pred, axis=1)

print("Validation Accuracy:", accuracy_score(y_val, y_pred_classes))
print("\nClassification Report:")
print(classification_report(y_val, y_pred_classes, target_names=class_names))
print(confusion_matrix(y_val,y_pred_classes))

[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 87ms/step
Validation Accuracy: 0.858957345971564

Classification Report:
                precision    recall  f1-score   support

 Entertainment       0.82      0.83      0.82      1758
     Parenting       0.87      0.82      0.84      1758
      Politics       0.89      0.88      0.88      1758
Style & Beauty       0.90      0.87      0.88      1759
        Travel       0.87      0.91      0.89      1759
      Wellness       0.82      0.86      0.84      1758

      accuracy                           0.86     10550
     macro avg       0.86      0.86      0.86     10550
  weighted avg       0.86      0.86      0.86     10550

[[1457   52  100   67   49   33]
 [  38 1435   34   48   40  163]
 [ 114   27 1545    3   41   28]
 [  96   31    6 1522   33   71]
 [  45   21   37   22 1593   41]
 [  35   84   22   29   78 1510]]


In [27]:
def predict_text(text):
    seq = tokenizer.texts_to_sequences([clean_text(text)])
    pad = pad_sequences(seq, maxlen=max_len, padding='post')
    pred = model.predict(pad)
    label_idx = np.argmax(pred)
    label = label_encoder.inverse_transform([label_idx])[0]
    conf = pred[0][label_idx]
    return label, conf
sample = "Virat Kohli is watching tv"
pred_label, confidence = predict_text(sample)
print(f"Predicted Category: {pred_label} (Confidence = {confidence:.3f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
Predicted Category: Entertainment (Confidence = 0.783)
