In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import spacy
from google.colab import drive

drive.mount('/content/drive')
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
df = pd.read_csv('/content/drive/MyDrive/Notebooks/bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [3]:
import re

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+','',text)
    text = re.sub(r'[^a-zA-Z0-9\s]','', text)
    text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(text)

    cleaned_text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    res = " ".join(cleaned_text)
    return res

In [4]:
df['clean_text'] = df['text'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom boss leave book worldcom boss bernie ...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester rush make ...
3,sport,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership newcas...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s raid box office ocean s crime caper se...


In [6]:
X = df['clean_text']
y = df['category']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [9]:
vocab_size = 5000
embedding_dim = 64
max_length = 200
oov_tok = '<OOV>' #  Out of Vocabulary

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size,oov_token = oov_tok)
tokenizer.fit_on_texts(X_train)

In [11]:
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_test_tok = tokenizer.texts_to_sequences(X_test)

In [12]:
X_train_pad = pad_sequences(X_train_tok, maxlen = max_length)
X_test_pad = pad_sequences(X_test_tok, maxlen = max_length)

In [13]:
import tensorflow as tf

model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      tf.keras.layers.Dropout(0.2),

      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)), # walk the RNN from left to right and right to left
      tf.keras.layers.Dropout(0.2),

      tf.keras.layers.Dense(embedding_dim, activation='relu'),
      tf.keras.layers.Dense(5, activation='softmax')
    ])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.fit(X_train_pad, y_train, epochs=20, validation_data=(X_test_pad, y_test))

Epoch 1/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 236ms/step - accuracy: 0.2635 - loss: 1.5734 - val_accuracy: 0.4404 - val_loss: 1.2216
Epoch 2/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 224ms/step - accuracy: 0.4842 - loss: 1.0967 - val_accuracy: 0.6562 - val_loss: 0.8541
Epoch 3/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 232ms/step - accuracy: 0.7004 - loss: 0.7181 - val_accuracy: 0.5685 - val_loss: 0.8715
Epoch 4/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 210ms/step - accuracy: 0.6804 - loss: 0.6771 - val_accuracy: 0.6854 - val_loss: 0.7378
Epoch 5/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 229ms/step - accuracy: 0.7836 - loss: 0.5300 - val_accuracy: 0.7888 - val_loss: 0.5898
Epoch 6/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 193ms/step - accuracy: 0.8945 - loss: 0.2746 - val_accuracy: 0.8629 - val_loss: 0.4540
Epoch 7/20
[1m56/56[

<keras.src.callbacks.history.History at 0x7f4c08414340>

In [15]:
# get the class for every unique lable
for i,lable in enumerate(le.classes_):
  print(i,lable)

0 business
1 entertainment
2 politics
3 sport
4 tech


In [17]:
txt = ["blair prepares to name poll date tony blair is likely to name 5 may as election day when parliament returns from its easter break  the bbc s political editor has learned.  andrew marr says mr blair will ask the queen on 4 or 5 april to dissolve parliament at the end of that week. mr blair has so far resisted calls for him to name the day but all parties have stepped up campaigning recently. downing street would not be drawn on the claim  saying election timing was a matter for the prime minister.  a number 10 spokeswoman would only say:  he will announce an election when he wants to announce an election.  the move will signal a frantic week at westminster as the government is likely to try to get key legislation through parliament. the government needs its finance bill  covering the budget plans  to be passed before the commons closes for business at the end of the session on 7 april.  but it will also seek to push through its serious and organised crime bill and id cards bill. mr marr said on wednesday s today programme:  there s almost nobody at a senior level inside the government or in parliament itself who doesn t expect the election to be called on 4 or 5 april.  as soon as the commons is back after the short easter recess  tony blair whips up to the palace  asks the queen to dissolve parliament ... and we re going.  the labour government officially has until june 2006 to hold general election  but in recent years governments have favoured four-year terms."]

seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_length)
pred = model.predict(padded)
labels = ['bussiness', 'entertainment', 'politics', 'sport', 'tech']

print(pred)
print(np.argmax(pred))
print(labels[np.argmax(pred)])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[[4.0879759e-06 2.5460112e-04 9.9960679e-01 8.5843340e-06 1.2595943e-04]]
2
politics
