In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import spacy
from google.colab import drive

drive.mount('/content/drive')
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
df = pd.read_csv('/content/drive/MyDrive/Notebooks/bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [47]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
sport,511
business,510
politics,417
tech,401
entertainment,386


In [48]:
import re

nlp = spacy.load('en_core_web_sm')


def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(text)

    cleaned_text = [
        token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    res = " ".join(cleaned_text)
    return res

In [49]:
df['clean_text'] = df['text'].apply(clean_text)

In [50]:
df.head()

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom boss leave book worldcom boss bernie ...
2,sport,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester rush make ...
3,sport,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership newcas...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s raid box office ocean s crime caper se...


In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['category'], test_size=0.2, random_state=42)

i will use Embedding for training <br>
so i should convert every text to vector of numbers every number will refer to a word i will use <b>Tokenizer<b>


In [52]:
from tensorflow.keras.preprocessing.text import Tokenizer

oov_tok = "<OOV>"  # if the word is not exist give it 1
vocab_size = 10000
# give an index for every word
tokanizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokanizer.fit_on_texts(X_train)
word_idx = tokanizer.word_index

In [53]:
word_idx

{'<OOV>': 1,
 's': 2,
 'say': 3,
 'mr': 4,
 'year': 5,
 'm': 6,
 'new': 7,
 'people': 8,
 'good': 9,
 'win': 10,
 'game': 11,
 'time': 12,
 't': 13,
 'come': 14,
 'world': 15,
 'play': 16,
 'government': 17,
 'company': 18,
 'film': 19,
 'go': 20,
 'tell': 21,
 'uk': 22,
 'work': 23,
 'take': 24,
 'firm': 25,
 'think': 26,
 'want': 27,
 'add': 28,
 'include': 29,
 'music': 30,
 'service': 31,
 'like': 32,
 '000': 33,
 'month': 34,
 'plan': 35,
 'number': 36,
 'set': 37,
 'market': 38,
 'way': 39,
 'player': 40,
 'week': 41,
 'country': 42,
 'report': 43,
 'need': 44,
 'party': 45,
 'labour': 46,
 'bbc': 47,
 'look': 48,
 'home': 49,
 'big': 50,
 'sale': 51,
 'minister': 52,
 'day': 53,
 'help': 54,
 'million': 55,
 'election': 56,
 'technology': 57,
 'expect': 58,
 'rise': 59,
 'use': 60,
 'second': 61,
 'find': 62,
 'right': 63,
 'get': 64,
 'mobile': 65,
 '2004': 66,
 'england': 67,
 'know': 68,
 'start': 69,
 'group': 70,
 'award': 71,
 'give': 72,
 'see': 73,
 'high': 74,
 'offer':

In [54]:
# convert the text to a vector of numbers the numbers is the word index
X_train_tokanized = tokanizer.texts_to_sequences(X_train)
X_test_tokanized = tokanizer.texts_to_sequences(X_test)

In [55]:
X_train_tokanized[0]

[3272,
 103,
 814,
 400,
 2890,
 3272,
 814,
 539,
 1797,
 6816,
 6817,
 144,
 789,
 569,
 1107,
 1634,
 19,
 90,
 544,
 16,
 203,
 257,
 2998,
 2999,
 2518,
 7716,
 1189,
 1,
 6818,
 1290,
 438,
 3586,
 511,
 469,
 258,
 3272,
 2,
 1189,
 3,
 170,
 1,
 6819,
 6820,
 90,
 201,
 400,
 2,
 1189,
 366,
 511,
 362,
 47,
 374,
 1,
 1399,
 8960,
 346,
 697,
 1008,
 14,
 83,
 486,
 789,
 2,
 1,
 90,
 2891,
 257,
 294,
 436,
 19,
 2297,
 43,
 80,
 1,
 7717,
 3272,
 8961,
 16,
 257,
 1,
 19,
 363,
 6821,
 348,
 1008,
 5559,
 1463,
 6817,
 362,
 3950,
 156,
 88,
 803,
 408,
 94,
 539,
 1,
 725,
 70,
 1125,
 1705,
 19,
 90,
 362,
 6817,
 29,
 6822,
 1876,
 790,
 400,
 2231,
 4169,
 1,
 595,
 1290,
 79,
 90,
 6819,
 6820,
 544,
 73,
 50,
 511,
 4438,
 193,
 595]

In [56]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 200
# padding every record into specific lenght 200 (max_length)
X_train_padded = pad_sequences(X_train_tokanized, maxlen=max_len)
X_test_padded = pad_sequences(X_test_tokanized, maxlen=max_len)

In [57]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation, Embedding, Bidirectional, SimpleRNN
import tensorflow as tf

model = Sequential([
    # num.of unique words in dataset, embedding_dim, len of data -> make every word will have 64 feature
    Embedding(input_dim=vocab_size, output_dim=64,
              input_length=len(X_train_padded)),
    Dropout(0.2),

    SimpleRNN(64, return_sequences=True),
    Dropout(0.2),

    SimpleRNN(64),
    Dropout(0.2),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(5, activation='softmax')
])

In [59]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# sparse_categorical_crossentropy best for lable encoder
# categorical_crossentropy best for one hot encoder

In [60]:
model.fit(X_train_padded, y_train_encoded, epochs=20,
          validation_data=(X_test_padded, y_test_encoded))

Epoch 1/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 128ms/step - accuracy: 0.2279 - loss: 1.6302 - val_accuracy: 0.2472 - val_loss: 1.5910
Epoch 2/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 195ms/step - accuracy: 0.3511 - loss: 1.4570 - val_accuracy: 0.4090 - val_loss: 1.2681
Epoch 3/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 160ms/step - accuracy: 0.5729 - loss: 1.0623 - val_accuracy: 0.4921 - val_loss: 1.2070
Epoch 4/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 178ms/step - accuracy: 0.8530 - loss: 0.4900 - val_accuracy: 0.5685 - val_loss: 1.2685
Epoch 5/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 117ms/step - accuracy: 0.9718 - loss: 0.1365 - val_accuracy: 0.6045 - val_loss: 1.2752
Epoch 6/20
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 143ms/step - accuracy: 0.9921 - loss: 0.0512 - val_accuracy: 0.6067 - val_loss: 1.3300
Epoch 7/20
[1m56/56[0

<keras.src.callbacks.history.History at 0x7c61df254a30>