In [1]:
!pip install nltk



In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from nltk.stem import PorterStemmer
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords



print(tf.__version__)

2.9.1


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# The maximum number of words to be used. (most frequent)
vocab_size = 5000

# Dimension of the dense embedding.
embedding_dim = 128

# Max number of words in each complaint.
max_length = 100

# Truncate and padding options
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [4]:
dataset = pd.read_csv('BBC News Train.csv')

In [5]:
dataset.columns

Index(['Text', 'Category'], dtype='object')

In [6]:
# Lets do some text cleanup
stemmer = PorterStemmer()

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
    text: a string
    return: modified initial string
    """
    # lowercase text
    text = text.lower() 

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    
    # Remove the XXXX values
    text = text.replace('x', '') 
    
    # Remove white space
    text = REMOVE_NUM.sub('', text)

    #  delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text) 

    # delete stopwords from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) 
    
    # removes any words composed of less than 2 or more than 21 letters
    text = ' '.join(word for word in text.split() if (len(word) >= 2 and len(word) <= 21))

    # Stemming the words
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    
    return text

In [7]:
dataset["Text"] = dataset["Text"].apply(clean_text)
dataset["Text"]

0       worldcom eboss launch defenc lawyer defend for...
1       german busi confid slide german busi confid fe...
2       bbc poll indic econom gloom citizen major nati...
3       lifestyl govern mobil choic faster better funk...
4       enron boss payout eighteen former enron direct...
                              ...                        
1485    doubl evict big brother model capric holbi cit...
1486    dj doubl act revamp chart show dj duo jk joel ...
1487    weak dollar hit reuter revenu media group reut...
1488    appl ipod famili epand market appl epand ipod ...
1489    santi worm make unwelcom visit thousand websit...
Name: Text, Length: 1490, dtype: object

In [8]:
# Shuffel the dataset to make sure we get an equal distribution of the data before splitting into train and test sets
dataset = dataset.sample(frac=1)

In [9]:
News = dataset["Text"].values
labels = dataset[["Category"]].values

X_train, X_test,y_train, y_test = train_test_split(News,labels, test_size = 0.20, random_state = 42)
print(X_train.shape,y_train.shape)


(1192,) (1192, 1)


In [10]:
y_test[0]

array(['tech'], dtype=object)

In [11]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
dict(list(word_index.items())[0:10])


Found 15914 unique tokens.


{'<OOV>': 1,
 'said': 2,
 'mr': 3,
 'year': 4,
 'would': 5,
 'also': 6,
 'peopl': 7,
 'new': 8,
 'us': 9,
 'one': 10}

In [12]:
train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_seq = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print('Shape of data tensor:', train_padded.shape)
print('Shape of data tensor:', validation_padded.shape)

Shape of data tensor: (1192, 100)
Shape of data tensor: (298, 100)


In [13]:
encode = OneHotEncoder()

training_labels = encode.fit_transform(y_train)
validation_labels = encode.transform(y_test)

In [14]:
training_labels[19]

<1x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [15]:
training_labels = training_labels.toarray()
validation_labels = validation_labels.toarray()

print(type(training_labels))
print(type(validation_labels))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [32]:
train_padded.shape[1]

100

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=train_padded.shape[1]))

model.add(Conv1D(48, 5, activation='relu', padding='valid'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))

model.add(Flatten())
model.add(Dropout(0.5))

model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 100
batch_size = 32

history = model.fit(train_padded, training_labels, shuffle=True ,
                    epochs=epochs, batch_size=batch_size, 
                    validation_split=0.2,
                    callbacks=[ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001), 
                               EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1),
                               EarlyStopping(monitor='val_accuracy', mode='max', patience=5, verbose=1)])

2022-08-18 13:20:29.488288: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 23: early stopping


In [17]:
# First we create an evaluation function to output all the needs metrics

def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_pred labels
    on a classification.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds, average='micro')
    recall = recall_score(y_true, y_preds, average='micro')
    f1 = f1_score(y_true, y_preds, average='micro')
    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")
    
    return metric_dict

In [18]:
# Now we make predictions using the test data to see how the model performs

predicted = model.predict(validation_padded)
evaluate_preds(np.argmax(validation_labels, axis=1), np.argmax(predicted, axis=1))

Acc: 90.94%
Precision: 0.91
Recall: 0.91
F1 score: 0.91


{'accuracy': 0.91, 'precision': 0.91, 'recall': 0.91, 'f1': 0.91}

In [19]:
def inference(text):
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    pred = model.predict(padded)
  
    predicted_label = encode.inverse_transform(pred)
    
    return  np.argmax(pred[0]), predicted_label[0][0]

In [30]:
X_test[10]

'eastend set us remak plan creat us soap base bbc eastend reportedli drawn fo tv network eastend head writer toni jordan music mogul simon fuller involv project accord report hollywood report trade newspap said script commiss seri commun work class peopl chicago origin eastend pull bbc america last year prove failur rate us version british hit prove less success across atlant bbc comedi coupl remad us cast lost primetim slot nbc network due disappoint rate home eastend face rate battl recent lose rival itv soap emmer dale primetim soap us televis made recent comeback follow success abc serial desper housew seri take darkli comed look goingson group charact live suburb'

In [27]:
inference(X_test[1])



(3, 'sport')

In [28]:
y_test[1]

array(['tech'], dtype=object)

In [23]:
X_test[0]

'appl laptop greatest gadget appl powerbook chosen greatest gadget time us magazin mobil pc laptop chosen one first lightweight portabl comput help defin layout futur notebook pc magazin compil alltim top list gadget includ soni walkman number three zenith remot control two gadget need move part electron warrant inclus magazin staff compil list specifi gadget also need selfcontain apparatu use subset anoth devic gener includ item potenti mobil said magazin end tri get heart realli make gadget gadget conclud oldest gadget top abacu magazin date ad put th place preelectron gadget top includ setant th posit marin chronomet nd posit kodak browni camera th posit tivo person video record newest devic make top also includ first flash mp player diamond multimedia well first success digit camera casio qv mobil phone motorola startac popular gadget moment appl ipod number list first soni transistor radio number soni third entri top cdp cd player forget crystallin hissfre blast madonna like virgi

In [24]:
history

<keras.callbacks.History at 0x7fd47db54950>