 # Download the BBC text CSV and save it as /tmp/bbc-text.csv.
 # NOTE(review): --no-check-certificate turns off TLS certificate verification
 # for this download; confirm it is really required in the target environment.
 !wget --no-check-certificate \
     https://storage.googleapis.com/laurencemoroney-blog.appspot.com/bbc-text.csv \
     -O /tmp/bbc-text.csv
     
90%

In [62]:
import csv
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalAveragePooling1D, Dense, BatchNormalization 
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import Callback

In [42]:
# --- Tunable settings for tokenisation, padding and the train/validation split ---

# Tokenizer settings
vocab_size = 1_000          # value passed as Tokenizer(num_words=...)
oov_tok = "<OOV>"           # placeholder token for out-of-vocabulary words

# Sequence shaping
max_length = 120            # length every sequence is padded/truncated to
padding_type = "post"
trunc_type = "post"

# Model / data split
embedding_dim = 16          # width of each embedding vector
training_portion = 0.8      # fraction of rows used for training

In [82]:
class myCallback(Callback):
    """Stop training once validation accuracy exceeds a threshold.

    Args:
        threshold: validation accuracy that must be exceeded (strictly) for
            training to be cancelled. Defaults to 0.95, the value that was
            previously hard-coded inside ``on_epoch_end``.
    """

    def __init__(self, threshold=0.95):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check ``val_accuracy`` at the end of each epoch and stop if it is high enough.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras; may be None or may lack
                ``val_accuracy`` (e.g. when fit() is called without
                validation data), in which case nothing happens.
        """
        # Avoid a mutable default argument and tolerate a missing metric
        # instead of raising TypeError on `None > float`.
        val_accuracy = (logs or {}).get('val_accuracy')
        if val_accuracy is not None and val_accuracy > self.threshold:
            # f-string so the threshold value is printed, not the literal "{acc}".
            print(f"\nReached {self.threshold} accuracy so cancelling training!")
            self.model.stop_training = True

In [43]:

# Lower-case English stopwords that the CSV-loading cell strips out of each sentence.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
# Sanity check: the list should contain 153 entries.
print(len(stopwords))
# Expected Output
# 153

153


In [5]:
# Read the BBC CSV into two parallel lists:
#   labels    - column 0 of every data row
#   sentences - column 1 of every data row with stopwords removed
labels = []
sentences = []
# The path matches the download target of the wget cell (-O /tmp/bbc-text.csv);
# the previous '/tmp/bbc_text/...' location is never created by this notebook.
# newline='' is the csv module's recommended mode for files handed to csv.reader.
with open('/tmp/bbc-text.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # discard the first row (presumably the column header)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing with IndexError on row[0].
            continue
        labels.append(row[0])
        sentence = row[1]
        # Remove each stopword only when it is surrounded by spaces, so that
        # substrings of longer words are left untouched.
        for stopword in stopwords:
            token = f" {stopword} "
            sentence = sentence.replace(token, " ")
        sentences.append(sentence)
        

In [24]:
# Word-level tokenizer capped at `vocab_size` words, with `oov_tok` standing in
# for anything outside that vocabulary.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)

In [25]:
# Build the tokenizer's vocabulary from the cleaned sentences.
# NOTE(review): `sentences` holds every row, including those later sliced off
# as the validation set, so the vocabulary is fitted on validation text too.
tokenizer.fit_on_texts(sentences)

In [26]:
# Number of distinct entries in the tokenizer's word index; shown as the cell output.
n_words=len(tokenizer.word_index)
n_words

29714

In [31]:
# Convert every sentence into a list of integer token ids.
seq=tokenizer.texts_to_sequences(sentences)

In [33]:
# Pad/truncate every sequence to `max_length`, honouring the `padding_type` and
# `trunc_type` settings from the config cell instead of a hard-coded 'pre'
# padding and the implicit default truncation side.
padded = pad_sequences(
    seq,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [44]:
# Ensure `padded` is a NumPy array so it can be sliced for the train/validation split.
# NOTE(review): pad_sequences is documented to return a NumPy array already, so
# this conversion is probably redundant - confirm before removing.
padded=np.array(padded)

In [36]:
# Map each distinct label string to an integer class id.
# The unique labels are sorted first: iterating a bare set of strings yields an
# order that can differ between kernel sessions (string hash randomisation),
# which would make the label -> id assignment irreproducible across runs.
label_dict = {label: index for index, label in enumerate(sorted(set(labels)))}

In [45]:
# Translate every label string into its integer class id via `label_dict`.
int_labels = np.array([label_dict.get(label) for label in labels])

In [65]:
# Sequential (unshuffled) split: the first `training_portion` of the rows go to
# training, the remainder to validation.
n = len(int_labels)
split_at = int(n * training_portion)

X_train, X_valid = padded[:split_at], padded[split_at:]
y_train, y_valid = int_labels[:split_at], int_labels[split_at:]

In [145]:
# Classifier: embedding -> mean over the sequence axis -> batch norm -> 5-way softmax.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(BatchNormalization())
model.add(Dense(5, activation='softmax'))

In [146]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 120, 16)           16000     
_________________________________________________________________
global_average_pooling1d_20  (None, 16)                0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
_________________________________________________________________
dense_39 (Dense)             (None, 5)                 85        
Total params: 16,149
Trainable params: 16,117
Non-trainable params: 32
_________________________________________________________________


In [147]:
# Integer class ids are used as targets, hence the sparse categorical loss.
# Adam is configured with clipvalue=1.0.
model.compile(
    optimizer=Adam(clipvalue=1.0),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [148]:
# Train for up to 100 epochs; myCallback can end the run early based on
# validation accuracy.
model.fit(
    X_train,
    y_train,
    epochs=100,
    callbacks=[myCallback()],
    validation_data=(X_valid, y_valid),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Reached {acc} accuracy so cancelling training!


<tensorflow.python.keras.callbacks.History at 0x18d8699c948>

In [149]:
# Call fit() again on the same `model` object for a full 100 epochs, this time
# without the early-stopping callback.
# NOTE(review): the model is not rebuilt between the two fit() calls, so this
# continues from the weights left by the previous cell - confirm that is intended.
model.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x18d86ad6b88>