# Implementing LSTM using TensorFlow

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load the headline dataset from the local JSON file into a DataFrame.
# `pd` comes from the notebook's top import cell, so this cell carries no
# import of its own.
data = pd.read_json('./xl.json')

# Preview the first rows as a quick sanity check of the load.
data.head()

In [None]:
# Pull the 'headline' and 'labels' columns out of the DataFrame as plain
# Python lists, one list per column.
headlines, labels = (list(data[column]) for column in ('headline', 'labels'))

In [None]:
# Settings shared by the headline preprocessing and model cells.

# Vocabulary / embedding
vocab_size = 10000        # used as Tokenizer(num_words=...) and as the Embedding input size
emb_size = 64             # Embedding output dimension
oov_tok = '<oov>'         # passed to Tokenizer(oov_token=...)

# Sequence shaping (all forwarded to pad_sequences)
max_length = 120          # maxlen=
padding_type = 'post'     # padding=
trunc_type = 'post'       # truncating=

# Number of leading examples used for training; the remainder is held out.
training_size = 20000

In [None]:
# Positional split: the first `training_size` examples are used for training,
# everything after that index is held out for validation.
training_sentences, testing_sentences = headlines[:training_size], headlines[training_size:]
training_labels, testing_labels = labels[:training_size], labels[training_size:]

In [None]:
# Build the vocabulary from the training sentences only; the held-out
# sentences are encoded with that same tokenizer further down.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode the training sentences as integer sequences and pad/truncate them
# to `max_length`.
# (Fixed: the original passed the misspelled name `training_senquences`,
# which raised NameError.)
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Same encoding and padding for the held-out sentences.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)


In [None]:
# Convert the padded inputs and the label lists to NumPy arrays before they
# are handed to model.fit. `np` is already imported in the notebook's first
# cell, so the duplicate import that used to live here has been removed.
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

# Define the LSTM model with layers


In [None]:
# Binary classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
# (Fixed: the list elements were missing their separating commas, which made
# the cell a SyntaxError.)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, emb_size, input_length=max_length),
    # return_sequences=True so the next recurrent layer receives one output
    # per timestep rather than only the final state.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the LSTM model; the held-out split is passed as validation data.
num_epochs = 10
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)


# Using CNNs to improve the LSTM model

In [None]:
# New model: embedding -> 1D convolution -> global average pooling -> dense head.
# This rebinds `model`, replacing the model built in the previous section.
# (Fixed: Embedding and Conv1D live under tf.keras.layers, not tf.keras, and
# the list elements were missing their separating commas.)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, emb_size, input_length=max_length),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the convolutional model on the same data; this rebinds `history`.
num_epochs = 10
history = model.fit(
    x=training_padded,
    y=training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

# Challenge 

In [3]:
# Fetch the Yelp polarity reviews dataset through TensorFlow Datasets.
# The result is indexed by split name ('train' / 'test') in the next cell and
# iterated as (sentence, label) pairs. Note that this rebinds `data`.
data, info = tfds.load(
    'yelp_polarity_reviews',
    as_supervised=True,
    with_info=True,
)

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Denylson\tensorflow_datasets\yelp_polarity_reviews\0.2.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling yelp_polarity_reviews-train.tfrecord...:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling yelp_polarity_reviews-test.tfrecord...:   0%|          | 0/38000 [00:00<?, ? examples/s]

[1mDataset yelp_polarity_reviews downloaded and prepared to C:\Users\Denylson\tensorflow_datasets\yelp_polarity_reviews\0.2.0. Subsequent calls will reuse this data.[0m


In [4]:
# Handles to the two dataset splits.
train_data = data['train']
test_data = data['test']

# Empty containers for the decoded review text and the labels of each split.
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

In [5]:
def _dataset_to_lists(dataset):
    """Materialise an iterable of (text, label) pairs into Python/NumPy containers.

    Args:
        dataset: iterable yielding ``(sent, label)`` pairs where both items
            expose ``.numpy()`` and ``sent.numpy()`` is UTF-8 encoded bytes.

    Returns:
        A ``(sentences, labels)`` tuple: ``sentences`` is a list of decoded
        ``str`` values and ``labels`` is a NumPy array of the label values,
        both in iteration order.
    """
    sentences, labels = [], []
    for sent, label in dataset:
        # bytes.decode already returns str, so no extra str() wrapper is needed.
        sentences.append(sent.numpy().decode('utf8'))
        labels.append(label.numpy())
    return sentences, np.array(labels)


# Fresh containers are built on every run, so re-executing this cell neither
# duplicates examples nor tries to .append() to the already-converted label arrays.
train_sentences, train_labels = _dataset_to_lists(train_data)
test_sentences, test_labels = _dataset_to_lists(test_data)

In [6]:
# Settings for the Yelp review model; these rebind the names used earlier
# in the notebook.

# Vocabulary / embedding
vocab_size = 10000        # Tokenizer(num_words=...) and Embedding input size
emb_size = 32             # Embedding output dimension
oov_tok = '<oov>'         # Tokenizer(oov_token=...)

# Sequence shaping (all forwarded to pad_sequences)
max_length = 120          # maxlen=
padding_type = 'post'     # padding=
trunc_type = 'post'       # truncating=

In [8]:
# Fit a new tokenizer on the Yelp training reviews (rebinds `tokenizer` and
# `word_index`).
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Padding/truncation settings shared by both splits.
_pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Encode each split as integer sequences with the tokenizer fitted above.
train_seq = tokenizer.texts_to_sequences(train_sentences)
test_seq = tokenizer.texts_to_sequences(test_sentences)

# Pad/truncate every sequence to `max_length`.
train_padded = pad_sequences(train_seq, **_pad_options)
test_padded = pad_sequences(test_seq, **_pad_options)

In [11]:
# Build the Yelp classifier layer by layer: embedding, two stacked
# bidirectional LSTMs, then a dense head. No convolutional layer is used here.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, emb_size, input_length=max_length))
# return_sequences=True keeps one output per timestep for the next LSTM.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 32)           320000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 120, 128)          49664     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 412,993
Trainable params: 412,993
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train on the Yelp reviews; the test split is passed as validation data.
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=train_labels,
    validation_data=(test_padded, test_labels),
    epochs=num_epochs,
    verbose=2,
)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
 def plot_graphs(history, metrics):
     """Plot one training metric and its validation counterpart.

     Args:
         history: object whose ``history`` attribute maps metric names to
             sequences of values (e.g. what ``model.fit`` returns).
         metrics: name of the metric to plot, such as ``'accuracy'`` or
             ``'loss'``; the validation curve is read from ``'val_' + metrics``.

     Raises:
         KeyError: if ``metrics`` or ``'val_' + metrics`` is missing from
             ``history.history``.
     """
     # NOTE(review): `plt` is not imported in any visible cell of this notebook;
     # this function needs `import matplotlib.pyplot as plt` to have run first.
     plt.plot(history.history[metrics])
     plt.plot(history.history['val_' + metrics])
     # Fixed: pyplot's functions are xlabel/ylabel; the original called the
     # non-existent plt.xlabels / plt.ylabels.
     plt.xlabel("epochs")
     plt.ylabel(metrics)
     plt.legend([metrics, 'val_' + metrics])
     plt.show()
# Draw the accuracy curves first, then the loss curves, from the most recent
# `history` object.
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)