In [None]:
import csv
import random
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers


# --- Hyper-parameters and preprocessing options used throughout the notebook ---

# Width of each word vector (the notebook loads the 100-dimensional GloVe file).
embedding_dim = 100
# Every tokenized sentence is padded/truncated to this many tokens.
max_length = 16
# Both truncation and padding are applied at the end of the sequence.
trunc_type = padding_type = 'post'
# Placeholder token for out-of-vocabulary words.
oov_tok = "<OOV>"
# Number of examples sampled from the corpus for this experiment.
training_size = 160_000
# Fraction of the sampled examples held out for validation.
test_portion = 0.1

# Filled later with [text, label] pairs.
corpus = list()


In [None]:
# Note that I cleaned the Stanford dataset to remove LATIN1 encoding to make it easier for Python CSV reader
# You can do that yourself with:
# iconv -f LATIN1 -t UTF8 training.1600000.processed.noemoticon.csv -o training_cleaned.csv
# I then hosted it on my site to make it easier to use in this notebook

import os
import requests

path = os.path.join('dataset')
filename = "training_cleaned.csv"
filepath = os.path.join(path, filename)

# Create the target directory if needed (no error when it already exists).
os.makedirs(path, exist_ok=True)

# Download the cleaned CSV only when it is not already cached locally.
if not os.path.exists(filepath):
    url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv"
    # Stream into a temporary ".part" file and rename it only after the transfer
    # succeeded, so an interrupted download is never mistaken for a complete file
    # on the next run.
    partial_path = filepath + ".part"
    with requests.get(url, stream=True, timeout=60) as jfile:
        # Fail loudly on HTTP errors instead of saving an error page as the CSV.
        jfile.raise_for_status()
        with open(partial_path, 'wb') as out_file:
            for chunk in jfile.iter_content(chunk_size=1 << 20):
                out_file.write(chunk)
    os.replace(partial_path, filepath)


In [None]:
import pandas as pd

# The CSV has no header row: column 0 holds the polarity label (0 or 4) and
# column 5 holds the tweet text.
data = pd.read_csv('dataset/training_cleaned.csv', header=None)

# Collapse the label to binary: 0 stays 0, anything else (4 in this file) becomes 1.
# Re-running this cell is harmless because 0/1 map onto themselves.
data[0] = (data[0] != 0).astype(int)

# Build the corpus as a plain Python list of [text, label] pairs.
# A list (rather than a 2-D NumPy array) is required for `random.shuffle` to work
# correctly later on: shuffling a 2-D array with `random.shuffle` swaps row *views*
# and silently duplicates rows.
corpus = data[[5, 0]].values.tolist()
num_sentences = len(corpus)

# with open("dataset/training_cleaned.csv") as csvfile:
#     reader = csv.reader(csvfile)
#     for row in reader:
#       # Your Code here. Create list items where the first item is the text, found in row[5],
#       # and the second is the label. Note that the label is a '0' or a '4' in the text.
#       # When it's the former, make your label to be 0, otherwise 1. Keep a count of the number
#       # of sentences in num_sentences
#         list_item=["",0]
#         # YOUR CODE HERE
#         list_item[0]=row[5]
#         if row[0]:
#           list_item[1]=1
#         else:
#           list_item[1]=0
#         num_sentences = num_sentences + 1
#         corpus.append(list_item)


In [None]:
# Sanity check: row count, corpus length and one sample entry, one per line.
print(num_sentences, len(corpus), corpus[1], sep="\n")

# Expected Output:
# 1600000
# 1600000
# ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!", 0]


In [None]:
# Shuffle the corpus in place before sampling.
# NOTE: `np.random.shuffle` is used instead of `random.shuffle` because `corpus`
# may be a 2-D NumPy array. `random.shuffle` swaps elements with
# `x[i], x[j] = x[j], x[i]`; on a 2-D array both sides are row views, so the swap
# overwrites one row with the other and leaves duplicated rows behind.
# `np.random.shuffle` permutes along the first axis correctly and also accepts lists.
np.random.shuffle(corpus)

# Keep the first `training_size` shuffled examples, split into texts and labels.
sentences = [corpus[i][0] for i in range(training_size)]
labels = [corpus[i][1] for i in range(training_size)]

# Build the vocabulary from the sampled sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

# Convert texts to fixed-length integer sequences.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# The first `split` examples form the validation set, the remainder the training set.
split = int(test_portion * training_size)

test_sequences = np.array(padded[0:split])
training_sequences = np.array(padded[split:training_size])
test_labels = np.array(labels[0:split])
training_labels = np.array(labels[split:training_size])


In [None]:
# Vocabulary size followed by the index assigned to the word "i", one per line.
print(vocab_size, word_index['i'], sep="\n")
# Expected Output
# 138858
# 1


In [None]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier

path = os.path.join('dataset')
filename = "glove.6B.100d.txt"
filepath = os.path.join(path, filename)

# Create the target directory if needed (no error when it already exists).
os.makedirs(path, exist_ok=True)

# Download the GloVe vectors only when they are not already cached locally.
if not os.path.exists(filepath):
    url = "https://resources.oreilly.com/conferences/natural-language-processing-with-deep-learning/-/raw/master/data/glove.6B.100d.txt?inline=false"
    # Announce the download before it starts, not after it has finished.
    print("downloading...")
    # Stream into a temporary ".part" file and rename it only after the transfer
    # succeeded, so the large file is never held in memory at once and an
    # interrupted download is not mistaken for a complete file on the next run.
    partial_path = filepath + ".part"
    with requests.get(url, stream=True, timeout=60) as jfile:
        # Fail loudly on HTTP errors instead of saving an error page as the vectors file.
        jfile.raise_for_status()
        with open(partial_path, 'wb') as out_file:
            for chunk in jfile.iter_content(chunk_size=1 << 20):
                out_file.write(chunk)
    os.replace(partial_path, filepath)

# Parse the text file: each line is a word followed by its vector components.
embeddings_index = {}
with open(filepath, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i holds the vector of the word whose tokenizer index is i. Words without a
# pre-trained vector keep an all-zero row. The extra row (+1) accounts for index 0,
# which word_index never assigns to a word.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector


In [None]:
# Number of rows in the embedding matrix.
print(embeddings_matrix.shape[0])
# Expected Output
# 138859

In [None]:
# Sentiment classifier: frozen pre-trained embeddings -> 1-D convolution ->
# max pooling -> LSTM -> single sigmoid unit for the binary label.
model = tf.keras.Sequential([
    # Embedding initialised from `embeddings_matrix` and kept fixed during training.
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length,
                              weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    # Fixed typo: the layer is `MaxPooling1D` (`MaxPolling1D` raised AttributeError).
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.summary()

num_epochs = 50
# `history` records per-epoch training and validation metrics.
history = model.fit(training_sequences, training_labels, epochs=num_epochs,
                    validation_data=(test_sequences, test_labels), verbose=2)

print("Training Complete")


In [None]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# -----------------------------------------------------------
# Retrieve the per-epoch metrics recorded for the training
# and validation sets
# -----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # One x-axis position per recorded epoch

# ------------------------------------------------
# Plot training and validation accuracy per epoch
# ------------------------------------------------
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'r')
ax_acc.plot(epochs, val_acc, 'b')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel("Epochs")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend(["Accuracy", "Validation Accuracy"])

# ------------------------------------------------
# Plot training and validation loss per epoch
# ------------------------------------------------
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'r')
ax_loss.plot(epochs, val_loss, 'b')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel("Epochs")
ax_loss.set_ylabel("Loss")
ax_loss.legend(["Loss", "Validation Loss"])

# Render both figures. The original ended with an extra `plt.figure()`, which
# produced an empty third figure.
plt.show()


# Expected Output
# A chart where the validation loss does not increase sharply!
