In [154]:
import gzip
import tarfile
import pandas as pd
import os
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten
from keras.utils import to_categorical

In [142]:
data_dir = r'C:\Users\David\Documents\code\Mod_5\5-7-rnn\dataset\aclImdb'

def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data, train_labels = read_imdb(data_dir, is_train=True)
test_data, test_labels = read_imdb(data_dir, is_train=False)

# 1. Sentiment analysis

Using the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), we want to do a regression model that predict the ratings are on a 1-10 scale. You have an example train and test set in the `dataset` folder.

### 1.1 Regression Model

Use a feedforward neural network and NLP techniques we've seen up to now to train the best model you can on this dataset

### 1.2 RNN model

Train a RNN to do the sentiment analysis regression. The RNN should consist simply of an embedding layer (to make word IDs into word vectors) a recurrent blocks (GRU or LSTM) feeding into an output layer.

In [143]:
# Let's get both datasets numerized and padded right from the beggining.
tokenize = Tokenizer(lower = True)
tokenize.fit_on_texts(train_data)
X_coded = tokenize.texts_to_sequences(train_data)
X_train = pad_sequences(X_coded, padding='post')
y_train = train_labels

tokenize_ = Tokenizer(lower = True)
tokenize_.fit_on_texts(test_data)
X_coded = tokenize_.texts_to_sequences(test_data)
X_test = pad_sequences(X_coded, padding='post')
y_test = test_labels

In [144]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [146]:
X_train, X_null, y_train, Y_null = train_test_split(X_train, y_train, test_size=0.2) #Just an easy way to kill some data

X_null, X_test, Y_null, y_test = train_test_split(X_train, y_train, test_size=0.2) #Just an easy way to kill some data

In [155]:
#1.1
vocabulary_size = len(tokenize.word_counts.keys())+1
max_words = len(max((X_train), key=len))
embedding_size = 100

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

model.add(Flatten())
model.add(Dense(2, activation='softmax'))

model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14ab42f9d30>

In [156]:
#Now for the test
pred = np.argmax(model.predict(X_test), axis=-1)
y = np.argmax(y_test, axis=-1)
accuracy_score(pred, y)

0.736

In [157]:
#1.2
vocabulary_size = len(tokenize.word_counts.keys())+1
max_words = len(max((X_train), key=len))
embedding_size = 100

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(200))
model.add(Dense(2, activation='softmax'))

model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

model.fit(X_train, y_train, epochs=2, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x14ab5344370>

In [158]:
#Now for the test
pred = np.argmax(model.predict(X_test), axis=-1)
y = np.argmax(y_test, axis=-1)
accuracy_score(pred, y)

0.50032

# 2. (evil) XOR Problem

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:

### 2.1 

Generate a dataset of random <=100,000 binary strings of equal length <= 50. Train the LSTM; what is the maximum length you can train up to with precisison?
    

### 2.2

Generate a dataset of random <=200,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?


In [232]:
#2.1

#I'll be honest, I tried everything right up to 50 columns, my bad NN still hits the same accuracy.
XOs = np.zeros((100000, 30), dtype=float)
labels = np.zeros(100000)

for row in range(100000):
    XOs[row] = np.random.randint(0, 2, 30)
    labels[row] = XOs[row].sum() % 2 # simple math trick that gives the correct label.

X = XOs
y = to_categorical(labels)

In [246]:
model=Sequential()
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(10))
model.add(Dense(2, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

model.fit(X.reshape((len(X), 30, 1)), y, epochs=5, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14ae7ebae80>

In [366]:
#2.2
#This is the setup I started with. Its works at generating random length rows, but the keras model just wouldn't take samples of varying sizes.

XOs = []

for row in range(200000):
    num = np.random.randint(0, 50)
    temp = np.zeros_like(range(num))
    XOs.append(np.zeros_like(range(num)))

labels = np.zeros(200000)

for row in range(200000):
    XOs[row] = np.random.randint(0, 2, len(XOs[row]))
    labels[row] = XOs[row].sum() % 2 # simple math trick that gives the correct label.

X = XOs
y = to_categorical(labels)

In [376]:
#This (technically) fits the question, but it is not random lengths. The integers are assigned at random length, meaning that the array is zero-padded with random lengths to the paddings.

XOs = np.zeros((100000, 50), dtype=float)
labels = np.zeros(100000)

for row in range(100000):
    num = np.random.randint(0, 50)
    for i in range(np.random.randint(0, 50)):
        XOs[row][i] = np.random.randint(0, 2)
    labels[row] = XOs[row].sum() % 2 # simple math trick that gives the correct label.

X = XOs
y = to_categorical(labels)

In [377]:
X = XOs
y = to_categorical(labels)

In [380]:
model=Sequential()
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(10))
model.add(Dense(2, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

model.fit(X.reshape((len(X), 50, 1)), y, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x14b18288850>