### Testing former models with test_random data
---------
_How data is collected:_ Until now, we were using uncertainty sampling to label EDUs and use those EDUs as train and test data. This approach is problematic because test data doesn't represent the overall EDUs, a.k.a real world.

Instead, I took 1000 random data points from UNLABELED EDUs, and labeled them.

In [54]:
import labeled_functions
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
from matplotlib import colors

import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.cross_validation import cross_val_score, StratifiedKFold # Difference? (indices=None, or nothing)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

import keras
from keras import regularizers
from keras.models import Model, save_model, load_model
from keras.layers import Dense, LSTM, Input, Embedding, Flatten, Dropout
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence, one_hot
from keras.preprocessing.sequence import pad_sequences

### LR, MNB
labeledfunctions.load_labeled_neutrals() is modified.

In [55]:
X_corpus, y, test_corpus, test_y = labeled_functions.load_labeled_neutrals(path=r"./../../Fall_18/edu/active_learning/")

Labeled Data loaded.
Test Data loaded.


In [56]:
len(test_corpus)

709

In [57]:
token = r"(?u)\b[\w\'/]+\b"
def vectorize(ngram=(1,3), stop=["the","a","of","and","br","to"]):
    return CountVectorizer(token_pattern=token, binary=True, ngram_range=ngram, stop_words=stop)


vectorizer_one = vectorize(stop=["a","of","and","br","to"])
X_vector = vectorizer_one.fit_transform(X_corpus)
test_vector = vectorizer_one.transform(test_corpus)

In [59]:
test_vector.toarray()[3].nonzero()

(array([ 3274, 13731, 14231, 14282, 14318, 27021, 27106, 33526, 33602,
        33604, 36729, 38869, 39346, 62401, 65923, 65928]),)

In [60]:
lr = LogisticRegression()
lr.fit(X_vector,y)
lr.score(test_vector,test_y)

0.7207334273624824

In [61]:
pred = lr.predict(test_vector)
confusion_matrix(test_y, pred, labels=[-1,0,1])

array([[238,  13,   7],
       [ 77,  31,  76],
       [ 20,   5, 242]])

In [62]:
grams=[(1,1),(1,2),(1,3),(1,4)]

for gram in grams:
    print(gram)
    vectorizer = vectorize(ngram=gram, stop=["a","of","and","br","to"])
    X_vector = vectorizer.fit_transform(X_corpus)
    test_vector = vectorizer.transform(test_corpus)
    
    models = [LogisticRegression(), MultinomialNB()]
    
    for model in models:
        model.fit(X_vector, y)
        print(model.score(test_vector, test_y))

(1, 1)
0.7023977433004231
0.7052186177715092
(1, 2)
0.7165021156558533
0.7108603667136812
(1, 3)
0.7207334273624824
0.7080394922425952
(1, 4)
0.7179125528913963
0.7122708039492243


In [8]:
vectorizer=vectorize(ngram=(1,3), stop=["a","of","and","br","to"])
X_vector = vectorizer.fit_transform(X_corpus)
test_vector = vectorizer.transform(test_corpus)


lr.fit(X_vector,y)
print(lr.score(test_vector,test_y))

# Find biggest coefficients.

for i in range(3):
    print(lr.classes_[i])
    

    inds = np.argsort(np.abs(lr.coef_[i]))[::-1]

    print(inds)
    
    for j in inds[:20]:
        print("%s \t %0.2f" %(vectorizer.get_feature_names()[j], lr.coef_[i][j]))
    
    print()

0.7258347978910369
-1
[ 4975  8422 15015 ... 65084 62115 27423]
awful 	 2.34
boring 	 2.24
dull 	 2.23
not bad 	 -2.10
beautiful 	 -1.99
annoying 	 1.77
ridiculous 	 1.73
fails 	 1.72
waste 	 1.71
great 	 -1.71
rare 	 -1.67
fantastic 	 -1.67
excellent 	 -1.64
worst 	 1.62
poorly 	 1.61
pointless 	 1.60
predictable 	 1.55
loved 	 -1.55
terrible 	 1.54
unfunny 	 1.50

0
[19904  8422  7345 ... 47693   924 53926]
friend 	 1.82
boring 	 -1.63
best friend 	 1.41
dull 	 -1.31
beloved 	 1.30
unsurprisingly 	 1.29
surprisingly 	 1.28
him 	 1.22
funeral 	 1.18
guy 	 1.18
rarely 	 1.17
awful 	 -1.17
performance 	 -1.06
noir 	 1.02
it is not 	 1.01
waste 	 -1.00
my 	 -0.99
7/10 	 -0.97
were 	 -0.97
acting 	 -0.97

1
[38912 43252 16733 ... 21820 63409 34075]
not bad 	 2.64
poor 	 -2.40
excellent 	 2.03
7/10 	 2.03
amazing 	 1.90
fascinating 	 1.84
beautiful 	 1.82
8/10 	 1.81
fantastic 	 1.79
great 	 1.78
perfectly 	 1.70
10/10 	 1.65
unfortunately 	 -1.63
awful 	 -1.60
fun 	 1.60
gem 	 1.58
enjoya

## FFNN
With the new data

In [63]:
y_cat, test_y_cat = to_categorical(y,num_classes=3), to_categorical(test_y,num_classes=3)
y[:5], y_cat[:5], test_y_cat[5:10]

(array([ 0, -1,  0, -1,  1]), array([[1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.]], dtype=float32), array([[0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.]], dtype=float32))

In [74]:
# Combine train and test to one-hot-encode. Then split back.
test_len = len(test_corpus)
X_full = np.concatenate((X_corpus, test_corpus), axis=0)
y_full = np.concatenate((y, test_y), axis=0)

# Maximum sequence length
X_sequence = []
for i in X_full:
    X_sequence.append(text_to_word_sequence(i))
max_length = len(max(X_sequence,key=len))

X_corpus_one_hot = []
for i in X_full:
    X_corpus_one_hot.append(one_hot(i, round(max_length*1.1)))

X_corpus_one_hot[1:3]

[[55, 56, 144, 116, 136, 31, 52, 75, 79, 101, 39], [132, 72, 57, 20, 4]]

In [65]:
padded_seq = pad_sequences(X_corpus_one_hot)
X_one_hot, test_one_hot = padded_seq[:-test_len], padded_seq[-test_len:]
len(X_one_hot), len(test_one_hot)

(5232, 709)

In [75]:
# Model setup
input_nodes= Input(shape=(X_one_hot.shape[1],))
e = Embedding(round(max_length*1.1),
              100,
              input_length=X_one_hot.shape[1],
              trainable=True)(input_nodes)
flat= Flatten()(e)
dense1 = Dense(100, activation='tanh', kernel_regularizer=regularizers.l2(0.1))(flat)
# drop = Dropout(0.2)(dense1)
dense2 = Dense(10, activation='tanh')(dense1)
# drop2 = Dropout(0.2)(dense2)

# dense2 = Dense(30, activation='sigmoid')(dense1)

output_nodes=Dense(3, activation='softmax')(dense2)
# output_nodes=Dense(1, activation='sigmoid')(dense1)

#Build model
model = Model(inputs=input_nodes, outputs=output_nodes)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [76]:
model.fit(X_one_hot, y_cat, batch_size=64, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x13106aeb8>

In [77]:
model.evaluate(test_one_hot, test_y_cat)



[1.384461901251789, 0.51198871654415]

In [14]:
model.evaluate(test_one_hot, test_y_cat)



[1.3962477732626541, 0.536028118669882]

[1.3962477732626541, 0.536028118669882]

In [51]:
model_pred = model.predict(test_one_hot)
model_pred_trinary=[]
for i in model_pred:
#     if max(i)
    lab = np.where(i ==max(i))[0][0]

    if lab==0:
        model_pred_trinary.append(-1)
    elif lab==1:
        model_pred_trinary.append(1)
    else:
        model_pred_trinary.append(0)
model_pred_trinary[100:300]

[0,
 0,
 0,
 -1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 -1,
 0,
 0,
 0,
 1,
 -1,
 0,
 -1,
 -1,
 0,
 -1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 -1,
 -1,
 1,
 0,
 1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 -1,
 -1,
 0,
 0,
 1,
 0,
 0,
 1,
 -1,
 1,
 1,
 0,
 -1,
 0,
 -1,
 -1,
 1,
 -1,
 -1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 -1,
 0,
 -1,
 1,
 0,
 -1,
 -1,
 1,
 -1,
 0,
 -1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 -1,
 1,
 0,
 -1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 -1,
 1,
 0,
 1,
 0,
 1,
 0,
 -1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 -1,
 1,
 0,
 1,
 1,
 1,
 1,
 -1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 -1,
 0,
 1,
 0,
 1,
 0,
 0,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 0,
 -1,
 0,
 1,
 1,
 0,
 0,
 -1,
 1,
 0,
 0,
 -1,
 0,
 1,
 1,
 0,
 1,
 0,
 -1,
 0,
 0,
 1,
 0,
 1,
 -1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 0,
 -1,
 0,
 0,
 0,
 0,
 -1,
 0]

In [53]:
confusion_matrix(test_y, model_pred_trinary, labels=[-1,1,0])

array([[ 41,  47, 118],
       [ 32, 129,  51],
       [ 58,  56,  37]])

### Memory Networks

https://arxiv.org/pdf/1410.3916.pdf

I watched the movie. It was awful.

----

Jack is in the kitchen.

Jack went to bathroom.

Where is Jack? -> bathroom

Jack came back. 

Where is Jack? -> kitchen

------

Joe went to the kitchen.

Fred went to the kitchen. 

Joe picked up the milk. 

Joe travelled to the office. 

Joe left the milk. 

Joe went to the bathroom. 

Where is the milk now? A: office

Where is Joe? A: bathroom

Where was Joe before the office? A: kitchen