
# RNN using LSTM 
       




<img src="img/RNN-rolled.png"/ width="80px" height="80px">

<img src="img/RNN-unrolled.png"/ width="400px" height="400px">

<img src="img/LSTM3-chain.png"/ width="800px" height="800px">

_source: http://colah.github.io/posts/2015-08-Understanding-LSTMs_

In [1]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence

Using Theano backend.


In [2]:
from sklearn.cross_validation import train_test_split

In [3]:
import os
import pickle
import numpy as np

In [4]:
import pandas as pd

In [5]:
DATA_DIRECTORY = os.path.join('data')
print DATA_DIRECTORY

data


In [33]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
    male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
    female_posts = pickle.load(female_file)

In [54]:
filtered_male_posts = []
filtered_female_posts = []

for post_male in male_posts:
    if len(post_male) == 0:
        continue
    filtered_male_posts.append(post_male)

for post_female in female_posts:
    if len(post_female) == 0:
        continue
    filtered_female_posts.append(post_female)

In [55]:
# text processing - one hot builds index of the words
male_one_hot = []
female_one_hot = []
n = 30000
print(len(filtered_female_posts))
print(len(filtered_male_posts))

for post in filtered_male_posts:
    try:
        male_one_hot.append(one_hot(post,n,split=" ",filters=base_filter(),lower=True))
    except Exception as e:
        continue

for post in filtered_female_posts:
    try:
        female_one_hot.append(one_hot(post,n,split=" ",filters=base_filter(),lower=True))
    except:
        continue

2247
2595


In [56]:
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(male_one_hot)),np.ones(len(female_one_hot))))

In [57]:
x_train_rnn,x_test_rnn,y_train_rnn,y_test_rnn = train_test_split(np.concatenate((male_one_hot,female_one_hot)),concatenate_array_rnn,test_size=0.2)

In [60]:
maxlen = 100
x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)

#print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
#print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [61]:
maxlen = 100
x_rnn = sequence.pad_sequences(x_rnn,maxlen=maxlen)

print('x_train_rnn shape:', x_rnn.shape,y_rnn.shape)

('x_train_rnn shape:', (4633, 100), (4633,))


In [62]:
max_features = 30000
dimension = 64
input_dimension = 64
output_dimension = 64


In [63]:
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(output_dim = output_dimension, input_dim=input_dimension, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(input_dim=output_dimension, output_dim=1))

model.add(Activation('sigmoid'))

In [65]:
model.compile(loss='mean_squared_error',optimizer='sgd', metrics=["accuracy"])

In [66]:
model.fit(x_train_rnn,y_train_rnn,
          batch_size=32,nb_epoch=10,
          validation_data=(x_test_rnn, y_test_rnn),
          verbose=1)

Train on 3706 samples, validate on 927 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe917d002d0>

In [44]:
model.evaluate(x_test_rnn,y_test_rnn,batch_size=32)



[0.25045589757068515, 0.58036677396284597]

In [45]:
x_train_rnn,x_test_rnn,y_train_rnn,y_test_rnn = train_test_split(np.concatenate((male_one_hot, female_one_hot)),y_rnn,test_size=0.2)

In [46]:
maxlen = 100
x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)

print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

('x_train_rnn shape:', (3706, 100), (3706,))
('x_test_rnn shape:', (927, 100), (927,))


In [47]:
predicted_output = model.predict(x_test_rnn,batch_size=32)
predicted_classes = model.predict_classes(x_test_rnn, batch_size=32)



In [48]:
df = pd.DataFrame(columns=['predicted','actual'])

In [49]:
df['predicted_class'] = predicted_classes.flatten()
df['predicted'] = predicted_output.flatten()

In [50]:
df['actual'] = y_test_rnn

In [51]:
df.predicted_class.value_counts()

0    927
Name: predicted_class, dtype: int64

In [35]:
df.actual.value_counts()

0    509
1    418
Name: actual, dtype: int64

In [48]:
x_train_rnn.shape

(3706, 100)

# Using TFIDF Vectorizer as an input instead of one hot

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [50]:
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2')
tfidf_male = vectorizer.fit_transform(filtered_male_posts)
tfidf_female = vectorizer.fit_transform(filtered_female_posts)

In [51]:
flattened_array_tfidf_male = tfidf_male.toarray()
flattened_array_tfidf_female = tfidf_male.toarray()

In [52]:
concatenate_array_rnn = np.concatenate((np.zeros(len(flattened_array_tfidf_male)),np.ones(len(flattened_array_tfidf_female))))

In [53]:
x_train_rnn,x_test_rnn,y_train_rnn,y_test_rnn = train_test_split(np.concatenate((flattened_array_tfidf_male,flattened_array_tfidf_female)),concatenate_array_rnn,test_size=0.2)

In [54]:
maxlen = 100
# x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
# x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
# print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
# print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [75]:
maxlen = 100
x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

('x_train_rnn shape:', (4152, 100), (4152,))
('x_test_rnn shape:', (1038, 100), (1038,))


In [76]:
max_features = 30000
dimension = 128
input_dimension = 128
output_dimension = 128

In [77]:
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(output_dim = output_dimension, input_dim=input_dimension, return_sequences=False, activation='sigmoid'))
model.add(Dropout(0.5))
#model.add(LSTM(output_dim=output_dimension))
model.add(Dropout(0.5))
model.add(Dense(input_dim=output_dimension, output_dim=1))
model.add(Activation('sigmoid'))

In [78]:
model.compile(loss='mean_squared_error',optimizer='sgd')

In [79]:
model.fit(x_train_rnn,y_train_rnn,batch_size=32,nb_epoch=10,validation_data=(x_test_rnn,y_test_rnn),show_accuracy=True)

Train on 4152 samples, validate on 1038 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fcaf91d8750>

In [60]:
model.evaluate(x_test_rnn,y_test_rnn,batch_size=32,show_accuracy=True)



`model.compile(optimizer, loss, metrics=["accuracy"])`


0.24999907024219087

In [61]:
df = pd.DataFrame(columns=['predicted','actual'])

In [62]:
predicted_output = model.predict(x_test_rnn,batch_size=32)

In [64]:
df['pc'] = model.predict_classes(x_test_rnn, batch_size=32).flatten()



In [65]:
df['predicted'] = predicted_output.flatten()
df['actual'] = y_test_rnn

In [66]:
df

Unnamed: 0,predicted,actual,pc
0,0.499085,0,0
1,0.499085,0,0
2,0.499085,0,0
3,0.499085,0,0
4,0.499085,0,0
5,0.499085,0,0
6,0.499085,1,0
7,0.499085,0,0
8,0.499085,0,0
9,0.499085,1,0


In [68]:
df.pc.value_counts()

0    1038
Name: pc, dtype: int64

In [53]:
y_test_rnn.shape

(1038,)

In [54]:
df.shape

(1038, 2)

In [58]:
df.head()

Unnamed: 0,predicted,actual,pc
0,0.495028,1,0
1,0.495028,0,0
2,0.495028,1,0
3,0.495028,1,0
4,0.495028,1,0


# Sentence Generation using RNN(LSTM)

In [None]:
# reading all the male text data into one string
male_post = ' '.join(filtered_male_posts[:2])

#building character set for the male posts
character_set_male = set(male_post)
#building two indices - character index and index of character
char_indices = dict((c, i) for i, c in enumerate(character_set_male))
indices_char = dict((i, c) for i, c in enumerate(character_set_male))


# cut the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 1
sentences = []
next_chars = []
for i in range(0, len(male_post) - maxlen, step):
    sentences.append(male_post[i : i + maxlen])
    next_chars.append(male_post[i + maxlen])


In [None]:
#Vectorisation of input
x_male = np.zeros((len(male_post),maxlen,len(character_set_male)),dtype=np.bool)
y_male = np.zeros((len(male_post),len(character_set_male)),dtype=np.bool)

print x_male.shape,y_male.shape

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_male[i, t, char_indices[char]] = 1
    y_male[i, char_indices[next_chars[i]]] = 1

print x_male.shape,y_male.shape

In [None]:

#Building the model to generate text with 2 layers
auto_text_generating_male_model = Sequential()
auto_text_generating_male_model.add(LSTM(len(character_set_male),512,return_sequences=True))
auto_text_generating_male_model.add(Dropout(0.2))
auto_text_generating_male_model.add(LSTM(512,512,return_sequences=False))
auto_text_generating_male_model.add(Dropout(0.2))
auto_text_generating_male_model.add(Dense(512,len(character_set_male)))
auto_text_generating_male_model.add(Activation('sigmoid'))

In [None]:
auto_text_generating_male_model.compile(loss='mean_squared_error',optimizer='sgd')

In [None]:
import random,sys

In [None]:
# helper function to sample an index from a probability array
def sample(a, diversity=0.75):
    if random.random() > diversity:
        return np.argmax(a)
    while 1:
        i = random.randint(0, len(a)-1)
        if a[i] > random.random():
            return i

In [None]:
# train the model, output generated text after each iteration
for iteration in range(1,10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    auto_text_generating_male_model.fit(x_male, y_male, batch_size=128, nb_epoch=1)

    start_index = random.randint(0, len(male_post) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = male_post[start_index : start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')

        for iteration in range(400):
            try:
                x = np.zeros((1, maxlen, len(character_set_male)))
                for t, char in enumerate(sentence):
                    x[0, t, char_indices[char]] = 1.

                preds = auto_text_generating_male_model.predict(x, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]

                generated += next_char
                sentence = sentence[1:] + next_char

                #sys.stdout.write(next_char)
                #sys.stdout.flush()
            except:
                continue
                
        print sentence
        print()