
# RNN using LSTM 
       




<img src="img/RNN-rolled.png" width="80px" height="80px"/>

<img src="img/RNN-unrolled.png" width="400px" height="400px"/>

<img src="img/LSTM3-chain.png" width="800px" height="800px"/>

_Source: <http://colah.github.io/posts/2015-08-Understanding-LSTMs/>_

In [None]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence

In [None]:
# Keep only the posts that contain something; zero-length entries are dropped.
filtered_male_posts = [post_male for post_male in male_posts if len(post_male) != 0]
filtered_female_posts = [post_female for post_female in female_posts if len(post_female) != 0]

In [None]:
# text processing - one_hot turns each post into a list of integer word
# indices drawn from a space of size n
male_one_hot = []
female_one_hot = []
n = 30000
punctuation_filter = base_filter()
skipped_posts = 0

# Both corpora go through exactly the same encoding path.
for posts, encoded in ((filtered_male_posts, male_one_hot),
                       (filtered_female_posts, female_one_hot)):
    for post in posts:
        try:
            encoded.append(one_hot(post, n, split=" ", filters=punctuation_filter, lower=True))
        except Exception:
            # Best effort: a post that cannot be encoded is left out, but it is
            # counted so the loss of data is visible. Catching Exception (not a
            # bare except) keeps Ctrl-C / kernel interrupts working.
            skipped_posts += 1

print('posts skipped during one-hot encoding: %d' % skipped_posts)

In [None]:
# Class labels in male-then-female order: 0.0 per male post, 1.0 per female post.
concatenate_array_rnn = np.concatenate([
    np.zeros(len(male_one_hot)),
    np.ones(len(female_one_hot))
])

In [None]:
# The label vector concatenate_array_rnn is laid out as zeros (male) followed
# by ones (female), so the features must be stacked in the same
# male-then-female order; stacking them the other way round pairs every sample
# with the wrong label.
x_train_rnn, x_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    np.concatenate((male_one_hot, female_one_hot)),
    concatenate_array_rnn,
    test_size=0.2
)

In [None]:
# Bring both splits to a common sequence length of `maxlen` via
# sequence.pad_sequences, then report the resulting feature and label shapes.
maxlen = 100
x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
# NOTE(review): without `from __future__ import print_function`, Python 2
# prints these as tuples - confirm which interpreter this notebook targets.
print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [None]:
# Classifier layout: embedding -> LSTM -> dropout -> one sigmoid output unit.
max_features = 30000
dimension = 128
input_dimension = 128
output_dimension = 128

layer_stack = [
    Embedding(max_features, dimension),
    LSTM(input_dimension, output_dimension),
    Dropout(0.5),
    Dense(128, 1),
    Activation('sigmoid'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

In [None]:
# The network ends in a single sigmoid unit trained on 0/1 labels, so binary
# cross-entropy is the matching loss. class_mode='binary' is needed for the
# show_accuracy=True reports: the default 'categorical' mode takes an argmax
# over the single output column, which makes the reported accuracy meaningless.
model.compile(loss='binary_crossentropy', optimizer='sgd', class_mode='binary')

In [None]:
# Train for 4 epochs in batches of 32; the held-out split is passed as validation data.
model.fit(
    x_train_rnn,
    y_train_rnn,
    batch_size=32,
    nb_epoch=4,
    validation_data=(x_test_rnn, y_test_rnn),
    show_accuracy=True
)

In [None]:
# Loss and accuracy of the trained model on the held-out split.
score, acc = model.evaluate(
    x_test_rnn,
    y_test_rnn,
    batch_size=32,
    show_accuracy=True
)

# Using TFIDF Vectorizer as an input instead of one hot

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Fit ONE vocabulary (and one set of IDF weights) on both corpora, then project
# each corpus onto it. Calling fit_transform once per corpus re-fits the
# vectorizer, so the two matrices would have different, incompatible columns
# and could not be stacked into a single feature matrix.
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2')
vectorizer.fit(list(clean_male_post_list) + list(clean_female_post_list))
tfidf_male = vectorizer.transform(clean_male_post_list)
tfidf_female = vectorizer.transform(clean_female_post_list)

In [None]:
# Densify the sparse TF-IDF matrices. The female array must come from
# tfidf_female; building it from tfidf_male would label male rows as female.
flattened_array_tfidf_male = tfidf_male.toarray()
flattened_array_tfidf_female = tfidf_female.toarray()

In [None]:
# Class labels in male-then-female order: 0.0 per male row, 1.0 per female row.
concatenate_array_rnn = np.concatenate([
    np.zeros(len(flattened_array_tfidf_male)),
    np.ones(len(flattened_array_tfidf_female))
])

In [None]:
# Stack the male rows on top of the female rows (same order as the labels) and
# hold out 20% of the samples for testing.
tfidf_features = np.concatenate((flattened_array_tfidf_male, flattened_array_tfidf_female))
x_train_rnn, x_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    tfidf_features,
    concatenate_array_rnn,
    test_size=0.2
)

In [None]:
# maxlen is assigned here but not used in this cell: the padding and
# shape-printing calls below are commented out.
# NOTE(review): presumably padding was disabled because the rows produced by
# toarray() already share one length - confirm, then delete the dead lines.
maxlen = 100
# x_train_rnn = sequence.pad_sequences(x_train_rnn,maxlen=maxlen)
# x_test_rnn = sequence.pad_sequences(x_test_rnn,maxlen=maxlen)
# print('x_train_rnn shape:', x_train_rnn.shape,y_train_rnn.shape)
# print('x_test_rnn shape:', x_test_rnn.shape,y_test_rnn.shape)

In [None]:
# Same layout as the one-hot classifier: embedding -> LSTM -> dropout -> sigmoid unit.
# NOTE(review): this model is subsequently fit on dense TF-IDF arrays, whereas an
# Embedding layer normally expects integer word indices - confirm this is intended.
max_features = 30000
hidden_units = 128

model = Sequential()
model.add(Embedding(max_features, hidden_units))
model.add(LSTM(hidden_units, hidden_units))
model.add(Dropout(0.5))
model.add(Dense(hidden_units, 1))
model.add(Activation('sigmoid'))

In [None]:
# The network ends in a single sigmoid unit trained on 0/1 labels, so binary
# cross-entropy is the matching loss. class_mode='binary' is needed for the
# show_accuracy=True reports: the default 'categorical' mode takes an argmax
# over the single output column, which makes the reported accuracy meaningless.
model.compile(loss='binary_crossentropy', optimizer='sgd', class_mode='binary')

In [None]:
# Train for 4 epochs in batches of 32; the held-out split is passed as validation data.
model.fit(
    x_train_rnn,
    y_train_rnn,
    batch_size=32,
    nb_epoch=4,
    validation_data=(x_test_rnn, y_test_rnn),
    show_accuracy=True
)

In [None]:
# Loss and accuracy of the trained model on the held-out split.
score, acc = model.evaluate(
    x_test_rnn,
    y_test_rnn,
    batch_size=32,
    show_accuracy=True
)

# Sentence Generation using RNN(LSTM)

In [None]:
# Training text: the first two male posts joined by a single space.
male_post = ' '.join(filtered_male_posts[:2])

# Distinct characters of the text and lookup tables in both directions
# (character -> index and index -> character).
character_set_male = set(male_post)
char_indices = {c: i for i, c in enumerate(character_set_male)}
indices_char = {i: c for i, c in enumerate(character_set_male)}

# Slide a window of `maxlen` characters over the text, advancing `step`
# characters at a time; each window is paired with the character that follows it.
maxlen = 20
step = 1
window_starts = range(0, len(male_post) - maxlen, step)
sentences = [male_post[start : start + maxlen] for start in window_starts]
next_chars = [male_post[start + maxlen] for start in window_starts]


In [None]:
# Vectorisation of input: one-hot encode every window (x_male) and the
# character that follows it (y_male).
# One row per training window. Sizing the arrays by len(male_post) would leave
# trailing rows that are all-zero in both x and y and would be trained on as
# if they were real samples.
n_samples = len(sentences)
n_chars = len(character_set_male)
# Built-in bool replaces the np.bool alias, which newer NumPy releases removed.
x_male = np.zeros((n_samples, maxlen, n_chars), dtype=bool)
y_male = np.zeros((n_samples, n_chars), dtype=bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_male[i, t, char_indices[char]] = 1
    y_male[i, char_indices[next_chars[i]]] = 1

# %-formatting prints the same text on Python 2 and Python 3.
print('%s %s' % (x_male.shape, y_male.shape))

In [None]:

# Building the model to generate text with 2 stacked LSTM layers
auto_text_generating_male_model = Sequential()
# First LSTM returns the full sequence so the second LSTM can consume it.
auto_text_generating_male_model.add(LSTM(len(character_set_male),512,return_sequences=True))
auto_text_generating_male_model.add(Dropout(0.2))
# Second LSTM returns only its final output, which feeds the classifier.
auto_text_generating_male_model.add(LSTM(512,512,return_sequences=False))
auto_text_generating_male_model.add(Dropout(0.2))
auto_text_generating_male_model.add(Dense(512,len(character_set_male)))
# Exactly one next character is correct per sample, so the output must be a
# probability distribution over the character set: softmax, not independent
# per-character sigmoids.
auto_text_generating_male_model.add(Activation('softmax'))

In [None]:
# The targets are one-hot next-character vectors (exactly one class is correct
# per sample), so categorical cross-entropy is the matching loss; mean squared
# error gives a very weak training signal for one-of-N targets.
auto_text_generating_male_model.compile(loss='categorical_crossentropy',optimizer='sgd')

In [None]:
import random,sys

In [None]:
# helper function to sample an index from a probability array
def sample(a, diversity=0.75):
    """Pick an index of `a`, either greedily or at random.

    With probability ``1 - diversity`` the index of the largest entry is
    returned. Otherwise an index is drawn with probability proportional to
    its entry, each entry being clipped to the range [0, 1].

    Args:
        a: 1-D sequence of non-negative weights (e.g. model predictions).
        diversity: chance of taking the random branch instead of the argmax.

    Returns:
        An integer index into `a`.
    """
    if random.random() > diversity:
        return np.argmax(a)

    weights = np.clip(np.asarray(a, dtype=float), 0.0, 1.0)
    total = weights.sum()
    if not total > 0:
        # All weights are zero (or NaN): nothing can be drawn at random, so
        # fall back to the argmax instead of retrying forever.
        return np.argmax(a)

    # Inverse-CDF draw: same distribution as accept/reject sampling on the
    # clipped weights, but it always finishes in a single step.
    threshold = random.random() * total
    index = int(np.searchsorted(np.cumsum(weights), threshold, side='right'))
    return min(index, len(weights) - 1)

In [None]:
# train the model, output generated text after each iteration
for iteration in range(1,10):
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('')
    print('-' * 50)
    print('Iteration %d' % iteration)
    auto_text_generating_male_model.fit(x_male, y_male, batch_size=128, nb_epoch=1)

    # Seed window picked at random from the training text; the same seed is
    # reused for every diversity setting so the outputs are comparable.
    start_index = random.randint(0, len(male_post) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print('')
        print('----- diversity: %s' % diversity)

        sentence = male_post[start_index : start_index + maxlen]
        generated_parts = [sentence]
        print('----- Generating with seed: "' + sentence + '"')

        # Loop variable is `_` so it does not clobber the outer `iteration` counter.
        for _ in range(400):
            # One-hot encode the current window of `maxlen` characters.
            x = np.zeros((1, maxlen, len(character_set_male)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            try:
                preds = auto_text_generating_male_model.predict(x, verbose=0)[0]
                next_index = sample(preds, diversity)
                next_char = indices_char[next_index]
            except Exception as err:
                # `sentence` is unchanged after a failure, so retrying would
                # feed the model the identical input; report and stop instead
                # of silently skipping the remaining steps.
                print('generation stopped early: %r' % (err,))
                break

            generated_parts.append(next_char)
            # Slide the window one character forward.
            sentence = sentence[1:] + next_char

        # Show the seed plus everything generated, not just the last window.
        print(''.join(generated_parts))
        print('')