# Load Data

In [1]:
import pandas as pd
import numpy as np

In [4]:
headlines = [] #Stores all responses 
headlines = pd.read_csv("D:/Academics/BITS/Projects/Conversational Agents Project - Dr. Manik Gupta/empatheticdialogues/Short.csv", usecols=['utterance']) 
headlines = headlines.values #Removes header 'utterance'

In [5]:
len(headlines)

900

In [7]:
headlines;

# Data Cleaning

In [8]:
import string
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


def clean_text(headline):
    text = "".join( word for word in headline if word not in string.punctuation ).lower()
    text = text.encode("utf8").decode("ascii", "ignore")
    return text
corpus = [ clean_text(headline) for headline in headlines ]


In [9]:
len(corpus)

900

# Generate seq n-gram Tokens

In [10]:
vocab = []
for line in corpus:
    words = line.split()
    for word in words:
        vocab.append(word)

vocabraly = set(vocab) #Unique words

In [11]:
len(vocabraly)

2593

In [12]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(200)
tokenizer.fit_on_texts(corpus)
word2index = tokenizer.word_index #Assigns a number to each word 
len(word2index)

1874

In [13]:
dictionary = {}
rev_dictionary = {}
for word, idx in word2index.items():
    if idx > 1406:
        continue
    dictionary[word] = idx
    rev_dictionary[idx] = word

In [14]:
max(rev_dictionary.keys())

1406

In [15]:
input_seqences = tokenizer.texts_to_sequences(corpus)

In [16]:
len(input_seqences)

900

# Padding the seqs and obtaining variables

In [19]:
input_data = []
target = []
#every_hundredth_element = input_seqences[::200] #Picks every 200th item from input_seqences
for line in input_seqences:
    for i in range(1, len(line)-1):
        input_data.append(line[:i])
        target.append(line[i+1])

In [20]:
input_data[:5]

[[1], [1, 63], [1, 63, 2], [1, 63, 2, 75], [1, 63, 2, 75, 4]]

In [21]:
target[:5]

[2, 75, 4, 29, 9]

In [22]:
MAX_LEN = 0
for seq in input_data:
    if len(seq) > MAX_LEN:
        MAX_LEN = len(seq)
MAX_LEN

30

In [23]:
from keras.preprocessing.sequence import pad_sequences
input_data = pad_sequences(input_data, maxlen=MAX_LEN, padding="post", truncating="post")
len(input_data[0])

30

In [24]:
input_data.shape

(7014, 30)

In [25]:
VOCAB_SIZE = 2001
VOCAB_SIZE

2001

In [26]:
MAX_LEN

30

In [27]:
input_data = np.array(input_data)
target = np.array(target)

# LSTMs for Text Generation

1. Input Layer : Takes the sequence of words as input
2. LSTM Layer : Computes the output using LSTM units. I have added 100 units in the layer, but this number can be fine tuned later.
3. Dropout Layer : A regularisation layer which randomly turns-off the activations of some neurons in the LSTM layer.
4. Output Layer : Computes the probability of the best possible next word as output

In [28]:
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout
from keras.callbacks import EarlyStopping

In [37]:
from keras.layers import LSTM
model = Sequential()

#model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=100, input_length=MAX_LEN))
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=1000, input_length=MAX_LEN))


#model.add(LSTM(units=100))
model.add(LSTM(units=1000))
model.add(Dropout(rate=0.1))

#model.add(Dense(units=target.shape[1], activation="softmax"))
#model.add(Dense(100, activation="softmax"))
model.add(Dense(1000, activation="softmax"))

In [38]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [39]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 1000)          2001000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
Total params: 11,006,000
Trainable params: 11,006,000
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.fit(input_data, target, batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x214000db508>

# GRU for Text Generation

In [48]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=1000, input_length=MAX_LEN))
gru_model.add(GRU(units=1000))
gru_model.add(Dropout(rate=0.1))
#gru_model.add(Dense(units=target.shape[1], activation="softmax"))
gru_model.add(Dense(1000, activation="softmax"))

In [49]:
#gru_model.compile(loss="categorical_crossentropy", optimizer="adam")
gru_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [50]:
gru_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 30, 1000)          2001000   
_________________________________________________________________
gru_1 (GRU)                  (None, 1000)              6006000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1000)              1001000   
Total params: 9,008,000
Trainable params: 9,008,000
Non-trainable params: 0
_________________________________________________________________


In [51]:
gru_model.fit(input_data, target, batch_size=32, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x214011eafc8>

# Text Generation

In [52]:
import tensorflow as tf
tf.random.set_seed(2) 

In [55]:
def text_generater(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding="post")
        #predicted = model.predict_classes(token_list, verbose=0)
        predicted = model.predict(token_list)
        predicted = np.argmax(predicted, axis=1)
        
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
text1 = "Its sad that"
text2 = "I have never cheated"
text_generater(text1, 5, model, MAX_LEN)
text_generater(text2, 3, model, MAX_LEN)