# Importing Libraries

In [1]:
import numpy as np
import tensorflow as tf
import re
import time

# DATA PREPROCESSING

#  IMPORTING THE DATASET

In [2]:
lines = open('E:\Data\cornell movie-dialogs corpus\movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')
conversations = open('E:\Data\cornell movie-dialogs corpus\movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

# DICTIONARY OF LINES

In [3]:
id2line = {}
for line in lines:
    _line = line.split(' +++$+++ ')
    if len(_line)==5:
        id2line[_line[0]] = _line[4]

In [4]:
conversations

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L367', 'L368']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L401', 'L402', 'L403']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L404', 'L405', 'L406', 'L407']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L575', 'L576']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L577', 'L578']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L662', 'L663']",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L693', 'L69

# CREATING A LIST OF CONVERSATIONS

In [5]:
conversation_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversation_ids.append(_conversation.split(','))

# GET SEPREATLEY THE QUESTION AND ANSWERS

In [6]:
questions = []
answers = []
for conversation in conversation_ids:
    for i in range(len(conversation) - 1):
        questions.append(id2line[conversation[i]])
        answers.append(id2line[conversation[i+1]])

In [7]:
questions

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "You're asking me out.  That's so cute. What's your name again?",
 "No, no, it's my fault -- we didn't have a proper introduction ---",
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Why?',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 'Gosh, if only we could find Kat a boyfriend...',
 "C'esc ma tete. This is my head",
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have n

In [8]:
answers

["Well, I thought we'd start with pronunciation, if that's okay with you.",
 'Not the hacking and gagging and spitting part.  Please.',
 "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?",
 'Forget it.',
 'Cameron.',
 "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
 'Seems like she could get a date easy enough...',
 'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
 "That's a shame.",
 'Let me see what I can do.',
 "Right.  See?  You're ready for the quiz.",
 "I don't want to know how to say that though.  I want to know useful things. Like where the good stores are.  How much does champagne cost?  Stuff like Chat.  I have never in my life had to point out my head to someone.",
 "That's because it's such a nice one.",
 'Forget French.',
 "Well, there's someone I think might be --",
 'Where?',
 "I 

# FIRST CLEAN SECTION 

In [9]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.?,]", "", text)
    return text

# Cleaning of Questions and answers

In [10]:
 ## CLEANING OF QUESTIONS
clean_questions = []
for question in questions:
    clean_questions.append(clean_text(question))
    
 ## CLEANING OF ANSWERS
clean_answers = []
for answer in answers:
    clean_answers.append(clean_text(answer))

In [11]:
clean_questions

['can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again',
 'well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 'you are asking me out  that is so cute what is your name again',
 "no no it's my fault  we didn't have a proper introduction ",
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'why',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'gosh if only we could find kat a boyfriend',
 "c'esc ma tete this is my head",
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out

In [12]:
clean_answers

['well i thought we would start with pronunciation if that is okay with you',
 'not the hacking and gagging and spitting part  please',
 "okay then how 'bout we try out some french cuisine  saturday  night",
 'forget it',
 'cameron',
 'the thing is cameron  i am at the mercy of a particularly hideous breed of loser  my sister  i cannot date until she does',
 'seems like she could get a date easy enough',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'that is a shame',
 'let me see what i can do',
 'right  see  you are ready for the quiz',
 "i don't want to know how to say that though  i want to know useful things like where the good stores are  how much does champagne cost  stuff like chat  i have never in my life had to point out my head to someone",
 "that is because it's such a nice one",
 'forget french',
 "well there's someone i think might be ",
 'where',
 "i counted on you to help my cause 

# Create a dictionary which map each word with how many time it occcurs

In [13]:
## For questions
word2count = {}
for question in clean_questions:
    for word in question.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1
## For Answers
for answer in clean_answers:
    for word in answer.split():
        if word not in word2count:
            word2count[word] = 1
        else:
            word2count[word] += 1

In [14]:
word2count

{'splendor': 2,
 "mom's": 91,
 'massed': 1,
 'commuter': 2,
 'hildy!': 20,
 'behoove': 1,
 'uangryu!': 2,
 "debbie's": 15,
 'withoutyou': 1,
 'rectangles': 1,
 'grabbed': 53,
 'bonasera': 3,
 'hazardous': 9,
 'v2': 1,
 'librarian': 13,
 'insurrection': 5,
 'champagne': 115,
 'painkillers': 8,
 'insipid': 3,
 'shined': 2,
 'hostile': 62,
 '2004': 4,
 'tomorra': 5,
 'sacked': 4,
 'tickled': 10,
 'howand': 2,
 'intestate': 5,
 'mending': 1,
 '1600': 4,
 'fightand': 2,
 'skulljust': 1,
 'cryogenically': 2,
 "bureau's": 7,
 'owners!': 1,
 'continuation': 2,
 'cornfield': 2,
 'coldly': 1,
 'visions': 11,
 "duke'": 1,
 'recombination': 2,
 'candy': 89,
 'heada': 2,
 'previously': 5,
 'farmgirl': 1,
 "cecile's": 2,
 'magna': 5,
 'salutation': 2,
 'wellumher': 2,
 'exchangers': 1,
 'pisani': 1,
 'drags': 8,
 'umillionsu': 2,
 'childs!': 4,
 'described': 37,
 'regards': 19,
 'shmoe!': 1,
 'hallie': 9,
 'scotch!': 1,
 'canyou': 3,
 "building's": 8,
 'jokes': 93,
 'flashing': 11,
 'beas': 1,
 'for

# Creating two dictionary that map question word and answer word into unique integer

In [15]:
## Questions
threshold_questions = 20
questionswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_questions:
        questionswords2int[word] = word_number
        word_number += 1
## Answers
threshold_answers = 20
answerswords2int = {}
word_number = 0
for word, count in word2count.items():
    if count >= threshold_answers:
        answerswords2int[word] = word_number
        word_number += 1

In [16]:
questionswords2int

{"mom's": 0,
 'weaker': 7443,
 'normally': 1165,
 'breath': 3322,
 'hildy!': 1,
 'shots': 4411,
 'leaving': 7731,
 'voice': 5188,
 'certain': 7732,
 'grabbed': 2,
 'indians': 7730,
 'introduce': 6069,
 'somewhere': 6619,
 "guy's": 4412,
 'champagne': 3,
 'e': 2261,
 'leave!': 7733,
 'shhh': 7896,
 'situations': 2277,
 'llewelyn': 7734,
 'plug': 3323,
 'merlin': 5541,
 'been': 2536,
 'committee': 4413,
 'radioactive': 4410,
 'follow': 5543,
 'cowboy': 5544,
 'search': 2264,
 'interesting': 5540,
 'anymore!': 4432,
 'doyle': 1875,
 'bare': 125,
 'said!': 1166,
 'replace': 2265,
 'shining': 2291,
 'facts': 8400,
 'sexually': 8250,
 'entertaining': 6624,
 'cab': 2266,
 'candy': 5,
 'veronica': 7330,
 'afford': 6625,
 'reject': 1167,
 'bullshit': 5546,
 'choir': 4415,
 'something!': 3324,
 'randy': 3325,
 'helps': 29,
 'hemingway': 3326,
 'championship': 978,
 'where': 3327,
 'krauts': 5722,
 'sugar': 6658,
 'described': 6,
 'brother': 2775,
 'stayed': 2269,
 "everything's": 289,
 'brass': 

# Encoding the Last tokens

In [17]:
## Questions
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    questionswords2int[token] = len(questionswords2int) + 1
## Answers
for token in tokens:
    answerswords2int[token] = len(answerswords2int) + 1

# Creating Inverse Dictionary of the answerwords2int

In [18]:
answersints2word = {w_i: w for w, w_i in answerswords2int.items()}

In [19]:
answersints2word

{0: "mom's",
 1: 'hildy!',
 2: 'grabbed',
 3: 'champagne',
 4: 'hostile',
 5: 'candy',
 6: 'described',
 7: 'jokes',
 8: 'uweu',
 9: 'cocktail',
 10: 'hunger',
 11: 'effort',
 12: 'friend!',
 13: 'con',
 14: 'exact',
 15: 'woke',
 16: 'crane',
 17: 'cue',
 18: 'norm',
 19: 'leadership',
 20: 'catching',
 21: 'mission',
 22: 'security',
 23: 'eh',
 24: 'teachers',
 25: 'confront',
 26: 'need',
 27: 'machines',
 28: 'mookie',
 29: 'helps',
 30: 'convenience',
 31: 'ahh',
 32: 'lights',
 33: 'swell',
 34: 'fate',
 35: 'sold',
 36: 'method',
 37: 'colleague',
 38: 'influence',
 39: 'stand',
 40: 'kinds',
 41: 'ordered',
 42: 'maude',
 43: 'ike',
 44: 'witches',
 45: 'maintenance',
 46: 'guilt',
 47: 'pine',
 48: "startin'",
 49: 'made',
 50: 'poem',
 51: 'sox',
 52: 'duck',
 53: 'barnes',
 54: 'cruise',
 55: 'ear',
 56: 'bored',
 57: 'stone',
 58: 'flush',
 59: 'stamp',
 60: 'failed',
 61: 'supreme',
 62: 'freedom',
 63: 'nation',
 64: 'engaged',
 65: 'gestapo',
 66: 'downstairs',
 67: 'ha

# Adding EOS to end of every answers

In [20]:
for i in range(len(clean_answers)):
    clean_answers[i] += ' <EOS>'

# Translate all questions and answers into unique integer format

In [21]:
## Questions
questions_into_int = []
for question in clean_questions:
    ints = []
    for word in question.split():
        if word not in questionswords2int:
            ints.append(questionswords2int['<OUT>'])
        else:
            ints.append(questionswords2int[word])
    questions_into_int.append(ints)
## Answers
answers_into_int = []
for answer in clean_answers:
    ints = []
    for word in answer.split():
        if word not in answerswords2int:
            ints.append(answerswords2int['<OUT>'])
        else:
            ints.append(answerswords2int[word])
    answers_into_int.append(ints)

In [22]:
questions_into_int

[[5205,
  8193,
  2120,
  2183,
  5247,
  8824,
  8824,
  1473,
  1917,
  8824,
  6671,
  6382,
  7871,
  4883,
  8824,
  1535,
  4916,
  2160,
  6269,
  3770,
  8824,
  6629],
 [2304,
  104,
  3328,
  8193,
  4829,
  5464,
  7084,
  8824,
  1013,
  4440,
  2247,
  6693,
  7084,
  2596],
 [6057, 3770, 8824, 1473, 8824, 1473, 8824, 4081, 3462],
 [2596,
  6671,
  5778,
  1750,
  2862,
  4440,
  2247,
  4877,
  1047,
  2783,
  2247,
  7584,
  7073,
  6629],
 [6915, 6915, 1989, 8605, 4214, 8193, 5782, 719, 6419, 1767, 8824],
 [4522],
 [3770,
  1837,
  2247,
  4522,
  104,
  7787,
  6859,
  3770,
  6127,
  2826,
  6419,
  3317,
  8824,
  7363,
  2826,
  5803,
  8605,
  7302,
  104,
  1161,
  3647,
  5389,
  2043,
  7835],
 [1126],
 [8824,
  3918,
  2043,
  876,
  4115,
  90,
  1509,
  3239,
  99,
  2043,
  4010,
  5956,
  8016,
  6418,
  3852,
  5569,
  3686,
  5750,
  2043,
  8232,
  3702,
  2826,
  3852,
  7718,
  7701],
 [7714, 1013, 4349, 8193, 8403, 8781, 1881, 6419, 5688],
 [8824, 221

In [23]:
answers_into_int

[[2304,
  104,
  3328,
  8193,
  4829,
  5464,
  7084,
  8824,
  1013,
  4440,
  2247,
  6693,
  7084,
  2596,
  8823],
 [6057, 3770, 8824, 1473, 8824, 1473, 8824, 4081, 3462, 8823],
 [6693, 6418, 636, 3489, 8193, 6516, 2862, 6334, 8532, 8824, 3939, 5529, 8823],
 [5797, 3852, 8823],
 [4522, 8823],
 [3770,
  1837,
  2247,
  4522,
  104,
  7787,
  6859,
  3770,
  6127,
  2826,
  6419,
  3317,
  8824,
  7363,
  2826,
  5803,
  8605,
  7302,
  104,
  1161,
  3647,
  5389,
  2043,
  7835,
  8823],
 [883, 5750, 2043, 8403, 4969, 6419, 3647, 1420, 996, 8823],
 [8824,
  3918,
  2043,
  876,
  4115,
  90,
  1509,
  3239,
  99,
  2043,
  4010,
  5956,
  8016,
  6418,
  3852,
  5569,
  3686,
  5750,
  2043,
  8232,
  3702,
  2826,
  3852,
  7718,
  7701,
  8823],
 [4440, 2247, 6419, 6486, 8823],
 [5628, 1750, 1719, 2783, 104, 5205, 6459, 8823],
 [1550, 1719, 2596, 6671, 3178, 556, 3770, 8824, 8823],
 [104,
  4713,
  1138,
  4115,
  8020,
  636,
  4115,
  7418,
  4440,
  6379,
  104,
  1138,
  411

# Sorting the question and answers

In [24]:
sorted_clean_questions = []
sorted_clean_answers = []
for length in range(1, 25 + 1):
    for i in enumerate(questions_into_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_into_int[i[0]])
            sorted_clean_answers.append(answers_into_int[i[0]])

In [25]:
sorted_clean_questions

[[4522],
 [1126],
 [3838],
 [1008],
 [1446],
 [6915],
 [7033],
 [6915],
 [7226],
 [5923],
 [7119],
 [2783],
 [1126],
 [1446],
 [1126],
 [3866],
 [6388],
 [5772],
 [1693],
 [4207],
 [2783],
 [5052],
 [7226],
 [6693],
 [1126],
 [7226],
 [8824],
 [6944],
 [1634],
 [7226],
 [5869],
 [8824],
 [8824],
 [335],
 [636],
 [8824],
 [8824],
 [2783],
 [6915],
 [2783],
 [8824],
 [1008],
 [1550],
 [2783],
 [1066],
 [8824],
 [8221],
 [6915],
 [6915],
 [663],
 [8824],
 [4445],
 [6915],
 [8784],
 [99],
 [4207],
 [6915],
 [1700],
 [1914],
 [4207],
 [8431],
 [8431],
 [8431],
 [8431],
 [4207],
 [8744],
 [3375],
 [6693],
 [3178],
 [335],
 [2783],
 [8547],
 [4813],
 [8824],
 [8824],
 [5386],
 [4207],
 [8278],
 [2783],
 [2783],
 [4207],
 [4551],
 [4551],
 [4551],
 [4551],
 [4551],
 [4551],
 [6693],
 [335],
 [1509],
 [335],
 [8784],
 [636],
 [1676],
 [4551],
 [4551],
 [4551],
 [8824],
 [4551],
 [4551],
 [1509],
 [1887],
 [6725],
 [6099],
 [4207],
 [7073],
 [4207],
 [1291],
 [4207],
 [5546],
 [4207],
 [2304],
 

In [26]:
sorted_clean_answers

[[3770,
  1837,
  2247,
  4522,
  104,
  7787,
  6859,
  3770,
  6127,
  2826,
  6419,
  3317,
  8824,
  7363,
  2826,
  5803,
  8605,
  7302,
  104,
  1161,
  3647,
  5389,
  2043,
  7835,
  8823],
 [8824,
  3918,
  2043,
  876,
  4115,
  90,
  1509,
  3239,
  99,
  2043,
  4010,
  5956,
  8016,
  6418,
  3852,
  5569,
  3686,
  5750,
  2043,
  8232,
  3702,
  2826,
  3852,
  7718,
  7701,
  8823],
 [3327, 8823],
 [6874, 5750, 753, 8708, 2862, 3375, 3125, 8823],
 [2596, 6139, 2536, 2183, 236, 8823],
 [6693, 2596, 6671, 992, 26, 4115, 4737, 636, 4115, 3121, 8823],
 [7343, 4500, 8823],
 [2596, 6225, 8021, 1731, 4196, 3852, 8823],
 [7087, 8823],
 [4829, 2596, 4992, 2093, 1750, 6419, 8326, 4522, 8823],
 [8605,
  5729,
  6862,
  104,
  719,
  8232,
  6419,
  1249,
  8571,
  6859,
  3190,
  3770,
  8824,
  2011,
  8500,
  7159,
  8823],
 [5317, 8824, 556, 6419, 7017, 8823],
 [1224, 5569, 5750, 6419, 1210, 936, 8823],
 [3629,
  4440,
  104,
  3011,
  104,
  4829,
  3866,
  6459,
  5885,
  36

# BUILDING OF SEQ2SEQ MODEL

# Creating Placeholders for the input and targets

In [27]:
def model_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name = 'input')
    targets = tf.placeholder(tf.int32, [None, None], name = 'target')
    lr = tf.placeholder(tf.float32, name = 'learning_rate')
    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    return inputs, targets, lr, keep_prob


# Preprocessing of targets

In [28]:
def preprocess_targets(targets, word2int, batch_size):
    left_side = tf.fill([batch_size, 1], word2int['<SOS>'])
    right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
    preprocessed_targets = tf.concat([left_side, right_side], 1)
    return preprocessed_targets

# Creating the encoder of RNN layer

In [29]:
def encoder_rnn(rnn_inputs, rnn_size, num_layers, keep_prob, sequence_length):
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
    encoder_output, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = encoder_cell,
                                                                    cell_bw = encoder_cell,
                                                                    sequence_length = sequence_length,
                                                                    inputs = rnn_inputs,
                                                                    dtype = tf.float32)
    return encoder_state

# Decoding of training set

In [30]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = "attn_dec_train")
    decoder_output, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                              training_decoder_function,
                                                                                                              decoder_embedded_input,
                                                                                                              sequence_length,
                                                                                                              scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output, keep_prob)
    return output_function(decoder_output_dropout)

# Decoding of test/validation set

In [31]:
def decode_test_set(encoder_state, decoder_cell, decoder_embeddings_matrix, sos_id, eos_id, maximum_length, num_words, decoding_scope, output_function, keep_prob, batch_size):
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(attention_states, attention_option = "bahdanau", num_units = decoder_cell.output_size)
    test_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_inference(output_function,
                                                                              encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              decoder_embeddings_matrix,
                                                                              sos_id,
                                                                              eos_id,
                                                                              maximum_length,
                                                                              num_words,
                                                                              name = "attn_dec_inf")
    test_predictions, decoder_final_state, decoder_final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                                                                test_decoder_function,
                                                                                                                scope = decoding_scope)
    return test_predictions

# Creating The Decoder rnn

In [32]:
def decoder_rnn(decoder_embedded_input, decoder_embeddings_matrix, encoder_state, num_words, sequence_length, rnn_size, num_layers, word2int, keep_prob, batch_size):
    with tf.variable_scope("decoding") as decoding_scope:
        lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
        lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
        decoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * num_layers)
        weights = tf.truncated_normal_initializer(stddev = 0.1)
        biases = tf.zeros_initializer()
        output_function = lambda x: tf.contrib.layers.fully_connected(x,
                                                                      num_words,
                                                                      None,
                                                                      scope = decoding_scope,
                                                                      weights_initializer = weights,
                                                                      biases_initializer = biases)
        training_predictions = decode_training_set(encoder_state,
                                                   decoder_cell,
                                                   decoder_embedded_input,
                                                   sequence_length,
                                                   decoding_scope,
                                                   output_function,
                                                   keep_prob,
                                                   batch_size)
        decoding_scope.reuse_variables()
        test_predictions = decode_test_set(encoder_state,
                                           decoder_cell,
                                           decoder_embeddings_matrix,
                                           word2int['<SOS>'],
                                           word2int['<EOS>'],
                                           sequence_length - 1,
                                           num_words,
                                           decoding_scope,
                                           output_function,
                                           keep_prob,
                                           batch_size)
    return training_predictions, test_predictions

# Building final Model

In [33]:
def seq2seq_model(inputs, targets, keep_prob, batch_size, sequence_length, answers_num_words, questions_num_words, encoder_embedding_size, decoder_embedding_size, rnn_size, num_layers, questionswords2int):
    encoder_embedded_input = tf.contrib.layers.embed_sequence(inputs,
                                                              answers_num_words + 1,
                                                              encoder_embedding_size,
                                                              initializer = tf.random_uniform_initializer(0, 1))
    encoder_state = encoder_rnn(encoder_embedded_input, rnn_size, num_layers, keep_prob, sequence_length)
    preprocessed_targets = preprocess_targets(targets, questionswords2int, batch_size)
    decoder_embeddings_matrix = tf.Variable(tf.random_uniform([questions_num_words + 1, decoder_embedding_size], 0, 1))
    decoder_embedded_input = tf.nn.embedding_lookup(decoder_embeddings_matrix, preprocessed_targets)
    training_predictions, test_predictions = decoder_rnn(decoder_embedded_input,
                                                         decoder_embeddings_matrix,
                                                         encoder_state,
                                                         questions_num_words,
                                                         sequence_length,
                                                         rnn_size,
                                                         num_layers,
                                                         questionswords2int,
                                                         keep_prob,
                                                         batch_size)
    return training_predictions, test_predictions

# Starting of HyperParameters

In [78]:
epochs = 5
batch_size = 32
rnn_size = 512
num_layers = 512
encoding_embedding_size = 512
decoding_embedding_size = 512
learning_rate = 0.01
learning_rate_decay = 0.8
min_learning_rate = 0.0001
keep_probability = 0.5

# Defining a session

In [79]:
tf.reset_default_graph()
session = tf.InteractiveSession()

# Loading model inputs

In [80]:
inputs, targets, lr, keep_prob = model_inputs()

# Setting the sequence Link

In [81]:
sequence_length = tf.placeholder_with_default(25, None, name = 'sequence_length')

# Getting the shape of the input tensor

In [82]:
input_shape = tf.shape(inputs)

# Getting the training and test predictions

In [83]:
training_predictions, test_predictions = seq2seq_model(tf.reverse(inputs, [-1]),
                                                       targets,
                                                       keep_prob,
                                                       batch_size,
                                                       sequence_length,
                                                       len(answerswords2int),
                                                       len(questionswords2int),
                                                       encoding_embedding_size,
                                                       decoding_embedding_size,
                                                       rnn_size,
                                                       num_layers,
                                                       questionswords2int)
 

# Setting  up the loss error, the optimizer and gradient clipping

In [84]:
with tf.name_scope("optimization"):
    loss_error = tf.contrib.seq2seq.sequence_loss(training_predictions,
                                                  targets,
                                                  tf.ones([input_shape[0], sequence_length]))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients = optimizer.compute_gradients(loss_error)
    clipped_gradients = [(tf.clip_by_value(grad_tensor, -5., 5.), grad_variable) for grad_tensor, grad_variable in gradients if grad_tensor is not None]
    optimizer_gradient_clipping = optimizer.apply_gradients(clipped_gradients)

# Padding the sequences with PAD tokens

In [85]:
 def apply_padding(batch_of_sequences, word2int):
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

# splitting the data into batches of questions and answers

In [86]:
def split_into_batches(questions, answers, batch_size):
    for batch_index in range(0, len(questions) // batch_size):
        start_index = batch_index * batch_size
        questions_in_batch = questions[start_index : start_index + batch_size]
        answers_in_batch = answers[start_index : start_index + batch_size]
        padded_questions_in_batch = np.array(apply_padding(questions_in_batch, questionswords2int))
        padded_answers_in_batch = np.array(apply_padding(answers_in_batch, answerswords2int))
        yield padded_questions_in_batch, padded_answers_in_batch

# spliting questions and answers in training and validation sets

In [87]:
training_validation_split = int(len(sorted_clean_questions) * 0.15)
training_questions = sorted_clean_questions[training_validation_split:]
training_answers = sorted_clean_answers[training_validation_split:]
validation_questions = sorted_clean_questions[:training_validation_split]
validation_answers = sorted_clean_answers[:training_validation_split]

# Training

In [88]:
batch_index_check_training_loss = 100
batch_index_check_validation_loss = ((len(training_questions)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
early_stopping_stop = 1000
checkpoint = "E:\Data\cornell movie-dialogs corpus/chatbot_weights.ckpt" # For Windows users, replace this line of code by: checkpoint = "./chatbot_weights.ckpt"
session.run(tf.global_variables_initializer())
for epoch in range(1, epochs + 1):
    for batch_index, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(training_questions, training_answers, batch_size)):
        starting_time = time.time()
        _, batch_training_loss_error = session.run([optimizer_gradient_clipping, loss_error], {inputs: padded_questions_in_batch,
                                                                                               targets: padded_answers_in_batch,
                                                                                               lr: learning_rate,
                                                                                               sequence_length: padded_answers_in_batch.shape[1],
                                                                                               keep_prob: keep_probability})
        total_training_loss_error += batch_training_loss_error
        ending_time = time.time()
        batch_time = ending_time - starting_time
        if batch_index % batch_index_check_training_loss == 0:
            print('Epoch: {:>3}/{}, Batch: {:>4}/{}, Training Loss Error: {:>6.3f}, Training Time on 100 Batches: {:d} seconds'.format(epoch,
                                                                                                                                       epochs,
                                                                                                                                       batch_index,
                                                                                                                                       len(training_questions) // batch_size,
                                                                                                                                       total_training_loss_error / batch_index_check_training_loss,
                                                                                                                                       int(batch_time * batch_index_check_training_loss)))
            total_training_loss_error = 0
        if batch_index % batch_index_check_validation_loss == 0 and batch_index > 0:
            total_validation_loss_error = 0
            starting_time = time.time()
            for batch_index_validation, (padded_questions_in_batch, padded_answers_in_batch) in enumerate(split_into_batches(validation_questions, validation_answers, batch_size)):
                batch_validation_loss_error = session.run(loss_error, {inputs: padded_questions_in_batch,
                                                                       targets: padded_answers_in_batch,
                                                                       lr: learning_rate,
                                                                       sequence_length: padded_answers_in_batch.shape[1],
                                                                       keep_prob: 1})
                total_validation_loss_error += batch_validation_loss_error
            ending_time = time.time()
            batch_time = ending_time - starting_time
            average_validation_loss_error = total_validation_loss_error / (len(validation_questions) / batch_size)
            print('Validation Loss Error: {:>6.3f}, Batch Validation Time: {:d} seconds'.format(average_validation_loss_error, int(batch_time)))
            learning_rate *= learning_rate_decay
            if learning_rate < min_learning_rate:
                learning_rate = min_learning_rate
            list_validation_loss_error.append(average_validation_loss_error)
            if average_validation_loss_error <= min(list_validation_loss_error):
                print('I speak better now!!')
                early_stopping_check = 0
                saver = tf.train.Saver()
                saver.save(session, checkpoint)
            else:
                print("Sorry I do not speak better, I need to practice more.")
                early_stopping_check += 1
                if early_stopping_check == early_stopping_stop:
                    break
    if early_stopping_check == early_stopping_stop:
        print("My apologies, I cannot speak better anymore. This is the best I can do.")
        break
print("Game Over")        

Epoch:   1/1, Batch:    0/28892, Training Loss Error:  0.091, Training Time on 100 Batches: 121 seconds
Epoch:   1/1, Batch:  100/28892, Training Loss Error:  3.792, Training Time on 100 Batches: 93 seconds
Epoch:   1/1, Batch:  200/28892, Training Loss Error:  2.771, Training Time on 100 Batches: 46 seconds
Epoch:   1/1, Batch:  300/28892, Training Loss Error:  3.057, Training Time on 100 Batches: 45 seconds
Epoch:   1/1, Batch:  400/28892, Training Loss Error:  2.745, Training Time on 100 Batches: 47 seconds
Epoch:   1/1, Batch:  500/28892, Training Loss Error:  2.909, Training Time on 100 Batches: 43 seconds
Epoch:   1/1, Batch:  600/28892, Training Loss Error:  2.945, Training Time on 100 Batches: 37 seconds
Epoch:   1/1, Batch:  700/28892, Training Loss Error:  2.928, Training Time on 100 Batches: 93 seconds
Epoch:   1/1, Batch:  800/28892, Training Loss Error:  2.841, Training Time on 100 Batches: 90 seconds
Epoch:   1/1, Batch:  900/28892, Training Loss Error:  2.803, Training T

Epoch:   1/1, Batch: 8000/28892, Training Loss Error:  2.628, Training Time on 100 Batches: 21 seconds
Epoch:   1/1, Batch: 8100/28892, Training Loss Error:  2.531, Training Time on 100 Batches: 31 seconds
Epoch:   1/1, Batch: 8200/28892, Training Loss Error:  2.656, Training Time on 100 Batches: 23 seconds
Epoch:   1/1, Batch: 8300/28892, Training Loss Error:  2.655, Training Time on 100 Batches: 44 seconds
Epoch:   1/1, Batch: 8400/28892, Training Loss Error:  2.676, Training Time on 100 Batches: 26 seconds
Epoch:   1/1, Batch: 8500/28892, Training Loss Error:  2.521, Training Time on 100 Batches: 38 seconds
Epoch:   1/1, Batch: 8600/28892, Training Loss Error:  2.655, Training Time on 100 Batches: 21 seconds
Epoch:   1/1, Batch: 8700/28892, Training Loss Error:  2.636, Training Time on 100 Batches: 62 seconds
Epoch:   1/1, Batch: 8800/28892, Training Loss Error:  2.460, Training Time on 100 Batches: 55 seconds
Epoch:   1/1, Batch: 8900/28892, Training Loss Error:  2.514, Training Ti

Epoch:   1/1, Batch: 15900/28892, Training Loss Error:  2.547, Training Time on 100 Batches: 34 seconds
Epoch:   1/1, Batch: 16000/28892, Training Loss Error:  2.556, Training Time on 100 Batches: 55 seconds
Epoch:   1/1, Batch: 16100/28892, Training Loss Error:  3.492, Training Time on 100 Batches: 28 seconds
Epoch:   1/1, Batch: 16200/28892, Training Loss Error: 25.454, Training Time on 100 Batches: 58 seconds
Epoch:   1/1, Batch: 16300/28892, Training Loss Error:  4.196, Training Time on 100 Batches: 53 seconds
Epoch:   1/1, Batch: 16400/28892, Training Loss Error:  3.746, Training Time on 100 Batches: 65 seconds
Epoch:   1/1, Batch: 16500/28892, Training Loss Error:  3.760, Training Time on 100 Batches: 37 seconds
Epoch:   1/1, Batch: 16600/28892, Training Loss Error:  3.383, Training Time on 100 Batches: 56 seconds
Epoch:   1/1, Batch: 16700/28892, Training Loss Error:  3.326, Training Time on 100 Batches: 44 seconds
Epoch:   1/1, Batch: 16800/28892, Training Loss Error:  3.266, T

Epoch:   1/1, Batch: 23800/28892, Training Loss Error:  2.600, Training Time on 100 Batches: 37 seconds
Epoch:   1/1, Batch: 23900/28892, Training Loss Error:  2.677, Training Time on 100 Batches: 37 seconds
Epoch:   1/1, Batch: 24000/28892, Training Loss Error:  2.645, Training Time on 100 Batches: 36 seconds
Epoch:   1/1, Batch: 24100/28892, Training Loss Error:  2.652, Training Time on 100 Batches: 97 seconds
Epoch:   1/1, Batch: 24200/28892, Training Loss Error:  2.843, Training Time on 100 Batches: 33 seconds
Epoch:   1/1, Batch: 24300/28892, Training Loss Error:  2.814, Training Time on 100 Batches: 40 seconds
Epoch:   1/1, Batch: 24400/28892, Training Loss Error:  2.627, Training Time on 100 Batches: 60 seconds
Epoch:   1/1, Batch: 24500/28892, Training Loss Error:  2.642, Training Time on 100 Batches: 41 seconds
Epoch:   1/1, Batch: 24600/28892, Training Loss Error:  2.566, Training Time on 100 Batches: 55 seconds
Epoch:   1/1, Batch: 24700/28892, Training Loss Error:  2.625, T

In [None]:
checkpoint = "E:\Data\cornell movie-dialogs corpus/chatbot_weights.ckpt"
session = tf.InteractiveSession()
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(session, checkpoint)

In [97]:
def convert_string2int(question, word2int):
    question = clean_text(question)
    return [word2int.get(word, word2int['<OUT>']) for word in question.split()]

In [None]:
while(True):
    question = input("You: ")
    if question == 'Goodbye':
        break
    question = convert_string2int(question, questionswords2int)
    question = question + [questionswords2int['<PAD>']] * (25 - len(question))
    fake_batch = np.zeros((batch_size, 25))
    fake_batch[0] = question
    predicted_answer = session.run(test_predictions, {inputs: fake_batch, keep_prob: 0.5})[0]
    answer = ''
    for i in np.argmax(predicted_answer, 1):
        if answersints2word[i] == 'i':
            token = ' I'
        elif answersints2word[i] == '<EOS>':
            token = '.'
        elif answersints2word[i] == '<OUT>':
            token = 'out'
        else:
            token = ' ' + answersints2word[i]
        answer += token
        if token == '.':
            break
    print('ChatBot: ' + answer)

You: hii
ChatBot:  mechanical cue cue cue cue lie! addressed addressed addressed addressed addressed addressed patience patience patience patience addressed addressed addressed addressed addressed patience patience patience patience
