In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from autocorrect import Speller
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
import tensorflow as tf
from tensorboard.plugins import projector
# Instantiate a ProjectorConfig from TensorBoard's projector plugin.
# NOTE(review): nothing in the visible cells reads `sconfig`; presumably it is
# filled in later for embedding visualisation -- confirm or remove.
sconfig = projector.ProjectorConfig()

In [32]:
# --- Configuration -----------------------------------------------------------
# Number of tokens every processed utterance is truncated / padded to
# (read by process_str).
MAX_LEN = 25
# NOTE(review): BATCH_SIZE and NUM_EPOCHS are not referenced in the visible
# cells; presumably consumed by the training code -- confirm.
BATCH_SIZE = 64
NUM_EPOCHS = 10_000

# --- Shared NLP helpers used by process_str ----------------------------------
stemmer = PorterStemmer()
spell = Speller(lang="en")
def process_str(string, bot_input=False, bot_output=False):
    """Normalise raw text into a fixed-length, space-separated token string.

    Steps, in order:
      1. Strip and lower-case the text, then replace every character outside
         ``A-Za-z0-9(),!?'`:`` with a space.
      2. Detach the contraction suffixes ``'s 've n't 're 'd 'll`` and put
         spaces around ``,`` and ``!`` so each becomes its own token.
      3. Split on single spaces; per token, replace digit runs with ``NUM``,
         squeeze any run of a repeated character down to two, stem it with the
         module-level ``stemmer`` and spell-correct it with the module-level
         ``spell``.  The result is lower-cased last, so the ``NUM`` placeholder
         does not stay upper-case in the output.
      4. Drop empty tokens, truncate, add the optional marker token and pad
         with ``</pad>`` up to ``MAX_LEN`` tokens.

    Args:
        string: Raw text to process.
        bot_input: When true, keep at most ``MAX_LEN - 1`` tokens and prepend
            ``</start>``.  Takes precedence over ``bot_output``.
        bot_output: When true (and ``bot_input`` is false), keep at most
            ``MAX_LEN - 1`` tokens and append ``</end>``.

    Returns:
        A ``(text, length)`` tuple: ``text`` is the padded, space-joined token
        string and ``length`` is the token count before padding (including any
        ``</start>`` / ``</end>`` marker).
    """
    text = string.strip().lower()
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`:]", " ", text)

    # Applied strictly in this order; the last rule collapses the extra
    # whitespace the earlier rules introduce.
    substitutions = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    tokens = []
    for token in text.split(" "):
        token = re.sub(r"[0-9]+", "NUM", token)
        # "sooooo" -> "soo": cap any repeated-character run at two.
        token = re.sub(r"(.)\1+", r"\1\1", token)
        token = spell.autocorrect_word(stemmer.stem(token)).lower()
        # Filter empties in the same pass instead of looping on list.remove()
        # inside a bare ``except`` (which also swallowed KeyboardInterrupt).
        if token:
            tokens.append(token)

    if bot_input:
        tokens = ["</start>"] + tokens[:MAX_LEN - 1]
    elif bot_output:
        tokens = tokens[:MAX_LEN - 1] + ["</end>"]
    else:
        tokens = tokens[:MAX_LEN]

    old_len = len(tokens)
    # A non-positive difference yields an empty list, i.e. no padding.
    tokens.extend(["</pad>"] * (MAX_LEN - old_len))

    # Final whitespace normalisation kept in case a token itself carries
    # whitespace after stemming / spell-correction.
    return re.sub(r"\s+", " ", " ".join(tokens)).strip(), old_len


### Load data

In [33]:
# Load the raw conversation records.  The context manager closes the file
# handle as soon as loading finishes.
# NOTE: pickle can execute arbitrary code while loading -- only use an
# all_convos.pkl that comes from a trusted source.
with open("all_convos.pkl", "rb") as convo_file:
    data = pickle.load(convo_file)

In [34]:
# Number of records loaded from all_convos.pkl.
len(data)

10407

In [35]:
# Split every record into its two sides: element 0 goes to `user`,
# element 1 goes to `bot`.
user, bot = [], []
for record in data:
    user.append(record[0])
    bot.append(record[1])

In [36]:
# Preview the first five raw user messages (element 0 of each record).
user[:5]

["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
 'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?',
 "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help",
 'Hello, I am looking to book a vacation from Gotham City to Mos Eisley for $2100.']

In [37]:
# Preview the first five raw bot replies (element 1 of each record).
bot[:5]

['Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?',
 'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?',
 'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?',
 '',
 "Hi. Sorry, I can't find any trips from Gotham City to Mos Eisley for you."]

### Preprocessing

In [38]:
def _load_or_build(cache_path, build):
    """Return the object pickled at ``cache_path``; build and cache it on a miss.

    Args:
        cache_path: Path of the pickle cache file.
        build: Zero-argument callable that produces the object when no cache
            file exists.  It is not called on a cache hit.

    Returns:
        The cached or freshly built object.
    """
    if os.path.isfile(cache_path):
        # NOTE: pickle can execute arbitrary code while loading -- these files
        # must only ever be caches written by this notebook.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    result = build()
    # Write to a temp file and rename, so an interrupted run cannot leave a
    # truncated cache that a later run would try to load.
    tmp_path = cache_path + ".tmp"
    with open(tmp_path, "wb") as cache_file:
        pickle.dump(result, cache_file)
    os.replace(tmp_path, cache_path)
    return result


def _split_pairs(pairs):
    """Split ``(text, length)`` pairs into a list of texts and a list of lengths."""
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]


# process_str returns (padded_text, unpadded_length) for every message.
# The caches are keyed by file name only: delete the .pkl files after changing
# process_str or MAX_LEN, otherwise stale results are loaded.
user, user_lens = _split_pairs(
    _load_or_build(
        "user_processed.pkl",
        lambda: [process_str(item) for item in user],
    )
)

bot_inputs, bot_inp_lens = _split_pairs(
    _load_or_build(
        "bot_in_processed.pkl",
        lambda: [process_str(item, bot_input=True) for item in bot],
    )
)

bot_outputs, bot_out_lens = _split_pairs(
    _load_or_build(
        "bot_out_processed.pkl",
        lambda: [process_str(item, bot_output=True) for item in bot],
    )
)

KeyboardInterrupt: 