## CREATING QA BOT WITH PYTHON

In [3]:
import pickle
import numpy as np
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate, LSTM

Using TensorFlow backend.


### Load Data

In [4]:
# Read the pickled training examples from disk.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open('train_qa.txt', 'rb') as train_file:
    train_data = pickle.load(train_file)

In [5]:
# Read the pickled test examples from disk.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open('test_qa.txt', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [12]:
# Number of examples in the training set and in the test set
len(train_data), len(test_data)

(10000, 1000)

In [13]:
# Inspect the first training example: (story tokens, question tokens, answer)
train_data[0]

(['Mary',
  'moved',
  'to',
  'the',
  'bathroom',
  '.',
  'Sandra',
  'journeyed',
  'to',
  'the',
  'bedroom',
  '.'],
 ['Is', 'Sandra', 'in', 'the', 'hallway', '?'],
 'no')

#### The training data contains 10,000 tuples. Each tuple holds the story and the question as lists of tokens, followed by a "yes"/"no" answer.

In [15]:
# Story: tokens of the first example joined into one space-separated string
" ".join(train_data[0][0])

'Mary moved to the bathroom . Sandra journeyed to the bedroom .'

In [16]:
# Question: tokens of the first example joined into one space-separated string
" ".join(train_data[0][1])

'Is Sandra in the hallway ?'

In [19]:
# Answer: the label stored as the third element of the first example
train_data[0][2]

'no'

### Data Preparation

In [20]:
# Concatenate the training and test examples into a single collection.
all_data = train_data + test_data

In [21]:
def vocab_creator(data):
    '''
    Build a tokenizer over the vocabulary of `data` and measure its longest
    story and question.

    Parameters
    ----------
    data : iterable of (story, question, answer)
        `story` and `question` are sequences of word tokens. `answer` is not
        inspected; the two answer words 'yes' and 'no' are always added to
        the vocabulary. `data` is traversed exactly once, so a one-shot
        iterable (e.g. a generator) is accepted.

    Returns
    -------
    tokenizer : Tokenizer
        Fitted on every distinct story/question token plus 'yes' and 'no'.
    max_story_len : int
        Number of tokens in the longest story (0 when `data` is empty).
    max_question_len : int
        Number of tokens in the longest question (0 when `data` is empty).
    '''
    vocab = {'yes', 'no'}
    max_story_len = 0
    max_question_len = 0

    # Single pass: collect the vocabulary and track both maxima together.
    for story, question, _answer in data:
        vocab.update(story)
        vocab.update(question)
        max_story_len = max(max_story_len, len(story))
        max_question_len = max(max_question_len, len(question))

    # `filters` is documented as a string of characters to strip; the empty
    # string strips nothing.
    tokenizer = Tokenizer(filters='')

    # A set of strings has no stable iteration order across interpreter runs
    # (string hashing is randomized), so fit on the sorted vocabulary to give
    # the tokenizer the same input sequence every time the notebook is run.
    tokenizer.fit_on_texts(sorted(vocab))

    return tokenizer, max_story_len, max_question_len