In [None]:
import json
import pickle
from collections import Counter

import h5py
import nltk
#nltk.download('punkt') if needed
import numpy as np
import pandas as pd
import scipy as sc
import scipy.io  # importing the submodule makes sc.io.loadmat available
from keras.layers import Dense, Activation, Dropout, LSTM, Flatten, Embedding, Merge, Reshape
from keras.layers import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import base_filter
from keras.utils import np_utils

In [None]:
# Paths of the JSON inputs; left empty here and must be filled in before running.
train_question_file = ''
train_answer_file = ''
val_question_file = ''
val_answer_file = ''

# Top-level JSON key passed to json_pandas as `column`:
#   question files -> 'questions'
#   answer files   -> 'annotations'

def json_pandas(json_file,column):
    """Load one top-level entry of a JSON file into a DataFrame.

    Parameters
    ----------
    json_file : str
        Path of the JSON file to read.
    column : str
        Top-level key whose value is handed to ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``document[column]``.

    Raises
    ------
    KeyError
        If ``column`` is not a top-level key of the document.
    """
    # The parsed document must not be bound to the name `json`: assigning to
    # that name inside the function makes it local, so the `json.loads` call
    # itself raised UnboundLocalError.
    with open(json_file) as f:
        document = json.load(f)

    return pd.DataFrame(document[column])

# Question files are read through their 'questions' entry, answer files
# through 'annotations'; the validation files feed the test_* frames.
train_question, train_answer = (
    json_pandas(train_question_file, 'questions'),
    json_pandas(train_answer_file, 'annotations'),
)
test_question, test_answer = (
    json_pandas(val_question_file, 'questions'),
    json_pandas(val_answer_file, 'annotations'),
)

In [None]:
def clean_raw_data(ques_df,ans_df):
    """Combine a question frame and an answer frame into one training frame.

    Columns are picked by position: columns 0, 2 and 3 of ``ques_df`` and
    column 3 of ``ans_df`` (stored as ``'answer'``). The ``multiple_choices``
    and ``question_id`` columns are then removed when present.
    NOTE(review): later cells read ``image_id``, ``question`` and ``answer``
    from the result, so positions 0 and 2 of ``ques_df`` are presumably
    ``image_id`` and ``question`` - confirm against the JSON schema.

    Parameters
    ----------
    ques_df : pandas.DataFrame
        Question frame with at least four columns.
    ans_df : pandas.DataFrame
        Answer frame with at least four columns; rows are matched to
        ``ques_df`` by index label.

    Returns
    -------
    pandas.DataFrame
        New frame; neither input is modified.
    """
    # .ix no longer exists in current pandas; .iloc is the positional
    # equivalent. The explicit copy avoids assigning into a slice of ques_df.
    ques = ques_df.iloc[:, [0, 2, 3]].copy()
    ques['answer'] = ans_df.iloc[:, 3]
    # errors='ignore': 'multiple_choices' is not necessarily among the three
    # selected columns, and dropping a missing label would otherwise raise.
    return ques.drop(['multiple_choices', 'question_id'], axis=1, errors='ignore')

# Each split is built from its own question/answer frames. The test split
# previously reused the training frames, which made `test` a copy of `train`.
train = clean_raw_data(train_question,train_answer)
test = clean_raw_data(test_question,test_answer)

In [None]:
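# Optionally cache the cleaned frames on disk (pickle or HDF5) so later runs
# can skip the JSON parsing.
#train.to_pickle(file_name)
#test.to_pickle(file_name)
# or (to_hdf also needs a key naming the dataset inside the file)
#train.to_hdf(file_name, 'train')
#test.to_hdf(file_name, 'test')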

In [None]:
# Number of answers kept by the 'answer' branch of tokeniz.
max_word = 1000
# Passed as `maxlen` to pad_sequences and as `input_length` to the Embedding layer.
max_seq = 20

def dataframe_list(df):
    """Split a frame into parallel lists of questions, answers and image ids.

    The ``question`` and ``answer`` values are each encoded with
    ``.encode('ascii')``; ``image_id`` values are returned via ``tolist()``.

    Returns
    -------
    tuple
        ``(questions, answers, image_ids)``, all in row order.
    """
    def _ascii(values):
        # Encode every entry of one text column.
        encoded = []
        for text in values:
            encoded.append(text.encode('ascii'))
        return encoded

    questions = _ascii(df['question'].values)
    answers = _ascii(df['answer'].values)
    image_ids = df['image_id'].values.tolist()
    return questions, answers, image_ids

# Unpack both splits into parallel question / answer / image-id lists.
(train_q, train_a, train_img), (test_q, test_a, test_img) = (
    dataframe_list(train),
    dataframe_list(test),
)

def _fit_tokenizer(txt, max_word_size=None):
    """Fit a Keras Tokenizer on `txt` and return it."""
    tokenizer = Tokenizer(nb_words=max_word_size, filters=base_filter(), lower=True, split=" ")
    tokenizer.fit_on_texts(txt)
    return tokenizer


def _top_answers(txt, limit):
    """Return up to `limit` distinct answers from `txt`, most frequent first."""
    return [answer for answer, _ in Counter(txt).most_common(limit)]


def tokeniz(txt,mode='default',max_word_size=None,tokenizer=None,answer_vocab=None):
    """Turn raw texts into model inputs or targets.

    Parameters
    ----------
    txt : list
        Texts to encode.
    mode : str
        'default'  -> list of integer sequences, one per text.
        'question' -> integer sequences padded to `max_seq`.
        'answer'   -> one-hot matrix over the most frequent answers, each
                      answer treated as a single class (not split into words).
    max_word_size : int, optional
        Vocabulary limit for a tokenizer fitted in 'question' mode; number of
        classes in 'answer' mode (defaults to the module-level `max_word`).
    tokenizer : keras Tokenizer, optional
        Already fitted tokenizer to reuse. When omitted, one is fitted on
        `txt`. Pass the tokenizer fitted on the training texts when encoding
        other splits so that word indices agree.
    answer_vocab : list, optional
        Ordered class list for 'answer' mode. When omitted, it is derived
        from `txt`. Pass the training vocabulary when encoding other splits.

    Returns
    -------
    list or numpy.ndarray
        See `mode`. In 'answer' mode the array has shape
        (len(txt), number of classes); an answer that is not in the
        vocabulary gets an all-zero row.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported values.
    """
    # Modes are compared with ==; `is` tests object identity and only worked
    # by accident of string interning.
    if mode == 'default':
        if tokenizer is None:
            tokenizer = _fit_tokenizer(txt)
        return tokenizer.texts_to_sequences(txt)

    if mode == 'question':
        if tokenizer is None:
            tokenizer = _fit_tokenizer(txt, max_word_size)
        sequences = tokenizer.texts_to_sequences(txt)
        return pad_sequences(sequences, maxlen=max_seq)

    if mode == 'answer':
        n_classes = max_word if max_word_size is None else max_word_size
        if answer_vocab is None:
            answer_vocab = _top_answers(txt, n_classes)
        class_of = {answer: idx for idx, answer in enumerate(answer_vocab[:n_classes])}
        data = np.zeros((len(txt), n_classes))
        for row, answer in enumerate(txt):
            col = class_of.get(answer)
            if col is not None:
                data[row, col] = 1
        return data

    raise ValueError("unknown mode: %r" % (mode,))


# Vocabularies are learned from the training split only and reused for the
# test split; fitting each split separately would give the same word or
# answer different indices in train and test.
question_tokenizer = _fit_tokenizer(train_q)
answer_vocab = _top_answers(train_a, max_word)

X_train = tokeniz(train_q,'question',tokenizer=question_tokenizer)
Y_train = tokeniz(train_a,'answer',answer_vocab=answer_vocab)
X_test = tokeniz(test_q,'question',tokenizer=question_tokenizer)
Y_test = tokeniz(test_a,'answer',answer_vocab=answer_vocab)

In [None]:
def get_image_labels(pickle_file_path):
    """Read a pickled DataFrame and return its ``image_id`` column.

    The column is selected with a one-element list, so the result is a
    2-D array with a single column.
    """
    frame = pd.read_pickle(pickle_file_path)
    id_column = frame.loc[:, ['image_id']]
    return id_column.values

def get_image_features(img_ids,vgg_model_path,id_map=None):
    """Collect one feature vector per requested image.

    Parameters
    ----------
    img_ids : sequence
        Images to fetch, in output row order. Without `id_map`, every entry
        must be a whitespace-separated "<image id> <feature column>" string.
        With `id_map`, every entry must be a key of `id_map`.
    vgg_model_path : str
        Path of a .mat file whose 'feats' entry is indexed as
        ``feats[:, column]``, i.e. one column per image.
    id_map : mapping, optional
        Maps an image id to its column in 'feats'.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(img_ids), feats.shape[0]).

    Raises
    ------
    ValueError
        If `id_map` is omitted and an entry is not an "<id> <column>" string.
    KeyError
        If `id_map` is given and an id is missing from it.
    """
    features_struct = sc.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']

    if id_map is None:
        id_map = {}
        keys = []
        for entry in img_ids:
            id_split = entry.split() if hasattr(entry, 'split') else []
            if len(id_split) < 2:
                raise ValueError(
                    "expected '<image id> <feature column>', got %r" % (entry,))
            id_map[id_split[0]] = int(id_split[1])
            # Look up with the same key that was stored; the whole entry
            # string is never a key of id_map.
            keys.append(id_split[0])
    else:
        keys = list(img_ids)

    nb_samples = len(keys)
    nb_dimensions = VGGfeatures.shape[0]
    image_matrix = np.zeros((nb_samples, nb_dimensions))
    for j, key in enumerate(keys):
        image_matrix[j,:] = VGGfeatures[:,id_map[key]]
    return image_matrix

# Path of the .mat file holding the VGG 'feats' matrix; fill in before running.
vgg_model_path = ''

# get_image_features requires the feature-file path as its second argument;
# it was missing from both calls.
# NOTE(review): get_image_features parses each id as "<image id> <column>"
# text, while train_img / test_img hold the raw image_id values - confirm the
# expected id format before running.
X_train_img = get_image_features(train_img, vgg_model_path)
X_test_img = get_image_features(test_img, vgg_model_path)

In [None]:
embed_dim = 300

# NOTE: pickle.load can execute arbitrary code - only load files you created.
# Pickles are binary data, so the files are opened in 'rb' mode, and the open
# handle `f` (not the undefined name `file`) is what gets loaded.
with open('embeddings/embedding_matrix','rb') as f:
    embedding = pickle.load(f)
with open('embeddings/word_idx','rb') as f:
    word_idx = pickle.load(f)

# No earlier cell leaves `word_index` (question word -> integer id) at module
# level, so it is rebuilt here from the training questions with the same
# Tokenizer settings tokeniz uses for questions, which keeps the ids aligned
# with X_train.
question_vocab_tokenizer = Tokenizer(nb_words=None, filters=base_filter(), lower=True, split=" ")
question_vocab_tokenizer.fit_on_texts(train_q)
word_index = question_vocab_tokenizer.word_index

# Row i holds the pretrained vector of the word with id i; row 0 stays zero.
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, idx in word_index.items():
    # Words missing from the pretrained vocabulary keep an all-zero row.
    if word in word_idx:
        embedding_matrix[idx] = embedding[word_idx[word]]

# Model

In [None]:
# Dropout applied to the merged sequence before the LSTM. The name was used
# below without ever being defined.
# NOTE(review): 0.5 is a placeholder default - tune as needed.
dropout_rate = 0.5

# Image branch placed before the question: 4096-d VGG vector projected to the
# word-embedding size and reshaped into a one-step sequence so it can be
# concatenated with the embedded words along the time axis.
left_vgg = Sequential()
left_vgg.add(Dense(embed_dim, input_dim=4096, activation='relu'))
left_vgg.add(Reshape((1, embed_dim)))

# Question branch: frozen pretrained word embeddings, output (max_seq, embed_dim).
centre_w2vec = Sequential()
embedding_layer = Embedding(len(word_index) + 1,embed_dim,weights=[embedding_matrix],input_length=max_seq,trainable=False)
# The layer was created but never added, leaving this branch empty.
centre_w2vec.add(embedding_layer)
#centre_w2vec.add(Dense(500,input_dim=300))

# Image branch placed after the question, built like left_vgg.
right_vgg = Sequential()
right_vgg.add(Dense(embed_dim, input_dim=4096, activation='relu'))
right_vgg.add(Reshape((1, embed_dim)))

# Concatenate along the time axis: (1 + max_seq + 1, embed_dim).
merge_layer = Merge([left_vgg,centre_w2vec,right_vgg], mode='concat', concat_axis=1)

lstm_model = Sequential()
lstm_model.add(merge_layer)
lstm_model.add(Dropout(dropout_rate))
lstm_model.add(LSTM(1000, input_shape=(1+max_seq+1,embed_dim)))
# One output unit per answer class (max_word columns in the label matrix).
lstm_model.add(Dense(max_word, activation='softmax'))

lstm_model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# One input per merged branch, in the order left_vgg, centre_w2vec, right_vgg;
# the same image features are fed to both image branches.
lstm_model.fit([X_train_img, X_train, X_train_img], Y_train)