In [1]:
import sys
import scipy as sc
import scipy.io
import pandas as pd
import numpy as np
%cd /content/drive/MyDrive/Dataset

def get_coco_features(split, types ):
    """Build the VGG image-feature matrix for one split of the QA data.

    The unique ``image_id`` values of the split's QA pickle are looked up in
    the id-map text file (each line: ``<image_id> <column index>``) and the
    matching columns of the ``feats`` array in the ``.mat`` file are stacked
    as rows.

    Args:
        split: ``'train'`` or ``'val'``.
        types: ``"small"`` or ``"full"``; selects how many images are kept.

    Returns:
        A float32 array of shape ``(n_images, n_feature_dims)`` with at most
        the subset size for ``types`` rows, in order of first appearance of
        each image id in the QA pickle.

    Raises:
        SystemExit: if ``split`` is not recognised (after printing a message).
        ValueError: if ``types`` is not recognised.
        KeyError: if a selected image id is missing from the id map.
    """
    # NOTE(review): the train path lives under 'Project data' while the val
    # path lives under 'Project dataset' -- confirm both directories exist.
    if split == 'train':
        data_path = '/content/drive/MyDrive/Project data/VQA/Training Data QA.pickle'
        subset_sizes = {"small": 40000, "full": 82783}
    elif split == 'val':
        data_path = '/content/drive/MyDrive/Project dataset/VQA/Validation Data QA.pickle'
        subset_sizes = {"small": 8000, "full": 40504}
    else:
        print('Invalid split!')
        sys.exit()

    # Validate before any expensive loading; previously an unknown value only
    # failed at the final slice with an unbound ``num_data``.
    if types not in subset_sizes:
        raise ValueError('types must be "small" or "full", got %r' % (types,))
    num_data = subset_sizes[types]

    id_map_path = '/content/drive/MyDrive/Dataset/coco_vgg_id_map.txt'
    features_path = '/content/drive/MyDrive/Project dataset/Data/coco/vgg_feats.mat'

    # Only the first ``num_data`` unique ids are ever returned, so only those
    # need to be gathered.
    image_ids = pd.read_pickle(data_path)['image_id'].drop_duplicates().tolist()[:num_data]

    id_map = {}
    with open(id_map_path) as id_map_file:
        for line in id_map_file.read().splitlines():
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines in the map file
            id_map[int(fields[0])] = int(fields[1])

    features_struct = sc.io.loadmat(features_path)
    VGGfeatures = features_struct['feats']  # indexed as (feature dim, image column)

    columns = [id_map[image_id] for image_id in image_ids]
    # ``astype``/``asarray`` return a new array; the converted result must be
    # the one that is returned, otherwise the caller still gets float64.
    return np.asarray(VGGfeatures[:, columns].T, dtype='float32')

/content/drive/MyDrive/Dataset


In [2]:
import numpy as np
import h5py
# Build the "small" image-feature matrices for the train and val splits.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so at
# least one of the hard-coded paths inside get_coco_features did not exist when
# it was last run -- verify them before relying on the variables below.
print('Loading image features ...')
small_img_features_train = get_coco_features('train', types = "small")
small_img_features_val = get_coco_features('val', types = "small")

Loading image features ...


FileNotFoundError: ignored

In [None]:
%cd /content/drive/MyDrive/VQA/Preprocessed Data

# Persist both image-feature matrices as HDF5 files in the current directory.
# The context managers close each file even if create_dataset raises.
with h5py.File('small_img_features_train.h5', 'w') as h5_feats:
    h5_feats.create_dataset('small_img_features_train', data = small_img_features_train)

with h5py.File('small_img_features_val.h5', 'w') as h5_feats_val:
    h5_feats_val.create_dataset('small_img_features_val', data = small_img_features_val)

In [None]:
import nltk
# Fetch NLTK's "punkt" resource; presumably required by the word_tokenize call
# in get_questions_matrix -- confirm.
nltk.download("punkt")

In [None]:
from scipy import io
import operator
import sys
import scipy as sc
from collections import defaultdict
from nltk import word_tokenize
import pandas as pd
import numpy as np
import pickle
import h5py
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [None]:
def get_question_tokenizer(types):
    """Fit a Keras tokenizer on all questions and encode both splits.

    The tokenizer is fitted on the training and validation questions together,
    pickled to ``tokenizer.pickle`` in the current directory, and then used to
    turn the questions into integer sequences padded/truncated to length 25.

    Args:
        types: ``"small"`` or ``"full"``; selects how many questions of each
            split are encoded and returned.

    Returns:
        ``(train_sequences, val_sequences, word_index)`` where the first two
        are arrays with 25 columns and ``word_index`` is the tokenizer's
        complete word -> integer mapping (it is not limited to ``num_words``).

    Raises:
        ValueError: if ``types`` is not recognised.
    """
    subset_sizes = {"small": (120000, 24000), "full": (248349, 121512)}
    # Fail before reading data or writing tokenizer.pickle; previously an
    # unknown value only surfaced at the return statement.
    if types not in subset_sizes:
        raise ValueError('types must be "small" or "full", got %r' % (types,))
    num_data, num_data_val = subset_sizes[types]

    data_path = "Training Data QA.pickle"
    data_path_val = "Validation Data QA.pickle"

    questions = pd.read_pickle(data_path)['questions'].values.tolist()
    questions_val = pd.read_pickle(data_path_val)['questions'].values.tolist()

    # The vocabulary is always built from every question of both splits,
    # regardless of the requested subset size.
    tokenizer = Tokenizer(num_words = 10000)
    tokenizer.fit_on_texts(questions + questions_val)
    word_index = tokenizer.word_index

    # Save the tokenizer so the same encoding can be reused when predicting
    # answers for new questions.
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Each question is encoded independently, so slicing before encoding gives
    # the same rows as slicing afterwards while skipping unused work.
    train_sequences = pad_sequences(
        tokenizer.texts_to_sequences(questions[0:num_data]), maxlen = 25)
    val_sequences = pad_sequences(
        tokenizer.texts_to_sequences(questions_val[0:num_data_val]), maxlen = 25)

    return train_sequences, val_sequences, word_index
  
def get_questions_matrix(split):
    """Encode the questions of one split with a pre-built word index.

    Each question is split into words with ``word_tokenize``; every word is
    replaced by its index from ``load_idx()`` (0 when the word is unknown) and
    the resulting sequences are padded to a common length.

    NOTE(review): ``load_idx`` is not defined in the visible part of this
    notebook -- confirm it is available before calling this function.

    Args:
        split: ``'train'`` or ``'val'``.

    Returns:
        An int32 array with one row per question.

    Raises:
        SystemExit: if ``split`` is not recognised (after printing a message).
    """
    if split == 'train':
        data_path = 'data_train_qa.pickle'
    elif split == 'val':
        data_path = 'data_val_qa.pickle'
    else:
        print('Invalid split!')
        sys.exit()

    questions = pd.read_pickle(data_path)['questions'].values.tolist()
    word_idx = load_idx()

    seq_list = [
        [word_idx.get(word, 0) for word in word_tokenize(question)]
        for question in questions
    ]
    question_matrix = pad_sequences(seq_list)

    # ``astype`` returns a new array; return the converted one instead of
    # discarding it.
    return question_matrix.astype('int32')

In [None]:
# Encode the "small" question subset for both splits; word_idx is the fitted
# tokenizer's word -> index mapping returned alongside the sequences.
small_question_train_tokenize, small_question_val_tokenize, word_idx = get_question_tokenizer(types = "small")

In [None]:
# Persist the encoded question matrices as HDF5 files in the current directory.
# The context managers close each file even if create_dataset raises.
with h5py.File('small_question_train_tokenize.h5', 'w') as h5f:
    h5f.create_dataset('small_question_train_tokenize', data=small_question_train_tokenize)

with h5py.File('small_question_val_tokenize.h5', 'w') as h5f_val:
    h5f_val.create_dataset('small_question_val_tokenize', data=small_question_val_tokenize)

In [None]:
import pickle

# Save the word -> index mapping; the context manager closes the file even if
# pickling fails part-way.
with open("/content/drive/MyDrive/VQA/Preprocessed Data/word_idx.pickle", "wb") as file:
    pickle.dump(word_idx, file)


In [None]:
import numpy as np
def loadGloveModel(gloveFile, word_index, embedding_dim=300):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each line of the file is expected to be ``<token> v1 ... v<embedding_dim>``
    separated by single spaces.  The vector is always taken from the last
    ``embedding_dim`` fields, so tokens that themselves contain spaces are
    parsed correctly instead of breaking the float conversion.

    Args:
        gloveFile: path to the UTF-8 GloVe text file.
        word_index: mapping ``word -> row index`` (indices are used directly
            as row numbers, so they must be < ``len(word_index) + 1``).
        embedding_dim: number of vector components per line (default 300).

    Returns:
        A float64 array of shape ``(len(word_index) + 1, embedding_dim)``;
        rows of words missing from the GloVe file stay all-zero.
    """
    print("Loading Glove Model")
    embedding_index = {}
    # The context manager guarantees the (large) file is closed afterwards.
    with open(gloveFile, 'r', encoding='utf8') as glove_handle:
        print("Opened!")
        for line in glove_handle:
            fields = line.rstrip().split(' ')
            if len(fields) <= embedding_dim:
                continue  # blank or truncated line: no token + full vector
            word = ' '.join(fields[:-embedding_dim])
            embedding_index[word] = np.asarray(fields[-embedding_dim:], dtype='float32')

    print("Done.",len(embedding_index)," words loaded!")

    # Row 0 is left as zeros; rows are addressed by the indices in word_index.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    print(embedding_matrix.shape)

    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            # Words not found in the embedding index keep their all-zero row.
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# Reload the saved word -> index mapping and build the GloVe embedding matrix.
gloveFile = '/content/drive/MyDrive/Project dataset/glove.840B.300d.txt'
# NOTE: pickle.load can execute arbitrary code; only load files you created.
# The context manager closes the file even if unpickling fails.
with open("/content/drive/MyDrive/VQA/Preprocessed Data/word_idx.pickle", "rb") as file:
    word_idx = pickle.load(file)
embedding_matrix_tokenize = loadGloveModel(gloveFile, word_idx)

In [None]:
# Save the embedding matrix as a pickle; the context manager closes the file
# even if pickling fails part-way.
with open("/content/drive/MyDrive/VQA/Preprocessed Data/embedding_matrix_tokenize.pickle", "wb") as file:
    pickle.dump(embedding_matrix_tokenize, file)

In [None]:
# Save the embedding matrix as HDF5 as well; the context manager closes the
# file even if create_dataset raises.
with h5py.File('/content/drive/MyDrive/VQA/Preprocessed Data/embedding_matrix_tokenize.h5', 'w') as h5_feats:
    h5_feats.create_dataset('embedding_matrix_tokenize', data = embedding_matrix_tokenize)

In [None]:
# Sanity check: encode a single sample question with the pickled tokenizer.
questions = ["what is the stripe on the train"]
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer file you
# created yourself.
with open('/content/drive/MyDrive/VQA/Preprocessed Data/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)


# Convert the words to integer ids, then pad to a fixed length.
train_question_tokenized = tokenizer.texts_to_sequences(questions)      
# NOTE(review): maxlen is 30 here, while get_question_tokenizer pads to 25 --
# confirm which length the downstream model expects.
questions = pad_sequences(train_question_tokenized, maxlen = 30)          # 1 x 30 (a single question)
questions

In [None]:
def _answer_text(answer):
    """Return the lower-cased text of one ``multiple_choice_answer`` value.

    The column is read with ``df['multiple_choice_answer']``.  If its values
    are plain strings (assumed -- confirm against the pickle), indexing them
    with ``[0]`` would keep only the first character, so the value is used
    whole.  One-element list/tuple/array values are still unwrapped so either
    layout is handled.
    """
    if isinstance(answer, (list, tuple, np.ndarray)):
        answer = answer[0]
    return str(answer).lower()


def int_to_answers():
    """Return up to 1000 of the most frequent training answers.

    Answers are lower-cased before counting; the list is ordered from most to
    least frequent, so an answer's position is its class index.
    """
    data_path = '/content/drive/MyDrive/VQA/Preprocessed Data/Training Data QA.pickle'
    df = pd.read_pickle(data_path)
    freq = defaultdict(int)
    for answer in df['multiple_choice_answer'].values.tolist():
        freq[_answer_text(answer)] += 1
    ranked = sorted(freq.items(), key=operator.itemgetter(1), reverse=True)[0:1000]
    return [text for text, _count in ranked]


top_answers = int_to_answers()


def answers_to_onehot():
    """Map each top answer to a length-1001 one-hot vector.

    Position ``i`` is set for the ``i``-th most frequent answer; index 1000 is
    never set here and is used by ``get_answers_matrix`` for all other answers.
    """
    answer_to_onehot = {}
    for i, word in enumerate(int_to_answers()):
        onehot = np.zeros(1001)
        onehot[i] = 1.0
        answer_to_onehot[word] = onehot
    return answer_to_onehot


answer_to_onehot_dict = answers_to_onehot()


def get_answers_matrix(split, types):
    """Build the one-hot answer matrix for one split.

    Args:
        split: ``'train'`` or ``'val'``.
        types: ``"small"`` or ``"full"``; selects how many answers are kept.

    Returns:
        An int32 array of shape ``(n_answers, 1001)``.  Each row is the one-hot
        vector of the answer from ``answer_to_onehot_dict``; answers outside
        the top list are assigned to column 1000.

    Raises:
        SystemExit: if ``split`` is not recognised (after printing a message).
        ValueError: if ``types`` is not recognised.
    """
    if split == 'train':
        data_path = '/content/drive/MyDrive/VQA/Preprocessed Data/Training Data QA.pickle'
        subset_sizes = {"small": 120000, "full": 248349}
    elif split == 'val':
        # The validation split must read the validation pickle; it previously
        # re-read the training file.
        data_path = '/content/drive/MyDrive/VQA/Preprocessed Data/Validation Data QA.pickle'
        subset_sizes = {"small": 24000, "full": 121512}
    else:
        print('Invalid split!')
        sys.exit()

    # Validate before loading; an unknown value used to fail only at the final
    # slice with an unbound ``num_data``.
    if types not in subset_sizes:
        raise ValueError('types must be "small" or "full", got %r' % (types,))
    num_data = subset_sizes[types]

    df = pd.read_pickle(data_path)
    # Only the first ``num_data`` rows are returned, so only those are encoded.
    answers = df['multiple_choice_answer'].values.tolist()[0:num_data]

    # Allocate as int32 directly: calling ``astype`` without using its result
    # left the matrix as float64.
    answer_matrix = np.zeros((len(answers), 1001), dtype='int32')
    default_onehot = np.zeros(1001)
    default_onehot[1000] = 1.0

    for i, answer in enumerate(answers):
        answer_matrix[i] = answer_to_onehot_dict.get(_answer_text(answer), default_onehot)

    return answer_matrix

In [None]:
import numpy as np
import h5py
%cd /content/drive/MyDrive/VQA/Preprocessed Data
# Encode the "small" answer subsets and persist them as HDF5 files in the
# current directory.
print('Loading answers ...')
small_answers_train = get_answers_matrix('train', types = "small")
small_answers_val = get_answers_matrix('val', types = "small")


# The context managers close each file even if create_dataset raises.
with h5py.File('small_answers_train.h5', 'w') as h5_ans:
    h5_ans.create_dataset('small_answers_train', data = small_answers_train)

with h5py.File('small_answers_val.h5', 'w') as h5_ans_val:
    h5_ans_val.create_dataset('small_answers_val', data = small_answers_val)