## Part 1a: 词汇嵌入

当加载我们的csv文件时，我们可以看到我们的问题和答案是以一种奇怪的方式排列。

最好的办法是在一栏中写上问题，另一栏中写上它所指的图片，第三栏中写上答案。

In [2]:
import pandas as pd 
# 从文件路径中读取数据
data = pd.read_csv("./data/raw_data/DAQUAR_train_raw.csv",header=None)
# 查看数据的前五行
for i in range(10):
    print(data[0][i])

data.head()

what is on the right side of the black telephone and on the left side of the red chair in the image3 ?
desk
what is in front of the white door on the left side of the desk in the image3 ?
telephone
what is on the desk in the image3 ?
book  scissor  papers  tape_dispenser
what is the largest brown objects in this image3 ?
carton
what color is the chair in front of the white wall in the image3 ?
red


Unnamed: 0,0
0,what is on the right side of the black telepho...
1,desk
2,what is in front of the white door on the left...
3,telephone
4,what is on the desk in the image3 ?


The questions are every even row and the answers are every odd row. So we go through each row, check if even or odd and rewrite correctly in a csv

In [3]:
import os 
import csv

def prepare_data(in_directory,out_directory, mode):
    # 找到原来的数据
    file_name_in=os.path.join(in_directory,'DAQUAR_{}_raw.csv'.format(str(mode)))
    file_name_out=os.path.join(out_directory,'DAQUAR_{}_processed.csv'.format(str(mode)))
    
    # 打开文件
    with open(file_name_in, 'r') as f, open(file_name_out, 'w', newline='') as f_out:
        reader = csv.reader(f)
        
        fieldnames=['question','image','answer']
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        
        writer.writeheader()
        
        # 问题列，答案列
        row_skip=2
        dico={'question':None,
              'image':None,
              'answer':None}

        for index, row in enumerate(reader):
            
            #even number = question
            if index % row_skip ==0:
                #split the question at the 'image' key word
                question_image_list=row[0].split('image')

                dico['question']=[question_image_list[0]]
                
                #remove the question-mark and rewrite 'image' -> useful for integrating visual features later
                dico['image']='image'+question_image_list[1].replace(' ?','')
            
            else:
                dico['answer']=row
                
                #write row in the csv
                writer.writerow({'question': dico['question'], 'image':dico['image'], 'answer': dico['answer']})

                dico={'question':None,
                 'image':None,
                'answer':None}

In [4]:
prepare_data(in_directory='./data/raw_data', out_directory='./data/processed_data', mode='train')
prepare_data(in_directory='./data/raw_data', out_directory='./data/processed_data', mode='test')

FileNotFoundError: [Errno 2] No such file or directory: './data/processed_data\\DAQUAR_train_processed.csv'

In [76]:
import pandas as pd 
data = pd.read_csv("./data/processed_data/DAQUAR_train_processed.csv",header=None)
data.head()

Unnamed: 0,0,1,2
0,question,image,answer
1,['﻿what is on the right side of the black tele...,image3,['desk']
2,['what is in front of the white door on the le...,image3,['telephone']
3,['what is on the desk in the '],image3,['book scissor papers tape_dispenser']
4,['what is the largest brown objects in this '],image3,['carton']


## Part 1b: 创建单词列表

In [5]:
import pandas as pd

## 读取csv文件

train_dir='./data/processed_data/DAQUAR_train_processed.csv'
test_dir='./data/processed_data/DAQUAR_test_processed.csv'

data_train=pd.read_csv(train_dir)
data_test=pd.read_csv(test_dir)

In [6]:
# 确保tensorflow的版本不是2.0
import tensorflow as tf
print(tf.__version__)
from keras.preprocessing.text import Tokenizer

# 创建Tokenizer实例
MAX_WORDS = 3000
tokenizer = Tokenizer(num_words = MAX_WORDS, split=' ')

tokenizer.fit_on_texts(data_train['question'])
tokenizer.fit_on_texts(data_train['answer'])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.2.1


Using TensorFlow backend.


In [7]:
import numpy as np

# 因为GloVe的词汇量大约有400K，文件的大小非常大。
# 因此，我们首先在本地加载GloVe，以提取存在于我们训练词汇中的单词
# 并将其保存为一个numpy文件。
# 然后再把它上传到Google Colab上。
# 由此，我们从一个1GB的文件变成一个3.6MB的文件

def create_embedding_matrix(tokenizer,directory,embed_dims):
    embeddings_index = {}
    with open(directory,encoding='utf8') as f:
        #processing the text
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    # 得到我们在训练集中的单词列表
    word_index=tokenizer.word_index.items()
    # 单词嵌入的维度，这里取300
    EMBEDDING_DIM=embed_dims
    
    # 创建一个嵌入矩阵
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [15]:
directory='./data/embedding/glove.6B.300d.txt'
embed_dims=300

embedding_matrix=create_embedding_matrix(tokenizer, directory,embed_dims)
# 把它保存为npy文件
np.save('./data/embedding/glove_300d_embedding.npy', embedding_matrix)

## Part 2: 视觉特征
我们从VGG19的最后一层中获得视觉特征。并将这些特征中的每一个附加到正确的问题/答案上，作为一个deque中的元组，这就像一个列表，但更强大。然后，这个deque文件将被加载到我们的Colab笔记本的内存中，以便训练我们的问题-回答模型

In [8]:
## 重复之前的步骤

import pandas as pd


train_dir='./data/processed_data/DAQUAR_train_processed.csv'
test_dir='./data/processed_data/DAQUAR_test_processed.csv'

data_train=pd.read_csv(train_dir)
data_test=pd.read_csv(test_dir)

In [9]:

import tensorflow as tf
print(tf.__version__)
from keras.preprocessing.text import Tokenizer

MAX_WORDS = 3000
tokenizer = Tokenizer(num_words = MAX_WORDS, split=' ')

tokenizer.fit_on_texts(data_train['question'])
tokenizer.fit_on_texts(data_train['answer'])

1.2.1


In [10]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences 
import numpy as np


def tokenization(tokenizer, length_of_sequence, dataset, multiple_answer=True):
    MAX_LEN=length_of_sequence

    seqs_question = tokenizer.texts_to_sequences(dataset['question'])
    seqs_answer = tokenizer.texts_to_sequences(dataset['answer'])

    #'post'时，如果句子太长，你会把句子末尾的字去掉
    pad_seqs_question = pad_sequences(seqs_question,MAX_LEN,truncating='post')
    pad_seqs_answer = pad_sequences(seqs_answer,MAX_LEN,truncating='post')

    #选择保留一个还是多个回答
    if multiple_answer is False:
        pad_seqs_answer_one_answer = pad_seqs_answer[:,[MAX_LEN-1]]
        return pad_seqs_question, dataset['image'], pad_seqs_answer_one_answer

    else:
        return pad_seqs_question, dataset['image'], pad_seqs_answer

In [11]:
#问题的最大词数
MAX_LEN=25

train_questions,train_images,train_answers = tokenization(tokenizer, MAX_LEN, data_train, multiple_answer=False)
test_questions,test_images,test_answers = tokenization(tokenizer, MAX_LEN, data_test, multiple_answer=False)

这是一个json文件，有我们需要的每张图片的所有视觉特征。它是一个字典，你可以通过使用'imageX'来调用一个图像


ex: feat['image3']

In [12]:
import json

# 加载视觉特征
with open('./data/img_features.json', 'r') as f:
    feat = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: './data/img_features.json'

In [92]:
from collections import deque

#使用try-catch结构防止图片的文件名写错了
def fill_deque_with_data(visual_features,questions,images,answers,a_deque):
    
    error=0
    index_error_images=[]

    for i in range(len(questions)):
        image_name=images[i]
        try:
            a_deque.append((questions[i],visual_features[image_name],answers[i]))
        except Exception as e:
            print(e)
            error+=1
            index_error_images.append(i)

    return error, index_error_images

In [99]:
import pickle

### TRAINING SET ####
train_deque=deque()
error, index = fill_deque_with_data(visual_features=feat,
                                    questions=train_questions,
                                    images=train_images,
                                    answers=train_answers,
                                    a_deque=train_deque)

#保存为txt文件
pickleFile = open("./data/processed_data/questions-visual_features-train.txt", 'wb')
pickle.dump(train_deque, pickleFile)
pickleFile.close()

'image10 behind the door frame in fornt of the cabinet in the '
'image912 close to the wall in the '
'image116 close to the shelf in the '
'image135 that is on the counter in the '
'image139 on the counter in the '
'image95 behind the clothes in the '
'image114 on the table in the '
'image929 in the '
'image1007 in the '
'image1008 in the '
'image1008 in the '
'image1035 in the '
'image1043 in the '


In [100]:
### TEST SET ####
test_deque=deque()
error, index = fill_deque_with_data(visual_features=feat,
                                    questions=test_questions,
                                    images=test_images,
                                    answers=test_answers,
                                    a_deque=test_deque)

#save as a text file 
pickleFile = open("./data/processed_data/questions-visual_features-test.txt", 'wb')
pickle.dump(test_deque, pickleFile)
pickleFile.close()

'image1206 on the floor in the '
'image1285 on the floor in the '
'image1170 which contains some book in the '
'image1400 in the mirror reflection in the '
'image155 made of in the '
'image168 in the '
'image1011 in the '
'image1407 in the '
