In [1]:
%cd "drive/My Drive/VQA-master"

/content/drive/My Drive/VQA-master


In [0]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image
from torch.utils import data
import numpy as np
import torch
import torch.optim as optim

# from models.Simple_CNN_LSTM_Model import SimpleCNNLSTM, SimpleLSTM
import models.Simple_CNN_LSTM_Model as models


In [0]:
# Run configuration.
# `batch_size` is passed to the DataLoaders and `print_every` controls how
# often the training/validation loops print a progress line.
# `use_image_features` is not referenced by any other cell visible here.
use_image_features = True
batch_size, print_every = 1000, 10

# Supported input modes, keyed 1..3; `model_input` selects the active one.
options = dict(
    enumerate(
        (
            "image, question",
            "image features, question",
            "concatenated image features, question embeddings",
        ),
        start=1,
    )
)
model_input = options[3]

#### Load data

In [0]:
# Answer labels, flattened to 1-D int64 tensors.
train_answers = np.load("train_answers.npy")
tensor_train_answers = torch.Tensor(train_answers).flatten().type(torch.long)

val_answers = np.load("val_answers.npy")
tensor_val_answers = torch.Tensor(val_answers).flatten().type(torch.long)

# For the two modes that feed images and questions as separate tensors:
# (train image file, validation image file, DataLoader worker count).
_image_inputs = {
    "image, question": ("train_image_list.npy", "val_image_list.npy", 0),
    "image features, question": ("train_image_features.npy", "val_image_features.npy", 2),
}

# NOTE: datasets are built via `torch.utils.data.*` rather than the `data`
# alias so this cell still works if `data` has been rebound in a later cell.
if model_input in _image_inputs:
    train_image_file, val_image_file, num_workers = _image_inputs[model_input]

    train_images = np.load(train_image_file)
    train_questions = np.load("train_questions.npy")

    val_images = np.load(val_image_file)
    val_questions = np.load("val_questions.npy")

    tensor_train_images = torch.Tensor(train_images)
    tensor_train_questions = torch.Tensor(train_questions)

    tensor_val_images = torch.Tensor(val_images)
    tensor_val_questions = torch.Tensor(val_questions)

    trainset = torch.utils.data.TensorDataset(tensor_train_images, tensor_train_questions, tensor_train_answers)
    valset = torch.utils.data.TensorDataset(tensor_val_images, tensor_val_questions, tensor_val_answers)

elif model_input == "concatenated image features, question embeddings":
    train_input = np.load("combined_train_input.npy")
    val_input = np.load("combined_val_input.npy")

    tensor_train_input = torch.Tensor(train_input)
    tensor_val_input = torch.Tensor(val_input)

    trainset = torch.utils.data.TensorDataset(tensor_train_input, tensor_train_answers)
    valset = torch.utils.data.TensorDataset(tensor_val_input, tensor_val_answers)

    num_workers = 2

else:
    # Fail here with a clear message instead of a NameError on `trainset` below.
    raise ValueError(f"Unknown model_input: {model_input!r}")

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
# Validation only accumulates correct/total counts, so batch order is irrelevant
# and shuffling is unnecessary.
valloader = torch.utils.data.DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=num_workers)


#### Choose model

In [0]:
# Load the saved weights matrix as a tensor and build the model with it.
# The positional arguments 2048 and 1 are interpreted by SimpleCNNLSTM, whose
# definition is not visible in this notebook.
weights_matrix = torch.tensor(np.load("weights_matrix.npy"))
model = models.SimpleCNNLSTM(
    weights_matrix,
    2048,
    1,
    model_input=model_input,
)

#### Train

In [0]:
# Loss function and optimizer used by the training loop.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)
# Alternative kept for reference: optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [0]:
def get_accuracy(predictions, labels):
  """Count how many predicted classes match their labels.

  Args:
    predictions: tensor of per-class scores; the predicted class of each row
      is the index of its maximum along dimension 1.
    labels: tensor of class indices, compared element-wise with the
      predicted classes.

  Returns:
    A `(correct, total)` tuple: the number of matching predictions as a
    Python int, and the size of the first dimension of the comparison.
  """
  predicted = predictions.argmax(dim=1)
  # Compare as tensors on their own device; converting through `.numpy()`
  # raises for CUDA tensors and is not needed to count matches.
  matches = predicted == labels
  correct = int(matches.sum().item())
  total = matches.shape[0]
  return correct, total

In [8]:
NUM_EPOCHS = 500  # upper bound on passes over the training set


def _forward_batch(model, batch, model_input):
    """Unpack one DataLoader batch for the given input mode and run the model.

    Returns `(outputs, labels)` where `labels` is a flat int64 tensor.
    Raises ValueError if `model_input` is not one of the supported modes.
    """
    if model_input in ("image, question", "image features, question"):
        # Both modes yield (images or image features, questions, labels).
        visual, questions, labels = batch
        outputs = model(visual.type(torch.float32), questions)
    elif model_input == "concatenated image features, question embeddings":
        inputs, labels = batch
        outputs = model(inputs.type(torch.float32))
    else:
        raise ValueError(f"Unknown model_input: {model_input!r}")

    return outputs, labels.flatten().type(torch.long)


# The loop variable is named `batch`, not `data`: rebinding `data` would shadow
# the `torch.utils.data` module imported as `data` and break the data-loading
# cell on re-run.
for epoch in range(NUM_EPOCHS):

    # ---- training pass ----
    model.train()
    total_loss = 0.0  # sum of the per-batch losses over this epoch
    correct = 0
    total = 0

    for i, batch in enumerate(trainloader):
        if i % print_every == 0 and i > 0:
            print("Train batch ", i + 1)

        optimizer.zero_grad()
        outputs, labels = _forward_batch(model, batch, model_input)

        batch_correct, batch_total = get_accuracy(outputs, labels)
        correct += batch_correct
        total += batch_total

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # ---- validation pass (no gradients, eval mode) ----
    val_correct = 0
    val_total = 0
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(valloader):
            if i % print_every == 0 and i > 0:
                print("Val batch ", i + 1)

            outputs, labels = _forward_batch(model, batch, model_input)

            batch_correct, batch_total = get_accuracy(outputs, labels)
            val_correct += batch_correct
            val_total += batch_total

    print("Epoch: ",epoch+1," Loss: ",total_loss," Train-Accuracy: ", correct/total," Val-Accuracy: ",val_correct/val_total)


print('Finished Training')

Epoch:  1  Loss:  33.939743995666504  Train-Accuracy:  0.12251112672757086  Val-Accuracy:  0.23297491039426524
Epoch:  2  Loss:  33.367817878723145  Train-Accuracy:  0.23143593347388147  Val-Accuracy:  0.23297491039426524
Epoch:  3  Loss:  33.35927391052246  Train-Accuracy:  0.23518388381353947  Val-Accuracy:  0.21863799283154123
Epoch:  4  Loss:  33.361892223358154  Train-Accuracy:  0.2344811431248536  Val-Accuracy:  0.24014336917562723
Epoch:  5  Loss:  33.363544940948486  Train-Accuracy:  0.24267978449285546  Val-Accuracy:  0.23775388291517324
Epoch:  6  Loss:  33.28286170959473  Train-Accuracy:  0.24877020379479972  Val-Accuracy:  0.2270011947431302
Epoch:  7  Loss:  33.25136613845825  Train-Accuracy:  0.258608573436402  Val-Accuracy:  0.24133811230585425
Epoch:  8  Loss:  33.256996631622314  Train-Accuracy:  0.2595455610213165  Val-Accuracy:  0.24133811230585425
Epoch:  9  Loss:  33.21515893936157  Train-Accuracy:  0.27453736237994847  Val-Accuracy:  0.24611708482676226
Epoch:  10

KeyboardInterrupt: ignored