In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time 
import math
import matplotlib.pyplot as plt
torch.manual_seed(1)
from torch.utils.data import DataLoader
import h5py
from torch.utils.data import TensorDataset
import numpy as np
import cv2
np.random.seed(0)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import torchvision
from torchvision import datasets, transforms
from torchvision import models as torchmodels
torch.cuda.empty_cache()

In [2]:
def train_model(model,epochs,trainloader,validationloader,verbose=False,plot=False,save=False,loc='chpt'):
    '''
    args:
    
    model: pytorch model
    epochs: no of epochs
    trainloader: Data loader for train set
    validationloader: Data loader for validation set
    verbose: False/True : If true will print progress as we train 
    plot: If true will plot the model's loss, accuracy graphs.
    
    returns: 
            list[train_acc,valid_acc,train_loss,valid_loss]
    '''
    # LOSS FUNCTION
    
    loss_function = nn.BCELoss(reduction='mean')
    
    #OPTIMIZER
    
    optimizer = optim.Adam(model.parameters()) 
    train_loss, valid_loss = [], []
    train_acc,valid_acc=[],[]
    
    
    
    # TRAINING 
    for epoch in range(1, epochs+1):
        start_time = time.time()
        if verbose:
            print('epoch',epoch)
        ## training part 
        model.train()
        ta=0
        correctt=0
        correctv=0
        losst=0
        lossv=0
        t_k=0
        v_k=0
        c=1
        nb=math.ceil(train_size/batch_size)
        for data, target,idx_map in trainloader:
            c+=1
            t_k=t_k+1
            optimizer.zero_grad()
            
            imgs=load_batch_img('../data/train_images.hdf5',idx_map.cpu().numpy())
                
            output = model(imgs,data)
            del imgs
            torch.cuda.empty_cache()
            loss = loss_function(output, target)
            loss.backward()
            losst=losst+loss.item()
            optimizer.step()
            #print(output)
            #print(data.shape)
            acc=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
            correctt =correctt + acc
            if verbose:
                print('\r'+'batch_no :'+str(c)+' /'+str(nb)+' acc_pred: '+str(acc/batch_size),end='')
        train_acc.append(correctt/train_size)
        train_loss.append(loss.item())
        if verbose or (epoch-1)%10==0:
            print('time for epoch',time.time()-start_time)
            print('train_loss',losst/t_k)
            print('training accuracy',correctt/train_size)

            
            
            
            
        ## VALIDATION CHECK  
        model.eval()
        h=0

        for data, target,idx_map  in validationloader:
            v_k=v_k+1
            h+=1
            imgs=load_batch_img('../data/train_images.hdf5',idx_map.cpu().numpy())
                
            output = model(imgs,data)
            del imgs
            torch.cuda.empty_cache()
            loss = loss_function(output, target)
            lossv=lossv+loss.item()
            accv=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
            correctv =correctv + accv
            if verbose:
                print('\r'+'batch_no'+str(h)+' val_acc_pred: '+str(accv/batch_size),end='')
        valid_loss.append(lossv/v_k+1)
        valid_acc.append(correctv/val_size)
        
        if verbose or (epoch-1)%10==0:
            print('valid_loss',lossv/v_k+1)
            print('validation accuracy',correctv/val_size)
            
            
    # SAVING CHECKPOINT
    
    if save:
        torch.save({
                'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'model':model
                }, loc)
    
    print('train_loss',losst/t_k)
    print('valid_loss',lossv/v_k+1)
    print('training accuracy',correctt/train_size)
    print('validation accuracy',correctv/val_size)
    
    
    
    #PLOTTING 
    if plot:
        epc=np.arange(1,epochs+1)
        plt.plot(epc,train_acc)
        plt.plot(epc,valid_acc)
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
        
        plt.plot(epc,train_loss)
        plt.plot(epc,valid_loss)
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
        
        
        
    return train_acc,valid_acc,train_loss,valid_loss,optimizer

In [52]:
def test_model(model,x_test_t,y_test,x_idx,disp=False):
    '''
    args:
    
    model: pytorch model
    x_test: test  set features
    y_test: test set labels
    
    returns:
            -
    prints:
         accuracy
         precision
         recall
         f-score
         confusion matrix

    '''
    out=np.zeros((y_test.shape[0],2))
    ind=0
    
    y_test2=np.zeros((y_test.shape[0],2))
    
    for i in range(y_test.shape[0]):
        if y_test[i]:
            y_test2[i][1]=1

        else:
            y_test2[i][0]=1
    vocab_dict=np.load('../data/vocabDict.npy',allow_pickle=True).item()
    key_list = list(vocab_dict.keys()) 
    val_list = list(vocab_dict.values())
    y_test=y_test2
    

    x_vl_t = torch.tensor(x_test_t, dtype=torch.long,device=device)
    y_vl = torch.tensor(y_test, dtype=torch.float32,device=device)
    x_t_idx=torch.tensor(x_idx.astype('int32'),dtype=torch.long,device=device)
    testt = TensorDataset(x_vl_t, y_vl,x_t_idx)
    testloader = DataLoader(testt, batch_size=32)
    
    model.eval()
    v_k=0
    lossv=0
    correctv=0
    y_out=[]
    h=0
    for data, target,idx_map in testloader:
        h+=1
        ind_l=ind+target.shape[0]
        v_k=v_k+1

        imgs=load_batch_img('../data/val_images.hdf5',idx_map.cpu().numpy())
        
        if disp:
            s=''
            for i in data.cpu().numpy()[0]:
                if i in val_list:
                    s=s+' '+key_list[val_list.index(i)]
                else:
                    s=s+' '+'UNK'
            print(s)
            img=imgs.cpu().numpy()[0]
            img=np.moveaxis(img, 0, -1)
            plt.imshow(img.astype(int))
            plt.show()
            print(img.max())
            if target[0][0]==1:
                print('No')
            else:
                print('Yes')

            print('...........\n')
        
        
        output = model(imgs,data)
        
        out2=output.cpu()
        out2=out2.detach().numpy()
        out2=np.squeeze(out2)
        #print(out2.shape,'aaa')
        #print(out[ind:ind_l].shape,'aaaaa')
        #print(ind,ind_l)
        out[ind:ind_l]=out2
        ind=ind_l
        accv=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
        print('\r'+'batch_no'+str(h)+' test_acc_pred: '+str(accv/batch_size),end='')
        correctv =correctv + accv
        del imgs
        torch.cuda.empty_cache()
        
    print('\n accuracy')
    print(correctv/x_test.shape[0])
    y_true=y_test.argmax(axis=1)
    y_pred=out.argmax(axis=1)
    print('confusion matrix :\n',confusion_matrix(y_true,y_pred),'\n')
    print('f1 score matrix :\n',f1_score(y_true,y_pred,average='micro'),'\n')
    print('precision_score :\n',precision_score(y_true,y_pred,pos_label=1,average='micro'),'\n')
    print('recall_score :\n',recall_score(y_true,y_pred,pos_label=1,average='micro'),'\n')
    print('classification_report :\n',classification_report(y_true,y_pred),'\n')

In [4]:
def VGG_EXT(vgg):
    weights_4096=list(vgg.children())[-1][0].weight
    
    
    return nn.Sequential((*list(vgg.children())[:-1])),weights_4096


In [5]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [6]:
def trainable_prams(model):
    for p in model.parameters():
        if p.requires_grad:
            print(p.shape)

### BASIC MODEL

In [7]:
class base_enc(nn.Module):
    def __init__(self, vocab_size, embedding_dim,feat_ext_model,
                 hidden_size_lstm,weight_4096=None,emb_w=None,emb_Train=False,feat_ext_Train=False,dropout=0.4):
        super(base_enc,self).__init__()
        
        #IMAGE FEATURE EXTRACTOR 
        self.h_size=hidden_size_lstm
        self.feature_extractor = feat_ext_model 
        
        for param in self.feature_extractor.parameters():
            param.requires_grad = feat_ext_Train
        self.conv_linear1=nn.Linear(25088,4096)
        if weight_4096 is not None:
            self.conv_linear1.weight.data=weight_4096
            self.conv_linear1.weight.requires_grad=False

        self.conv_linear2=nn.Linear(4096,self.h_size*15)
        #TEXT FEATURE EXTRACTOR 

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if emb_w is not None:
            et = torch.tensor(emb_w, dtype=torch.float32,device=device)
            self.embedding.weight = nn.Parameter(et)
            self.embedding.weight.requires_grad = emb_Train
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(dropout)
        self.embed_size=embedding_dim
        self.lstm = nn.LSTM(self.embed_size, self.h_size,dropout=dropout)        
        #self.text_linear1= nn.Linear(self.h_size*15, 512)
        
        # General 
        self.nn0=nn.Linear(self.h_size*15,512)
        self.nn1=nn.Linear(512,128)
        self.nn2=nn.Linear(128,2)
        
        self.relu = nn.ReLU()
        self.tanh=nn.Tanh()
        self.softmax = nn.Softmax()
        
        
    def img_feat_ext(self,image):
        im1= self.feature_extractor(image)
        #print(im1.shape)
        im1=im1.view(im1.size(0), -1)
        im2=self.dropout(self.relu(self.conv_linear1(im1)))
        return self.relu(self.dropout(self.conv_linear2(im2)))
    
    def text_feat_ext(self,text):
        h_embedding = self.dropout(self.embedding(text))       
        h_lstm, _ = self.lstm(h_embedding)
        conc_h=h_lstm.view(h_lstm.size(0),-1)
        
        return conc_h
    
        
    def forward(self, image,text):
                
        image_emb=self.img_feat_ext(image)
        text_emb=self.text_feat_ext(text)
        
        ######## Concatenating feature embeddings
        
        #print(image_emb.shape)
        #joint_emb=torch.cat((image_emb, text_emb), 1)
        #print(joint_emb.shape)
        
        ###### addding embeddings
        joint_emb=image_emb*text_emb
        out1=self.dropout(self.relu(self.nn0(joint_emb)))
        out2=self.dropout(self.relu(self.nn1(out1)))
        out3=self.nn2(out2)
    
      
        return self.softmax(out3)

### LOADING DATA INTO MEMORY

if we have numpy arrays of training images, training text, and class labels it will be easier to load.

#### images of shape (no ofinstances ,3,224,224)

#### text of shape ( no of instances ,fixed_text_length_n).  ie(no_of_instances,(idx_word1,idx_word2,............,idx_word_n))

#### labels of shape (no of instances, class idx)

In [8]:
def load_batch(X_train_images,batch_no,batch_size=64): #batch number starting from 0,1,2,3.. batch size default = 64
    data={}
    start_in= batch_no*batch_size
    end_in= (batch_no+1)*batch_size
    #print (start_in, end_in)
    count = 0
    for i in X_train_images:
        if ((count >=start_in) and (count <end_in)): 
            data.update({i: X_train_images[i]}) 
        count +=1;
    return data;

In [9]:
def load_batch_img(file_loc,idx_map):
    imgs=np.zeros((idx_map.shape[0],3,224,224))
    with h5py.File(file_loc, 'r') as f:
        data = f['images']
        for i in range(idx_map.shape[0]):
            imgs[i]=data[idx_map[i]]
    return torch.tensor(imgs,dtype=torch.float,device=device)
    

In [10]:
#X_train_images=np.load('../data/X_train_img.npy')
X_train_text=np.load('../data/Question_train_pad.npy')
y_train=np.load('../data/answers_train.npy')
X_train_idx=np.load('../data/question_image_map_train_array.npy')

#X_test_images=np.load('../data/X_test_img.npy')


#X_val_images=np.load('../data/X_val_img.npy')
#X_val_text=np.load('../data/X_val_text.npy')
#y_val=np.load('../data/y_val2.npy')
#X_val_idx=np.load('../data/val_img_map.npy')


#emb_w=np.load('../data/word_embeddings.npy')
glove=np.load('../data/glove_embedding.npy')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
yes=0
no=0
y_o=np.zeros((y_train.shape[0],2))
for i in range(y_train.shape[0]):
    if y_train[i]:
        y_o[i][1]=1
        yes+=1
    else:
        y_o[i][0]=1
        no+=1
  
        

In [12]:
y_o,y_train,yes,no

(array([[1., 0.],
        [0., 1.],
        [1., 0.],
        ...,
        [0., 1.],
        [1., 0.],
        [0., 1.]]),
 array([False,  True, False, ...,  True, False,  True]),
 84615,
 82267)

### LOADING DATA INTO GPU AND CREATING DATA ITERATORS 

we can load into GPU batch by batch if encounter gpu size issues, but it will be slower

In [13]:
batch_size=78
x_tr_t = torch.tensor(X_train_text,dtype=torch.long,device=device)
y_tr = torch.tensor(y_o,dtype=torch.float,device=device)
x_tr_idx=torch.tensor(X_train_idx.astype('int32'),dtype=torch.long,device=device)
x_tr_idx.shape,x_tr_t.shape,y_tr.shape

(torch.Size([166882]), torch.Size([166882, 15]), torch.Size([166882, 2]))

In [14]:
train_size=x_tr_t.shape[0]
val_size=int(0.2*train_size)
train_size=train_size-val_size
dataset = TensorDataset(x_tr_t, y_tr,x_tr_idx)
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

trainloader = DataLoader(train_dataset, batch_size=batch_size)

validloader = DataLoader(valid_dataset, batch_size=batch_size)

### LOADING PRETRAINED VGG

In [15]:
VGG16 = torchmodels.vgg16_bn(pretrained=True)

In [16]:
#device=torch.device(device)
VGG_16,w_4096=VGG_EXT(VGG16)

In [17]:
#img=torch.zeros((10,3,224,224))
#text=torch.zeros((10,15),dtype=torch.long)
#vocab_size, embedding_dim,feat_ext_model,hidden_size_lstm,
model_basic=base_enc(10471,300,VGG_16,128,emb_Train=True,weight_4096=w_4096)
print(count_parameters(model_basic))
#trainable_prams(model_basic)
model_basic=model_basic.to(device)


  "num_layers={}".format(dropout, num_layers))


12281270


### TRAINING MODEL

In [18]:
epochs=20
_=train_model(model_basic,epochs,trainloader,validloader,plot=True,verbose=True,save=True,loc='../checkpoints/basic_model3v2.pt')

epoch 1




batch_no :1713 /1712 acc_pred: 0.33333333333333336time for epoch 2029.6591002941132
train_loss 0.6927266952311881
training accuracy 0.5171452968405914
batch_no428 val_acc_pred: 0.54871794871794873valid_loss 1.691790198750585
validation accuracy 0.5195949185043145
epoch 2
batch_no :1713 /1712 acc_pred: 0.33333333333333336time for epoch 1956.256514787674
train_loss 0.6894726350416089
training accuracy 0.5333093643731368
batch_no428 val_acc_pred: 0.51282051282051286valid_loss 1.6862847661582108
validation accuracy 0.53376677852349
epoch 3
batch_no :1713 /1712 acc_pred: 0.34615384615384615time for epoch 1910.8537726402283
train_loss 0.6861932232767065
training accuracy 0.5464024088804997
batch_no428 val_acc_pred: 0.57435897435897434valid_loss 1.6852356570067806
validation accuracy 0.538470757430489
epoch 4
batch_no :1713 /1712 acc_pred: 0.34615384615384615time for epoch 1970.676749944687
train_loss 0.6820170269739405
training accuracy 0.5565667460638474
batch_no428 val_acc_pred: 0.46153846

KeyboardInterrupt: 

In [21]:
_

(torch.Size([166882]), torch.Size([166882, 15]), torch.Size([166882, 2]))

In [40]:
torch.cuda.empty_cache()

In [24]:
torch.save({
                'epoch': epochs,
                'model_state_dict': model_basic.state_dict(),
                #'optimizer_state_dict': optimizer.state_dict(),
                'model':model_basic
                }, '../checkpoints/basic_modelv3.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [25]:
X_test_text=np.load('../data/Question_val_pad.npy')
y_test=np.load('../data/answers_val.npy')
X_test_idx=np.load('../data/question_image_map_val_array.npy')

In [53]:
test_model(model_basic,X_test_text,y_test,X_test_idx)



batch_no23 test_acc_pred: 0.20512820512820512

KeyboardInterrupt: 