In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import matplotlib.pyplot as plt
torch.manual_seed(1)
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import numpy as np
np.random.seed(0)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import torchvision
from torchvision import datasets, transforms
from torchvision import models as torchmodels

In [2]:
def train_model(model,epochs,trainloader,validationloader,verbose=False,plot=False,save=False,loc='chpt'):
    '''
    args:
    
    model: pytorch model
    epochs: no of epochs
    trainloader: Data loader for train set
    validationloader: Data loader for validation set
    verbose: False/True : If true will print progress as we train 
    plot: If true will plot the model's loss, accuracy graphs.
    
    returns: 
            list[train_acc,valid_acc,train_loss,valid_loss]
    '''
    # LOSS FUNCTION
    
    loss_function = nn.CrossEntropyLoss(reduction='mean')
    
    #OPTIMIZER
    
    optimizer = optim.Adam(model.parameters())
    
    
    
    train_loss, valid_loss = [], []
    train_acc,valid_acc=[],[]
    
    
    
    # TRAINING 
    for epoch in range(1, epochs+1):
        if verbose:
            print('epoch',epoch)
        ## training part 
        model.train()
        ta=0
        correctt=0
        correctv=0
        losst=0
        lossv=0
        t_k=0
        v_k=0
        c=1
        nb=math.ceil(X_train.shape[0]/batch_size)
        for data,data2, target in trainloader:
            if verbose:
                print('\r'+'batch_no :'+str(c)+' /'+str(nb),end='')
            c+=1
            t_k=t_k+1
            optimizer.zero_grad()
            output = model(data,data2)
            loss = loss_function(output, target.argmax(dim=1))
            loss.backward()
            losst=losst+loss.item()
            optimizer.step()
            #print(output)
            #print(data.shape)
            acc=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
            correctt =correctt + acc
        train_acc.append(correctt/X_train.shape[0])
        train_loss.append(loss.item())
        if verbose or (epoch-1)%10==0:
            print('train_loss',losst/t_k)
            print('training accuracy',correctt/X_train.shape[0])

            
            
            
            
        ## VALIDATION CHECK  
        model.eval()
        

        for data,data2, target in validationloader:
            v_k=v_k+1
            output = model(data,data2)
            loss = loss_function(output, target.argmax(dim=1))
            lossv=lossv+loss.item()
            accv=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
            correctv =correctv + accv
        valid_loss.append(lossv/v_k+1)
        valid_acc.append(correctv/X_val.shape[0])
        
        if verbose or (epoch-1)%10==0:
            print('valid_loss',lossv/v_k+1)
            print('validation accuracy',correctv/X_val.shape[0])
            
            
    # SAVING CHECKPOINT
    
    if save:
        torch.save({
                'epoch': epochs,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'model':model
                }, loc)
    
    print('train_loss',losst/t_k)
    print('valid_loss',lossv/v_k+1)
    print('training accuracy',correctt/X_train.shape[0])
    print('validation accuracy',correctv/X_val.shape[0])
    
    
    
    #PLOTTING 
    if plot:
        epc=np.arange(1,epochs+1)
        plt.plot(epc,train_acc)
        plt.plot(epc,valid_acc)
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
        
        plt.plot(epc,train_loss)
        plt.plot(epc,valid_loss)
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
        
        
        
    return train_acc,valid_acc,train_loss,valid_loss

In [4]:
def test_model(model,x_test_i,x_text_t,y_test):
    '''
    args:
    
    model: pytorch model
    x_test: test  set features
    y_test: test set labels
    
    returns:
            -
    prints:
         accuracy
         precision
         recall
         f-score
         confusion matrix

    '''
    out=np.zeros((y_test.shape))
    ind=0
    loss_function = nn.BCEWithLogitsLoss(reduction='mean')
    x_vl = torch.tensor(x_test_i, dtype=torch.long,device=device)
    x_vl_t = torch.tensor(x_test_t, dtype=torch.long,device=device)
    y_vl = torch.tensor(y_test, dtype=torch.float32,device=device)
    testt = TensorDataset(x_vl,x_vl_t, y_vl)
    testloader = DataLoader(testt, batch_size=128)
    
    model.eval()
    v_k=0
    lossv=0
    correctv=0
    y_out=[]
    for data,data2, target in testloader:
        ind_l=ind+target.shape[0]
        v_k=v_k+1
        output = model(data,data2)
        out2=output.cpu()
        out2=out2.detach().numpy()
        out2=np.squeeze(out2)
        #print(out2.shape,'aaa')
        #print(out[ind:ind_l].shape,'aaaaa')
        #print(ind,ind_l)
        out[ind:ind_l]=out2
        ind=ind_l
        accv=(output.argmax(dim=1) == target.argmax(dim=1)).float().sum().item()
        correctv =correctv + accv
    print('accuracy')
    print(correctv/x_test.shape[0])
    y_true=y_test.argmax(axis=1)
    y_pred=out.argmax(axis=1)
    print('confusion matrix :\n',confusion_matrix(y_true,y_pred),'\n')
    print('f1 score matrix :\n',f1_score(y_true,y_pred,average='micro'),'\n')
    print('precision_score :\n',precision_score(y_true,y_pred,pos_label=1,average='micro'),'\n')
    print('recall_score :\n',recall_score(y_true,y_pred,pos_label=1,average='micro'),'\n')
    print('classification_report :\n',classification_report(y_true,y_pred),'\n')

In [5]:
def VGG_EXT(vgg):
    
    return nn.Sequential((*list(vgg.children())[:-1]))


### BASIC MODEL

In [6]:
class base_enc(nn.Module):
    def __init__(self, vocab_size, embedding_dim,feat_ext_model,
                 hidden_size_lstm,emb_w=None,emb_Train=False,feat_ext_Train=True,dropout=0.5):
        super(base_enc,self).__init__()
        
        #IMAGE FEATURE EXTRACTOR 
        self.feature_extractor = feat_ext_model 
        '''
        for param in self.feature_extractor.parameters():
            param.requires_grad = feat_ext_Train
        
        '''
        self.conv_linear1=nn.Linear(25088,4096)
        self.conv_linear2=nn.Linear(4096,2000)
        #TEXT FEATURE EXTRACTOR 

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if emb_w is not None:
            et = torch.tensor(emb_w, dtype=torch.float32,device=device)
            self.embedding.weight = nn.Parameter(et)
            self.embedding.weight.requires_grad = emb_Train
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(dropout)
        self.h_size=hidden_size_lstm
        self.embed_size=embedding_dim
        self.lstm = nn.LSTM(self.embed_size, self.h_size)        
        self.text_linear1= nn.Linear(self.h_size, 2000)
        
        # General 
        self.nn1=nn.Linear(4000,500)
        self.nn2=nn.Linear(500,2)
        
        self.relu = nn.ReLU()
        self.tanh=nn.Tanh()
        self.softmax = nn.Softmax()
        
        
    def img_feat_ext(self,image):
        im1= self.feature_extractor(image)
        #print(im1.shape)
        im1=im1.view(im1.size(0), -1)
        im2=self.relu(self.conv_linear1(im1))
        return self.relu(self.conv_linear2(im2))
    
    def text_feat_ext(self,text):
        h_embedding = self.dropout(self.embedding(text))       
        h_lstm, _ = self.lstm(h_embedding)
        last_state, _ = torch.max(h_lstm, 1)   
        return self.relu(self.text_linear1(last_state))
  
        
    def forward(self, image,text):
                
        image_emb=self.img_feat_ext(image)
        text_emb=self.text_feat_ext(text)
        
        ######## Concatenating feature embeddings
        
        #print(image_emb.shape)
        joint_emb=torch.cat((image_emb, text_emb), 1)
        #print(joint_emb.shape)
        
        out1=self.relu(self.nn1(joint_emb))
        out2=self.nn2(out1)
    
      
        return self.softmax(out2)

### LOADING DATA INTO MEMORY

if we have numpy arrays of training images, training text, and class labels it will be easier to load.

#### images of shape (no ofinstances ,3,224,224)

#### text of shape ( no of instances ,fixed_text_length_n).  ie(no_of_instances,(idx_word1,idx_word2,............,idx_word_n))

#### labels of shape (no of instances, class idx)

In [124]:
def load_batch(X_train_images,batch_no,batch_size=64): #batch number starting from 0,1,2,3.. batch size default = 64
    data={}
    start_in= batch_no*batch_size
    end_in= (batch_no+1)*batch_size
    #print (start_in, end_in)
    count = 0
    for i in X_train_images:
        if ((count >=start_in) and (count <end_in)): 
            data.update({i: X_train_images[i]}) 
        count +=1;
    return data;

In [None]:
def load_batch_img(file_loc,list):
    

In [5]:
#X_train_images=np.load('../data/X_train_img.npy')
X_train_text=np.load('../data/X_train_text.npy')
y_train=np.load('../data/y_train.npy')
X_train_idx=np.load('../data/train_img_map.npy')

#X_test_images=np.load('../data/X_test_img.npy')
X_test_text=np.load('../data/X_test_text.npy')
y_test=np.load('../data/y_test2.npy')
X_test_idx=np.load('../data/test_img_map.npy')

#X_val_images=np.load('../data/X_val_img.npy')
X_val_text=np.load('../data/X_val_text.npy')
y_val=np.load('../data/y_val2.npy')
X_val_idx=np.load('../data/val_img_map.npy')


#emb_w=np.load('../data/word_embeddings.npy')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### LOADING DATA INTO GPU AND CREATING DATA ITERATORS 

we can load into GPU batch by batch if encounter gpu size issues, but it will be slower

In [6]:
batch_size=batch_size
x_tr_t = torch.tensor(X_train_text,dtype=torch.long,device=device)
y_tr = torch.tensor(y_train,dtype=torch.long,device=device)
x_tr_idx=torch.tensor(X_train_idx,dtype=torch.long,device=device)
train = TensorDataset(x_tr_t, y_tr,x_tr_idx)
trainloader = DataLoader(train, batch_size=batch_size)

x_vl_t = torch.tensor(X_val_text, dtype=torch.long,device=device)
y_vl = torch.tensor(y_val, dtype=torch.long,device=device)
x_vl_idx=torch.tensor(X_val_idx,dtype=torch.long,device=device)
valid = TensorDataset(x_vl_t, y_vl,x_vl_idx)
validloader = DataLoader(valid, batch_size=batch_size)

### LOADING PRETRAINED VGG

In [7]:
VGG16 = torchmodels.vgg16_bn(pretrained=True)

In [8]:
device=torch.device("cpu")
VGG_16=VGG_EXT(VGG16)
model_basic=base_enc(5000,300,VGG_16,60)


In [11]:
img=torch.rand(1,3,224,224)
text=torch.zeros([1, 5], dtype=torch.long)

In [12]:
model_basic.forward(img,text)



tensor([[0.4844, 0.5156]], grad_fn=<SoftmaxBackward>)

### TRAINING MODEL

In [6]:
_=train_model(model_basic,epochs,trainloader,validloader,plot=True,verbose=False,save=True,loc='./checkpoints/basic_model.pt')

In [69]:
Q_t=np.load('../data/questions_val.npy',allow_pickle=True)

In [70]:
Q_t.shape

(80541,)

In [72]:
l=0 
for i in range(Q_t.shape[0]):
    l=l+len(Q_t[i])
l=l/Q_t.shape[0]

In [73]:
l

4.82927949739884

In [75]:
m=0
s=15
for i in range(Q_t.shape[0]):
    if len(Q_t[i])> s:
        m=m+1
m

58

In [76]:
Q_t_pad=np.zeros((Q_t.shape[0],15))
for i in range(Q_t.shape[0]):
    pad_idx=0
    pad_idx=15-len(Q_t[i])
    if(pad_idx<0):
        pad_idx=0
    for j in range(len(Q_t[i])):
        if j>14:
            break
        else:
            Q_t_pad[i][pad_idx+j]=Q_t[i][j]

In [77]:
Q_t_pad[0],Q_t[0]

(array([0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 5.000e+00,
        1.200e+01, 4.000e+00, 8.582e+03]), [5, 12, 4, 8582])

In [78]:
np.save('../data/Question_val_pad.npy',Q_t_pad)

In [94]:
train_map=np.load('../data/question_image_map_val.npy',allow_pickle=True).item()

In [95]:
train_map

{'393225001': 0,
 '393225002': 0,
 '109229002': 1,
 '109229012': 1,
 '131089001': 2,
 '131089003': 2,
 '131089004': 2,
 '262162000': 3,
 '262162004': 3,
 '262162006': 3,
 '262162007': 3,
 '262162008': 3,
 '262162010': 3,
 '262162012': 3,
 '262162013': 3,
 '262162014': 3,
 '262162015': 3,
 '262162016': 3,
 '262162019': 3,
 '262162020': 3,
 '262162024': 3,
 '262162025': 3,
 '393243000': 4,
 '393243002': 4,
 '262175000': 5,
 '262175003': 5,
 '131108000': 6,
 '131108002': 6,
 '42001': 7,
 '131115000': 8,
 '131115001': 8,
 '524333001': 9,
 '524333002': 9,
 '524333003': 9,
 '524333004': 9,
 '122208000': 10,
 '122208001': 10,
 '393266001': 11,
 '262197002': 12,
 '262197003': 12,
 '262197004': 12,
 '262197006': 12,
 '393271000': 13,
 '393271001': 13,
 '262200000': 14,
 '393274001': 15,
 '393274002': 15,
 '393274005': 15,
 '393274006': 15,
 '393274007': 15,
 '393277000': 16,
 '393277002': 16,
 '393277006': 16,
 '393277007': 16,
 '393277011': 16,
 '393277013': 16,
 '393277014': 16,
 '393282000':

In [88]:
train_map_2=np.zeros((Q_t.shape[0])).astype('uint32')

In [89]:
for i in  range(Q_t.shape[0]):
    train_map_2[i]=int(train_map[i])

In [90]:
train_map_2[999],train_map[999]

(398, 398)

In [91]:
np.save('../data/question_image_map_train_array.npy',train_map_2)