In [4]:
import os
import numpy as np
import tqdm
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import resnet.resnet as resnet
from resnet.resnet_utils import myResnet
import os
import random

from torchvision import transforms
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Tokenizer

In [5]:
from typing import List
class Tokenizer():
  def __init__(self, ):
    self.word2idx = {}
    self.idx2word = {}
    self.idx = 3
    self.word2idx['<e>'] = 1
    self.word2idx['</e>'] = 2
    self.idx2word[1] = '<e>'
    self.idx2word[2] = '</e>'

  def fit(self, text):
    for word in text.split():
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

  @classmethod
  def pad_sequences(cls, lst_seq: List[List[int]], pad_value=0, pad_left=False, pad_to: int=None) -> List[List[int]]:
    maxlen = max(len(seq) for seq in lst_seq) if pad_to is None else pad_to
    if pad_left:
        padded_seq = [[pad_value] * (maxlen - len(seq)) + seq for seq in lst_seq]
    else:
        padded_seq = [seq + [pad_value] * (maxlen - len(seq)) for seq in lst_seq]
    return padded_seq

  def txt2vec(self, text, reverse=False, trunc=None):
    unknownidx = len(self.word2idx) + 1
    sequence = [self.word2idx.get(w, unknownidx) for w in text.split()]
    if reverse:
      sequence = sequence[::-1]
    if type(trunc) is int:
      sequence = sequence[:trunc]
    return sequence

  def vec2txt(self, vec):
    unknown = '<UNK>'
    text = ' '.join(self.idx2word.get(i, unknown) for i in vec)
    return text

# Dataset

In [6]:
from typing import Dict, Any, List

def load_word_vec(path: str, word2idx: Dict[str, int]):
  word_vec = {}
  for line in open(path, 'r', encoding='utf-8', newline='\n', errors='ignore'):
    tokens = line.rstrip().split(' ')
    # Get the glove word vector
    if tokens[0] in word2idx:
        word_vec[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
  return word_vec

def build_embedding_matrix(word2idx: Dict[str, int], embed_dim: int, type):
  embedding_matrix_file_name = f'{str(embed_dim)}_{type}_embedding_matrix.dat'

  print('loading word vectors...')
  embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))  # 0 = pad token, len + 1 = unk token
  fname = f'./embeddings/glove.twitter.27B.{str(embed_dim)}d.txt'
  word_vec = load_word_vec(fname, word2idx=word2idx)
  print('building embedding_matrix:', embedding_matrix_file_name)

  for word, i in word2idx.items():
      vec = word_vec.get(word)
      if vec is not None:
          embedding_matrix[i] = vec

  return embedding_matrix

class Twitter15Dataset(Dataset):
  def __init__(self, split, opt):
    super().__init__()
    self.opt = opt
    self.maxlen = opt.max_seq_len
    self.fname = {
      'train': './datasets/twitter2015/train.txt',
      'dev': './datasets/twitter2015/dev.txt',
      'test': './datasets/twitter2015/test.txt'
    }
    self.img_path = './datasets/IJCAI2019_data/twitter2015_images'
    # image preprocessing
    self.transform = transforms.Compose([
      transforms.RandomCrop(opt.crop_size), #args.crop_size, by default it is set to be 224
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
      transforms.Normalize((0.485, 0.456, 0.406),
                           (0.229, 0.224, 0.225))])
    self.tokenizer = Tokenizer()
    self.init_tokenizer([self.fname['train'], self.fname['dev'], self.fname['test']])
    # load once only
    if split == 'train':
      self.embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx, opt.embed_dim, 'twitter15')
    self.data = []
    self.load_data(self.fname[split])

  def image_process(self, image_path):
    image = Image.open(image_path).convert('RGB')
    image = self.transform(image)
    return image

  def show_sample(self, i):
    sample = self[i]
    for k, v in sample.items():
      if k != 'polarity' and type(v) is list:
        print(k, v, self.tokenizer.vec2txt(v))
      elif k == 'image':
        print(k, v.size())
      else:
        print(k, v)

  def collate_fn(self, batch_samples: List[Dict[str, Any]]):
    batch = {}
    for k in batch_samples[0].keys():
      if k == 'polarity':
        batch[k] = torch.tensor([b[k] for b in batch_samples], dtype=torch.long)
      elif k == 'image':
        batch[k] = torch.stack([b[k] for b in batch_samples], dim=0)
      elif type(batch_samples[0][k]) is list:
        padded_seq = self.tokenizer.pad_sequences([b[k] for b in batch_samples])
        batch[k] = torch.tensor(padded_seq, dtype=torch.long)
      else:
        raise ValueError(f'Unknown data: {k}')
    return batch

  def __getitem__(self, index):
    return self.data[index]

  def __len__(self):
    return len(self.data)

  def init_tokenizer(self, fnames):
    text = ''
    for fname in fnames:
      with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        lines = f.readlines()
      for i in range(0, len(lines), 4):
        text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
        aspect = lines[i + 1].lower().strip()
        text_raw = text_left + " " + aspect + " " + text_right
        text += text_raw + " "
    self.tokenizer.fit(text.lower())

  def load_data(self, fname):
    print('--------------'+fname+'---------------')
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
      lines = f.readlines()
    count = 0
    for i in tqdm.tqdm(range(0, len(lines), 4)):
      text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
      aspect = lines[i + 1].lower().strip()
      label = lines[i + 2].strip()
      image_id = lines[i + 3].strip()

      text_left_for_fusion = text_left + " <e>"
      text_right_for_fusion = "</e> " + text_right

      text_raw_indices = self.tokenizer.txt2vec(text_left + " " + aspect + " " + text_right)
      text_raw_without_aspect_indices = self.tokenizer.txt2vec(text_left + " " + text_right)

      text_left_indicator = self.tokenizer.txt2vec(text_left_for_fusion)
      text_left_with_aspect_indices = self.tokenizer.txt2vec(text_left + " " + aspect)

      text_right_indicator = self.tokenizer.txt2vec(text_right_for_fusion)
      text_right_with_aspect_indices = self.tokenizer.txt2vec(" " + aspect + " " + text_right, reverse=True)

      aspect_indices = self.tokenizer.txt2vec(aspect)
      label = int(label) + 1

      image_name = image_id
      image_path = os.path.join(self.img_path, image_name)

      if not os.path.exists(image_path):
          print(image_path)
      try:
          image = self.image_process(image_path)
      except:
          count += 1
          #print('image has problem!')
          image_path_fail = os.path.join(self.img_path, '17_06_4705.jpg')
          image = self.image_process(image_path_fail)

      data = {
            'text_raw_indices': text_raw_indices,
            'text_raw_without_aspect_indices': text_raw_without_aspect_indices,
            # 'text_left_indices': text_left_indices,
            'text_left_indicator': text_left_indicator[:self.maxlen],
            # 'text_left_with_aspect_indices': text_left_with_aspect_indices,
            # 'text_right_indices': text_right_indices,
            'text_right_indicator': text_right_indicator[:self.maxlen],
            # 'text_right_with_aspect_indices': text_right_with_aspect_indices,
            'aspect_indices': aspect_indices[:self.maxlen],
            'polarity': label,
            'image': image,
        }
      self.data.append(data)
    print('the number of problematic samples: '+ str(count))


# Dataset debug单元测试

In [7]:
# from argparse import Namespace
# opt = Namespace()
# opt.crop_size = 224
# opt.embed_dim = 100
# opt.max_seq_len=100
# dataset = Twitter15Dataset('dev', opt)

In [8]:
# dataset.show_sample(1)

## Model

In [9]:
from layers.lstm import MyLSTM
from layers.attention import Attention
import torch
import torch.nn as nn
import torch.nn.functional as F


class MMFUSION(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(MMFUSION, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float))

        self.lstm_aspect = MyLSTM(opt.embed_dim, opt.hidden_dim, dropout = opt.dropout_rate)
        self.lstm_l = MyLSTM(opt.embed_dim, opt.hidden_dim, dropout = opt.dropout_rate)
        self.lstm_r = MyLSTM(opt.embed_dim, opt.hidden_dim, dropout = opt.dropout_rate)

        self.attention_l = Attention(opt.hidden_dim, dropout=opt.dropout_rate)
        self.attention_r = Attention(opt.hidden_dim, dropout=opt.dropout_rate)

        self.ltext2hidden = nn.Linear(opt.hidden_dim, opt.hidden_dim)
        self.laspect2hidden = nn.Linear(opt.hidden_dim, opt.hidden_dim)

        self.rtext2hidden = nn.Linear(opt.hidden_dim, opt.hidden_dim)
        self.raspect2hidden = nn.Linear(opt.hidden_dim, opt.hidden_dim)

        self.dropout = nn.Dropout(self.opt.dropout_rate)

        self.aspect2text = nn.Linear(opt.hidden_dim, opt.hidden_dim)
        self.vismap2text = nn.Linear(2048, opt.hidden_dim)
        self.vis2text = nn.Linear(2048, opt.hidden_dim)

        self.gate = nn.Linear(2048+4*opt.hidden_dim, opt.hidden_dim)

        self.madality_attetion = nn.Linear(opt.hidden_dim,1)


        self.text2hiddenvis = nn.Linear(opt.hidden_dim * 4, opt.hidden_dim)
        self.vis2hiddenvis = nn.Linear(opt.hidden_dim, opt.hidden_dim)

        self.dense_2 = nn.Linear(opt.hidden_dim*2, opt.polarities_dim)
        self.dense_3 = nn.Linear(opt.hidden_dim*3, opt.polarities_dim)
        self.dense_4 = nn.Linear(opt.hidden_dim*4, opt.polarities_dim)
        self.dense_5 = nn.Linear(opt.hidden_dim*5, opt.polarities_dim)
        self.dense_6 = nn.Linear(opt.hidden_dim*6, opt.polarities_dim)
        self.dense_10 = nn.Linear(opt.hidden_dim*10, opt.polarities_dim)


    def attention_linear(self, text, converted_vis_embed_map, vis_embed_map):
        #text: batch_size, hidden_dim; converted_vis_embed_map: batch_size, keys_number,embed_size; vis_embed_map: batch_size, keys_number, 2048
        keys_size = converted_vis_embed_map.size(1)
        text = text.unsqueeze(1).expand(-1, keys_size, -1)#batch_size, keys_number,hidden_dim
        attention_inputs = torch.tanh(text + converted_vis_embed_map)
        #attention_inputs = F.dropout( attention_inputs )
        att_weights = self.madality_attetion(attention_inputs).squeeze(2) #batch_size, keys_number
        att_weights = F.softmax(att_weights, dim=-1).view(-1,1,49) #batch_size, 1, keys_number

        att_vector = torch.bmm(att_weights, vis_embed_map).view(-1, 2048) #batch_size, 2048
        return att_vector, att_weights

    def low_rank_pooling(self, final, aspect, text2hidden, aspect2hidden):
        text = torch.tanh(text2hidden(final))
        aspect = torch.tanh(aspect2hidden(aspect))
        text_aspect_inter = torch.mul(text, aspect)
        output = torch.cat((text_aspect_inter, final), dim=-1)
        return output

    def forward(self, inputs, visual_embeds_global, visual_embeds_att, att_mode):
        x_l, x_r, aspect_indices = inputs[0], inputs[1], inputs[2]
        ori_x_l_len = torch.sum(x_l != 0, dim=-1)
        ori_x_r_len = torch.sum(x_r != 0, dim=-1)
        ori_aspect_len = torch.sum(aspect_indices != 0, dim=-1)

        aspect = self.embed(aspect_indices)
        aspect_lstm, (_, _) = self.lstm_aspect(aspect, ori_aspect_len)
        aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
        sum_aspect = torch.sum(aspect_lstm, dim=1)
        avg_aspect = torch.div(sum_aspect, aspect_len.view(aspect_len.size(0), 1))

        # obtain the lstm hidden states for the left context and the right context respectively
        x_l, x_r = self.embed(x_l), self.embed(x_r)
        l_context, (_, _) = self.lstm_l(x_l, ori_x_l_len)
        r_context, (_, _) = self.lstm_r(x_r, ori_x_r_len)

        converted_vis_embed = self.vis2text(torch.tanh(visual_embeds_global))

        if att_mode == 'ESTR': # apply aspect words to attend the left and right contexts
            l_mid, l_att = self.attention_l(l_context, avg_aspect, ori_x_l_len)
            r_mid, r_att = self.attention_r(r_context, avg_aspect, ori_x_r_len)
            l_final = l_mid.squeeze(dim=1)
            r_final = r_mid.squeeze(dim=1)

            # low-rank pooling
            l_output = self.low_rank_pooling(l_final, avg_aspect, self.laspect2hidden, self.laspect2hidden)
            r_output = self.low_rank_pooling(r_final, avg_aspect, self.raspect2hidden, self.raspect2hidden)

            text_representation = torch.cat((l_output, r_output), dim=-1)
            x = text_representation
            x = self.dropout(x)
            out = self.dense_4(x)
            #"""
            return out

        elif att_mode == 'ResTarget': #only use image and aspect words
            vis_embed_map = visual_embeds_att.view(-1, 2048, 49).permute(0, 2, 1)#self.batch_size, 49, 2048
            converted_vis_embed_map = self.vismap2text(vis_embed_map) #self.batch_size, 49, embed
            converted_aspect = self.aspect2text(avg_aspect)

            #att_vector: batch_size, 2048
            att_vector, att_weights = self.attention_linear(converted_aspect, converted_vis_embed_map, vis_embed_map)
            converted_att_vis_embed = self.vis2text(torch.tanh(att_vector))
            x = torch.cat((avg_aspect, converted_att_vis_embed), dim=-1)
            out = self.dense_2(x)
            return out

        elif att_mode == 'without_ESVR': # "text" mode concatenated with image
            l_mid, l_att = self.attention_l(l_context, avg_aspect, ori_x_l_len)
            r_mid, r_att = self.attention_r(r_context, avg_aspect, ori_x_r_len)
            l_final = l_mid.squeeze(dim=1)
            r_final = r_mid.squeeze(dim=1)

            #"""
            # low-rank pooling
            l_output = self.low_rank_pooling(l_final, avg_aspect, self.laspect2hidden, self.laspect2hidden)
            r_output = self.low_rank_pooling(r_final, avg_aspect, self.raspect2hidden, self.raspect2hidden)
            text_representation = torch.cat((l_output, r_output), dim=-1)

            x = torch.cat((text_representation, converted_vis_embed), dim=-1)
            x = self.dropout(x)
            out = self.dense_5(x)
            return out
        elif att_mode == 'without_gate':  # "text" mode concatenated with attention-based image
            l_mid, l_att = self.attention_l(l_context, avg_aspect, ori_x_l_len)
            r_mid, r_att = self.attention_r(r_context, avg_aspect, ori_x_r_len)
            l_final = l_mid.squeeze(dim=1)
            r_final = r_mid.squeeze(dim=1)

            #"""
            # low-rank pooling
            l_output = self.low_rank_pooling(l_final, avg_aspect, self.laspect2hidden, self.laspect2hidden)
            r_output = self.low_rank_pooling(r_final, avg_aspect, self.raspect2hidden, self.raspect2hidden)
            text_representation = torch.cat((l_output, r_output), dim=-1)

            vis_embed_map = visual_embeds_att.view(-1, 2048, 49).permute(0, 2, 1)  # self.batch_size, 49, 2048
            converted_vis_embed_map = self.vismap2text(vis_embed_map)  # self.batch_size, 49, embed
            converted_aspect = self.aspect2text(avg_aspect)

            # att_vector: batch_size, 2048
            att_vector, att_weights = self.attention_linear(converted_aspect, converted_vis_embed_map, vis_embed_map)
            converted_att_vis_embed = self.vis2text(torch.tanh(att_vector))

            x = torch.cat((text_representation, converted_att_vis_embed), dim=-1)
            x = self.dropout(x)

            out = self.dense_5(x)
            return out
        elif att_mode == 'without_MFL':
            l_mid, l_att = self.attention_l(l_context, avg_aspect, ori_x_l_len)
            r_mid, r_att = self.attention_r(r_context, avg_aspect, ori_x_r_len)
            l_final = l_mid.squeeze(dim=1)
            r_final = r_mid.squeeze(dim=1)

            # low-rank pooling
            l_output = self.low_rank_pooling(l_final, avg_aspect, self.laspect2hidden, self.laspect2hidden)
            r_output = self.low_rank_pooling(r_final, avg_aspect, self.raspect2hidden, self.raspect2hidden)
            text_representation = torch.cat((l_output, r_output), dim=-1)


            # apply entity-based attention mechanism to obtain different image representations
            vis_embed_map = visual_embeds_att.view(-1, 2048, 49).permute(0, 2, 1)  # self.batch_size, 49, 2048
            converted_vis_embed_map = self.vismap2text(vis_embed_map)  # self.batch_size, 49, embed
            converted_aspect = self.aspect2text(avg_aspect)

            # att_vector: batch_size, 2048
            att_vector, att_weights = self.attention_linear(converted_aspect, converted_vis_embed_map, vis_embed_map)
            converted_att_vis_embed = self.vis2text(torch.tanh(att_vector))  # att_vector: batch_size, hidden_dim

            merge_representation = torch.cat((text_representation, att_vector), dim=-1)
            gate_value = torch.sigmoid(self.gate(merge_representation))  # batch_size, hidden_dim
            gated_converted_att_vis_embed = torch.mul(gate_value, converted_att_vis_embed)
            #gated_converted_att_vis_embed = self.dropout(gated_converted_att_vis_embed)

            vis_output = gated_converted_att_vis_embed
            # vis_output = gated_converted_att_vis_embed + text_vis_inter
            x = torch.cat((text_representation, vis_output), dim=-1)
            x = self.dropout(x)
            out = self.dense_5(x)

            return out
        elif att_mode == 'ESAFN':  # "text" mode concatenated with gated attention-based image
            l_mid, l_att = self.attention_l(l_context, avg_aspect, ori_x_l_len)
            r_mid, r_att = self.attention_r(r_context, avg_aspect, ori_x_r_len)
            l_final = l_mid.squeeze(dim=1)
            r_final = r_mid.squeeze(dim=1)

            #"""
            # low-rank pooling
            l_output = self.low_rank_pooling(l_final, avg_aspect, self.laspect2hidden, self.laspect2hidden)
            r_output = self.low_rank_pooling(r_final, avg_aspect, self.raspect2hidden, self.raspect2hidden)
            text_representation = torch.cat((l_output, r_output), dim=-1)
            #"""

            # apply entity-based attention mechanism to obtain different image representations
            vis_embed_map = visual_embeds_att.view(-1, 2048, 49).permute(0, 2, 1)  # self.batch_size, 49, 2048
            converted_vis_embed_map = self.vismap2text(vis_embed_map)  # self.batch_size, 49, embed
            converted_aspect = self.aspect2text(avg_aspect)

            # att_vector: batch_size, 2048
            att_vector, att_weights = self.attention_linear(converted_aspect, converted_vis_embed_map, vis_embed_map)
            converted_att_vis_embed = self.vis2text(torch.tanh(att_vector))  # att_vector: batch_size, hidden_dim

            merge_representation = torch.cat((text_representation, att_vector), dim=-1)
            gate_value = torch.sigmoid(self.gate(merge_representation))  # batch_size, hidden_dim
            gated_converted_att_vis_embed = torch.mul(gate_value, converted_att_vis_embed)
            #gated_converted_att_vis_embed = self.dropout(gated_converted_att_vis_embed)

            #"""
            text_vis = torch.tanh(self.text2hiddenvis(text_representation))  # batch_size, hidde_dim
            vis_vis = torch.tanh(self.vis2hiddenvis(gated_converted_att_vis_embed))  # batch_size, hidden_dim

            text_vis_inter = torch.mul(text_vis, vis_vis)
            vis_output = torch.cat((gated_converted_att_vis_embed, text_vis_inter), dim=-1)

            x = torch.cat((text_representation, vis_output), dim=-1)
            x = self.dropout(x)
            out = self.dense_6(x)
            #"""

            return out

# Trainer

In [10]:
from resnet.resnet import resnet152
from sklearn.metrics import classification_report, accuracy_score, f1_score
import gc
class Trainer:
  def __init__(self, opt):
    self.opt = opt
    print('> training arguments:')
    for arg in vars(opt):
        print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))

    # use class attribute to avoid repeat load
    # def __init__(self, split, opt):
    self.train_dataset = Twitter15Dataset('train', opt)
    self.dev_dataset = Twitter15Dataset('dev', opt)
    self.test_dataset = Twitter15Dataset('test', opt)

    self.train_data_loader = DataLoader(dataset=self.train_dataset, batch_size=opt.batch_size, shuffle=True, collate_fn=self.train_dataset.collate_fn)
    self.dev_data_loader = DataLoader(dataset=self.dev_dataset, batch_size=opt.batch_size, shuffle=False, collate_fn=self.dev_dataset.collate_fn)
    self.test_data_loader = DataLoader(dataset=self.test_dataset, batch_size=opt.batch_size, shuffle=False, collate_fn=self.test_dataset.collate_fn)

    resnet = resnet152()
    resnet.load_state_dict(torch.load(os.path.join('resnet/resnet152.pth')))
    self.encoder = myResnet(resnet, self.opt.att_mode, self.opt.device).to(device)
    if not opt.fine_tune_cnn:
      self.encoder.requires_grad_(False)
    self.model = MMFUSION(self.train_dataset.embedding_matrix, opt).to(device)
    self.reset_parameters()

    torch.cuda.empty_cache()
    gc.collect()

  def reset_parameters(self):
    n_trainable_params, n_nontrainable_params = 0, 0
    for n, p in self.model.named_parameters():
      n_params = torch.prod(torch.tensor(p.shape))
      if p.requires_grad:
        n_trainable_params += n_params
        if len(p.shape) > 1:
          torch.nn.init.xavier_uniform_(p)
      else:
        n_nontrainable_params += n_params
        print(n)
    print('n_trainable_params: {0}, n_nontrainable_params: {1}'.format(n_trainable_params, n_nontrainable_params))

  def run_model(self, sample_batched):
    inputs = [sample_batched[col].to(device) for col in self.opt.inputs_cols]
    images = sample_batched['image'].to(device)

    imgs_f, _, img_att = self.encoder(images)
    outputs = self.model(inputs, imgs_f, img_att, self.opt.att_mode)
    return outputs

  def record_track_list(self, track_list, loss, outputs, targets):
    track_list['losses'].append(loss.item())
    track_list['preds'].extend(outputs.argmax(-1).tolist())
    track_list['targets'].extend(targets.tolist())

  @torch.no_grad()
  def evaluate(self, eval_on='dev', print_results=True):
    self.model.eval()
    self.encoder.eval()
    print(f'Start evaluation on {eval_on}')

    criterion = nn.CrossEntropyLoss()
    track_list = {
        'losses': [],
        'preds': [],
        'targets': [],
    }

    if eval_on == 'dev':
      data_loader = self.dev_data_loader
    elif eval_on == 'test':
      data_loader = self.test_data_loader
    else:
      raise ValueError(eval_on)

    for i_batch, sample_batched in enumerate(data_loader):
      targets = sample_batched['polarity'].to(device)
      outputs = self.run_model(sample_batched)
      loss = criterion(outputs, targets)
      self.record_track_list(track_list, loss, outputs, targets)

    if print_results:
      print(classification_report(y_pred=track_list['preds'],
                                  y_true=track_list['targets'],
                                  digits=4))
    return track_list


  def train(self):
    criterion = nn.CrossEntropyLoss()
    params = [p for p in self.model.parameters() if p.requires_grad] + \
             [p for p in self.encoder.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(params, lr=self.opt.learning_rate)

    max_dev_acc = 0.
    max_test_acc = max_test_f1 = 0.
    global_step = 0
    track_list = {
        'losses': [],
        'preds': [],
        'targets': [],
    }

    for epoch in range(self.opt.num_epoch):
      print('>' * 100)
      print('epoch: ', epoch)
      n_correct, n_total = 0, 0

      for i_batch, sample_batched in enumerate(tqdm.tqdm(self.train_data_loader)):
        self.model.train()
        self.encoder.train()
        global_step += 1
        optimizer.zero_grad()

        targets = sample_batched['polarity'].to(device)
        outputs = self.run_model(sample_batched)
        loss = criterion(outputs, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(params, 5.)
        optimizer.step()
        self.record_track_list(track_list, loss, outputs, targets)

        if global_step % self.opt.log_step == 0:
          print('\n===== Metrics on train: =====')
          print(f'Loss: {sum(track_list["losses"]) / len(track_list["losses"])}')
          # print(classification_report(y_pred=track_list['preds'],
          #                             y_true=track_list['targets'],
          #                             digits=4))

          track_list_dev = self.evaluate('dev', False)

          print('\n===== Metrics on dev: =====')
          print(f'Loss: {sum(track_list_dev["losses"]) / len(track_list_dev["losses"])}')
          print(f"Acc: {accuracy_score(y_pred=track_list_dev['preds'], y_true=track_list_dev['targets'])}")
          print(f"Macro_f1: {f1_score(y_pred=track_list_dev['preds'], y_true=track_list_dev['targets'], average='macro')}")
          # print(classification_report(y_pred=track_list_dev['preds'],
          #                             y_true=track_list_dev['targets'],
          #                             digits=4))

          track_list_test = self.evaluate('test', False)

          print('\n===== Metrics on test: =====')
          print(f'Loss: {sum(track_list_test["losses"]) / len(track_list_test["losses"])}')
          print(f"Acc: {accuracy_score(y_pred=track_list_test['preds'], y_true=track_list_test['targets'])}")
          print(f"Macro_f1: {f1_score(y_pred=track_list_test['preds'], y_true=track_list_test['targets'], average='macro')}")
          # print(classification_report(y_pred=track_list_test['preds'],
          #                             y_true=track_list_test['targets'],
          #                             digits=4))

          dev_acc = accuracy_score(y_pred=track_list_dev['preds'],
                                  y_true=track_list_dev['targets'],)

          if dev_acc > max_dev_acc:
            max_dev_acc = dev_acc
            max_test_acc = accuracy_score(y_pred=track_list_test['preds'], y_true=track_list_test['targets'])
            max_test_f1 = f1_score(y_pred=track_list_test['preds'], y_true=track_list_test['targets'], average='macro')
            print(f"Save the best model with dev_acc = {dev_acc}")
            torch.save({
                'state_dict': self.model.state_dict(),
                'encode_dict': self.encoder.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, f'best_checkpoint_epoch{epoch}.pth')
      print(f"max_test_acc={max_test_acc}, max_test_f1={max_test_f1}")


In [None]:
from argparse import Namespace
import numpy as np
opt = Namespace()
opt.crop_size = 224
opt.embed_dim = 100
opt.hidden_dim = 100
opt.fine_tune_cnn = False
opt.dropout_rate = 0.5
opt.learning_rate = 0.001
opt.num_epoch = 6
opt.batch_size = 10
opt.att_mode = 'ESTR'
opt.device = 'cuda'
opt.polarities_dim = 3
opt.max_seq_len = 100
opt.log_step = 50
opt.inputs_cols = ['text_left_indicator', 'text_right_indicator', 'aspect_indices']
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

trainer = Trainer(opt)
trainer.train()

> training arguments:
>>> crop_size: 224
>>> embed_dim: 100
>>> hidden_dim: 100
>>> fine_tune_cnn: False
>>> dropout_rate: 0.5
>>> learning_rate: 0.001
>>> num_epoch: 6
>>> batch_size: 10
>>> att_mode: ESTR
>>> device: cuda
>>> tfn: False
>>> polarities_dim: 3
>>> max_seq_len: 100
>>> log_step: 50
>>> inputs_cols: ['text_left_indicator', 'text_right_indicator', 'aspect_indices']
loading word vectors...
building embedding_matrix: 100_twitter15_embedding_matrix.dat
--------------./datasets/twitter2015/train.txt---------------


100%|██████████| 3179/3179 [00:25<00:00, 125.14it/s]


the number of problematic samples: 82
--------------./datasets/twitter2015/dev.txt---------------


100%|██████████| 1122/1122 [00:09<00:00, 121.43it/s]


the number of problematic samples: 30
--------------./datasets/twitter2015/test.txt---------------


100%|██████████| 1037/1037 [00:07<00:00, 136.60it/s]


the number of problematic samples: 27




embed.weight
n_trainable_params: 1207319, n_nontrainable_params: 1290500
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch:  0


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 15%|█▌        | 49/318 [00:09<00:26, 10.18it/s]


===== Metrics on train: =====
Loss: 0.9670945143699646
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.9153892656343173
Acc: 0.6096256684491979
Macro_f1: 0.30543837795116807
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)



===== Metrics on test: =====
Loss: 0.898076495872094
Acc: 0.5969141755062681
Macro_f1: 0.2936680517082179
Save the best model with dev_acc = 0.6096256684491979


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 31%|███       | 99/318 [00:32<00:20, 10.45it/s]


===== Metrics on train: =====
Loss: 0.9419023025035859
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.9013284645249359
Acc: 0.6060606060606061
Macro_f1: 0.41535115968772923
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 32%|███▏      | 102/318 [00:50<07:57,  2.21s/it]


===== Metrics on test: =====
Loss: 0.8744753146400819
Acc: 0.6113789778206364
Macro_f1: 0.41320572290629737


 47%|████▋     | 148/318 [00:54<00:16, 10.33it/s]


===== Metrics on train: =====
Loss: 0.9249367839097977
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.8030256265032608
Acc: 0.6550802139037433
Macro_f1: 0.4216982334964237
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 47%|████▋     | 148/318 [01:07<00:16, 10.33it/s]


===== Metrics on test: =====
Loss: 0.7823905878915236
Acc: 0.6721311475409836
Macro_f1: 0.4384518275338875
Save the best model with dev_acc = 0.6550802139037433


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 63%|██████▎   | 199/318 [01:19<00:11, 10.45it/s]


===== Metrics on train: =====
Loss: 0.8849504789710045
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7869640183923519
Acc: 0.64349376114082
Macro_f1: 0.42896760795339045
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 63%|██████▎   | 201/318 [01:37<05:30,  2.83s/it]


===== Metrics on test: =====
Loss: 0.7607840759536395
Acc: 0.6692381870781099
Macro_f1: 0.4660059086627583


 78%|███████▊  | 249/318 [01:42<00:07,  9.73it/s]


===== Metrics on train: =====
Loss: 0.8631223950386048
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7764968372947347
Acc: 0.6595365418894831
Macro_f1: 0.43252355978861495
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)



===== Metrics on test: =====
Loss: 0.7570441170380666
Acc: 0.6750241080038573
Macro_f1: 0.4380849299430283
Save the best model with dev_acc = 0.6595365418894831


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 94%|█████████▎| 298/318 [02:05<00:01, 10.56it/s]


===== Metrics on train: =====
Loss: 0.8479341727495193
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7533417214862014
Acc: 0.6666666666666666
Macro_f1: 0.4824773162407392
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 94%|█████████▎| 298/318 [02:17<00:01, 10.56it/s]


===== Metrics on test: =====
Loss: 0.7129008322954178
Acc: 0.7010607521697203
Macro_f1: 0.5296060390781135
Save the best model with dev_acc = 0.6666666666666666


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
100%|██████████| 318/318 [02:26<00:00,  2.18it/s]


max_test_acc=0.7010607521697203, max_test_f1=0.5296060390781135
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch:  1


  9%|▉         | 30/318 [00:02<00:27, 10.46it/s]


===== Metrics on train: =====
Loss: 0.8235619186503547
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7864071697787901
Acc: 0.661319073083779
Macro_f1: 0.4163527044211514
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 10%|█         | 32/318 [00:21<13:22,  2.81s/it]


===== Metrics on test: =====
Loss: 0.7559776379225346
Acc: 0.6711668273866924
Macro_f1: 0.4228227584862558


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 25%|██▌       | 80/318 [00:25<00:22, 10.37it/s]


===== Metrics on train: =====
Loss: 0.8084261029213667
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7631373608534315
Acc: 0.6586452762923352
Macro_f1: 0.5293446756861391
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 26%|██▌       | 83/318 [00:44<09:28,  2.42s/it]


===== Metrics on test: =====
Loss: 0.7582512874729358
Acc: 0.6682738669238187
Macro_f1: 0.5485564014343761


 41%|████      | 131/318 [00:48<00:17, 10.42it/s]


===== Metrics on train: =====
Loss: 0.7980495541625553
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.750130610787763
Acc: 0.6648841354723708
Macro_f1: 0.4865342638644106
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 42%|████▏     | 133/318 [01:07<08:05,  2.63s/it]


===== Metrics on test: =====
Loss: 0.7043721072662336
Acc: 0.7029893924783028
Macro_f1: 0.5466378000111476


 57%|█████▋    | 180/318 [01:11<00:13, 10.48it/s]


===== Metrics on train: =====
Loss: 0.7910173921585083
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.738279866697514
Acc: 0.661319073083779
Macro_f1: 0.5264426168741365
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 58%|█████▊    | 184/318 [01:29<04:25,  1.98s/it]


===== Metrics on test: =====
Loss: 0.7086128901976806
Acc: 0.703953712632594
Macro_f1: 0.58473411154345


 73%|███████▎  | 231/318 [01:34<00:09,  9.50it/s]


===== Metrics on train: =====
Loss: 0.7850897190245715
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7241386710542493
Acc: 0.6657754010695187
Macro_f1: 0.532101584446823
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 73%|███████▎  | 233/318 [01:52<05:30,  3.88s/it]


===== Metrics on test: =====
Loss: 0.6946076426941615
Acc: 0.7087753134040502
Macro_f1: 0.5767688562863749


 88%|████████▊ | 280/318 [01:57<00:03, 10.42it/s]


===== Metrics on train: =====
Loss: 0.7773364414771398
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7361252735805722
Acc: 0.6657754010695187
Macro_f1: 0.5280332248142434
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 89%|████████▉ | 284/318 [02:15<01:07,  1.99s/it]


===== Metrics on test: =====
Loss: 0.712865736335516
Acc: 0.7174541947926711
Macro_f1: 0.6072724546408756


100%|██████████| 318/318 [02:19<00:00,  2.29it/s]


max_test_acc=0.7010607521697203, max_test_f1=0.5296060390781135
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch:  2


  4%|▍         | 12/318 [00:01<00:28, 10.56it/s]


===== Metrics on train: =====
Loss: 0.7735127216119032
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7265827965024299
Acc: 0.6773618538324421
Macro_f1: 0.519167943689482
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  4%|▍         | 12/318 [00:12<00:28, 10.56it/s]


===== Metrics on test: =====
Loss: 0.6884776803736503
Acc: 0.7145612343297975
Macro_f1: 0.5597297791569548
Save the best model with dev_acc = 0.6773618538324421


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 20%|█▉        | 63/318 [00:24<00:24, 10.47it/s]


===== Metrics on train: =====
Loss: 0.7626343347132206
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7516711469640774
Acc: 0.6639928698752228
Macro_f1: 0.5311173182458644
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 21%|██        | 66/318 [00:42<09:19,  2.22s/it]


===== Metrics on test: =====
Loss: 0.7024179780139372
Acc: 0.7107039537126326
Macro_f1: 0.5864940113025824


 36%|███▌      | 113/318 [00:47<00:20,  9.88it/s]


===== Metrics on train: =====
Loss: 0.7524060492714246
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7834237273287984
Acc: 0.6559714795008913
Macro_f1: 0.5693805145490706
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 36%|███▌      | 115/318 [01:05<09:53,  2.92s/it]


===== Metrics on test: =====
Loss: 0.7143452442609347
Acc: 0.6885245901639344
Macro_f1: 0.5996177521982684


 51%|█████▏    | 163/318 [01:10<00:14, 10.40it/s]


===== Metrics on train: =====
Loss: 0.7475429455563426
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7681232088171275
Acc: 0.661319073083779
Macro_f1: 0.5227570470017728
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 52%|█████▏    | 165/318 [01:28<06:43,  2.64s/it]


===== Metrics on test: =====
Loss: 0.7024466753579103
Acc: 0.7174541947926711
Macro_f1: 0.6053283093243013


 67%|██████▋   | 212/318 [01:33<00:10, 10.43it/s]


===== Metrics on train: =====
Loss: 0.7423529245397624
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7362536153039047
Acc: 0.6818181818181818
Macro_f1: 0.5483307538277759
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)



===== Metrics on test: =====
Loss: 0.6840448860938733
Acc: 0.703953712632594
Macro_f1: 0.586644926589122
Save the best model with dev_acc = 0.6818181818181818


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 83%|████████▎ | 263/318 [01:56<00:05,  9.51it/s]


===== Metrics on train: =====
Loss: 0.7335537601510683
Start evaluation on dev

===== Metrics on dev: =====
Loss: 0.7309911174584279
Acc: 0.6934046345811051
Macro_f1: 0.5865894997022782
Start evaluation on test


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)



===== Metrics on test: =====
Loss: 0.6968628026258487
Acc: 0.7261330761812922
Macro_f1: 0.6329748030851531
Save the best model with dev_acc = 0.6934046345811051


  aspect_len = torch.tensor(ori_aspect_len, dtype=torch.float).to(self.opt.device)
 87%|████████▋ | 278/318 [02:17<00:13,  2.97it/s]