In [31]:
from PIL import Image
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split

from transformers import AdamW, get_linear_schedule_with_warmup, BertForMaskedLM, BertConfig, BertModel, BertTokenizer

In [32]:
with open('./train.txt', 'r') as f:
  lines = f.readlines()

train_set = []

for line in lines[1:]:
  data = {}
  line = line.replace('\n','')
  guid, tag = line.split(',')
  if tag == 'positive':
    label = 0
  elif tag == 'neutral':
    label = 1
  else:
    label = 2
  data['guid'] = guid
  data['label'] = label
  train_set.append(data)

print(len(train_set)) # 4000
# print(train_set)

4000


In [33]:
with open('./test_without_label.txt', 'r') as f:
  lines = f.readlines()

test_set = []
for line in lines[1:]:
  data = {}
  data['guid'] = line.split(',')[0]
  test_set.append(data)

In [34]:
def data_process(dataset):
  for data in dataset:
    guid = data['guid']
    image_path = './data/' + guid + '.jpg'
    image = Image.open(image_path).convert('RGB')
    array = np.array(image.resize((224, 224)))
    data['image'] = array.reshape((3, 224, 224))

    text_path = './data/' + guid + '.txt'
    f = open(text_path, 'r', errors='ignore')
    lines = f.readlines()
    # print(lines)
    text = ''
    for line in lines:
      text += line
    data['text'] = text

In [35]:
data_process(train_set)
data_process(test_set)

In [36]:
train_set_num = 3500
valid_set_num = 500
train_set, valid_set = random_split(train_set, [train_set_num, valid_set_num])

In [37]:
__all__ = ['ResNet50', 'ResNet101','ResNet152']

def Conv1(in_planes, places, stride=2):
    return nn.Sequential(
        nn.Conv2d(in_channels=in_planes,out_channels=places,kernel_size=7,stride=stride,padding=3, bias=False),
        nn.BatchNorm2d(places),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    )

class Bottleneck(nn.Module):
    def __init__(self,in_places,places, stride=1,downsampling=False, expansion = 4):
        super(Bottleneck,self).__init__()
        self.expansion = expansion
        self.downsampling = downsampling

        self.bottleneck = nn.Sequential(
            nn.Conv2d(in_channels=in_places,out_channels=places,kernel_size=1,stride=1, bias=False),
            nn.BatchNorm2d(places),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=places, out_channels=places, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(places),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=places, out_channels=places*self.expansion, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(places*self.expansion),
        )

        if self.downsampling:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels=in_places, out_channels=places*self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(places*self.expansion)
            )
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        residual = x
        out = self.bottleneck(x)

        if self.downsampling:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self,blocks, num_classes=3, expansion = 4):
        super(ResNet,self).__init__()
        self.expansion = expansion

        self.conv1 = Conv1(in_planes = 3, places= 64)

        self.layer1 = self.make_layer(in_places = 64, places= 64, block=blocks[0], stride=1)
        self.layer2 = self.make_layer(in_places = 256,places=128, block=blocks[1], stride=2)
        self.layer3 = self.make_layer(in_places=512,places=256, block=blocks[2], stride=2)
        self.layer4 = self.make_layer(in_places=1024,places=512, block=blocks[3], stride=2)

        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(2048,num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def make_layer(self, in_places, places, block, stride):
        layers = []
        layers.append(Bottleneck(in_places, places,stride, downsampling =True))
        for i in range(1, block):
            layers.append(Bottleneck(places*self.expansion, places))

        return nn.Sequential(*layers)


    def forward(self, x):
        x = self.conv1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

def ResNet50():
    return ResNet([3, 4, 6, 3])

def ResNet101():
    return ResNet([3, 4, 23, 3])

def ResNet152():
    return ResNet([3, 8, 36, 3])





In [38]:
image_train = []
image_train_labels = []
image_valid = []
image_valid_labels = []

for data in train_set:
  image_train.append(data['image'])
  image_train_labels.append(data['label'])

for data in valid_set:
  image_valid.append(data['image'])
  image_valid_labels.append(data['label'])

image_train = torch.from_numpy(np.array(image_train))
image_train_labels = torch.from_numpy(np.array(image_train_labels))
image_valid = torch.from_numpy(np.array(image_valid))
image_valid_labels = torch.from_numpy(np.array(image_valid_labels))

train_loader = DataLoader(TensorDataset(image_train, image_train_labels), batch_size=100, shuffle=True)
valid_loader = DataLoader(TensorDataset(image_valid, image_valid_labels), batch_size=50)

In [39]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [40]:
if __name__=='__main__':
    #model = torchvision.models.resnet50()
    image_model = ResNet50()
    image_model.to(device)

epoch_num = 10
learning_rate = 1e-5
total_step = epoch_num * len(train_loader)

optimizer = AdamW(image_model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*total_step, num_training_steps=total_step)


# optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
criterion = nn.CrossEntropyLoss()

In [41]:
for epoch in range(epoch_num):
  running_loss = 0
  for i, data in enumerate(train_loader):
    inputs, labels = data
    inputs = inputs.float()
    inputs = inputs.to(device)
    labels = labels.to(device)
    # print(inputs.shape)
    outputs = image_model(inputs)
    # print(outputs.shape)
    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()
    running_loss += loss.item()
  print('epoch: %d  loss: %.3f' % (epoch+1, running_loss / 35))
  running_loss = 0

correct_num = 0
total_num = 0

with torch.no_grad():
  for data in valid_loader:
    inputs, answers = data
    inputs = inputs.float()
    inputs = inputs.to(device)
    answers = answers.to(device)
    outputs = image_model(inputs)
    _, predicted = torch.max(outputs.data, 1)
    for i in range(len(predicted.tolist())):
      total_num += answers.size(0)
      correct_num += (predicted == answers).sum().item()

print('Training Accuracy: %.3f%%' % (100 * correct_num / total_num))

epoch: 1  loss: 1.237
epoch: 2  loss: 0.933
epoch: 3  loss: 0.893
epoch: 4  loss: 0.888
epoch: 5  loss: 0.885
epoch: 6  loss: 0.882
epoch: 7  loss: 0.884
epoch: 8  loss: 0.879
epoch: 9  loss: 0.877
epoch: 10  loss: 0.875
Training Accuracy: 56.600%


In [42]:
checkpoint = './bert_chinese'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
config = BertConfig.from_pretrained(checkpoint, output_hidden_states = True, output_attentions=True)
assert config.output_hidden_states == True
assert config.output_attentions == True
# bert_model = BertModel.from_pretrained(checkpoint, config=config)
bert_model = BertForMaskedLM.from_pretrained(checkpoint, config=config)

Some weights of the model checkpoint at ./bert_chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
class TextClassifier(nn.Module):
  def __init__(self):
    super(TextClassifier, self).__init__()
    self.model = bert_model
    self.model = self.model.to(device)
    self.dropout = nn.Dropout(0)
    # self.model.to(device)
    # self.fc = nn.Linear(768, 3)
    self.fc = nn.Linear(98304, 3)
  
  def forward(self, x, attn_mask=None):
    x = x.to(device)
    attn_mask = attn_mask.to(device)
    output = self.model(x, attention_mask=attn_mask)
    output = output.hidden_states[-1]
    # print(output.shape)
    # output = output.logits
    
    output = torch.flatten(output, 1)
    output = self.fc(output)
    return output

In [44]:
text_train = []
text_valid = []

for data in train_set:
  tokenized_text = tokenizer(data['text'], max_length=128, padding='max_length', truncation=True)
  # tokenized_text['input_ids'] = torch.from_numpy(np.array(tokenized_text['input_ids']))
  tokenized_text['label'] = data['label']
  text_train.append(tokenized_text)

for data in valid_set:
  tokenized_text = tokenizer(data['text'], max_length=128, padding='max_length', truncation=True)
  tokenized_text['label'] = data['label']
  text_valid.append(tokenized_text)

In [45]:
class TextDataset(Dataset):
  def __init__(self, data):
    super(TextDataset, self).__init__()
    self.data = data
  
  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    input_ids = self.data[idx]['input_ids']
    attn_mask = self.data[idx]['attention_mask']
    label = self.data[idx]['label']
    return input_ids, attn_mask, label

train_loader = DataLoader(TextDataset(text_train), batch_size=25, shuffle=True)
valid_loader = DataLoader(TextDataset(text_valid), batch_size=25)

In [46]:
text_model = TextClassifier()
text_model.to(device)
# classifier.model.to(device)

epoch_num = 20
learning_rate = 1e-5
total_step = epoch_num * len(train_loader)

optimizer = AdamW(text_model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*total_step, num_training_steps=total_step)

criterion = nn.CrossEntropyLoss()

In [47]:
# classifier.train()

for epoch in range(epoch_num):
  running_loss = 0
  for i, data in enumerate(train_loader):
    input_ids, attn_mask, labels = data
    input_ids = torch.tensor([item.numpy() for item in input_ids])
    attn_mask = torch.tensor([item.numpy() for item in attn_mask])
    input_ids = input_ids.T
    attn_mask = attn_mask.T
    # labels = torch.tensor([item.numpy() for item in labels])
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)
    labels = labels.to(device)

    # print(input_ids.shape)
    # print(attn_mask.shape)

    outputs = text_model(input_ids, attn_mask)
    # outputs = bert_model(input_ids)

    loss = criterion(outputs, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()
    
    running_loss += loss.item()
  print('epoch: %d  loss: %.3f' % (epoch+1, running_loss/140))
  running_loss = 0
    

correct_num = 0
total_num = 0
with torch.no_grad():
  for data in valid_loader:
    input_ids, attn_mask, labels = data
    input_ids = torch.tensor([item.numpy() for item in input_ids])
    input_ids = input_ids.T
    attn_mask = torch.tensor([item.numpy() for item in attn_mask])
    attn_mask = attn_mask.T
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)
    labels = labels.to(device)
    
    outputs = text_model(input_ids, attn_mask)
    _, predicted = torch.max(outputs.data, 1)
    for i in range(len(predicted.tolist())):
      total_num += labels.size(0)
      correct_num += (predicted == labels).sum().item()

print('Training Accuracy: %.3f%%' % (100 * correct_num / total_num))

epoch: 1  loss: 0.915
epoch: 2  loss: 0.800
epoch: 3  loss: 0.605
epoch: 4  loss: 0.317
epoch: 5  loss: 0.147
epoch: 6  loss: 0.097
epoch: 7  loss: 0.083
epoch: 8  loss: 0.060
epoch: 9  loss: 0.056
epoch: 10  loss: 0.047
epoch: 11  loss: 0.044
epoch: 12  loss: 0.040
epoch: 13  loss: 0.040
epoch: 14  loss: 0.038
epoch: 15  loss: 0.035
epoch: 16  loss: 0.033
epoch: 17  loss: 0.031
epoch: 18  loss: 0.031
epoch: 19  loss: 0.028
epoch: 20  loss: 0.027
Training Accuracy: 61.400%


In [48]:
class MultimodalDataset(Dataset):
  def __init__(self, data):
    super(MultimodalDataset, self).__init__()
    self.data = data

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, idx):
    guid = self.data[idx]['guid']
    input_ids = torch.tensor(self.data[idx]['input_ids'])
    attn_mask = torch.tensor(self.data[idx]['attn_mask'])
    image = torch.tensor(self.data[idx]['image'])
    label = self.data[idx].get('label')
    if label is None:
      label = -100
    label = torch.tensor(label)
    return guid, input_ids, attn_mask, image, label

In [49]:
def dataset_process(dataset):
  for data in dataset:
    tokenized_text = tokenizer(data['text'], max_length=128, padding='max_length', truncation=True)
    data['input_ids'] = tokenized_text['input_ids']
    data['attn_mask'] = tokenized_text['attention_mask']

In [50]:
dataset_process(train_set)
dataset_process(valid_set)
dataset_process(test_set)

In [51]:
train_loader = DataLoader(MultimodalDataset(train_set), batch_size=25, shuffle=True)
valid_loader = DataLoader(MultimodalDataset(valid_set), batch_size=25)
test_loader = DataLoader(MultimodalDataset(test_set), batch_size=25)

In [52]:
class MultimodalModel(nn.Module):
  def __init__(self, image_model, text_model, output_features, image_weight=0.5, text_weight=0.5):
    super(MultimodalModel, self).__init__()
    self.image_model = image_model
    self.text_model = text_model
    # 将最后的全连接层删除
    self.image_model.fc = nn.Sequential()  # (batch_num, 512)
    self.text_model.fc = nn.Sequential()    # (batch_num, 768)
    # 文本特征向量和图片特征向量的权重, 默认均为0.5
    self.image_weight = image_weight
    self.text_weight = text_weight
    self.fc1 = nn.Linear((4*512+768*128), output_features)
    self.fc2 = nn.Linear(output_features, 3)

  def forward(self, input_ids, attn_mask, image):
    image_output = self.image_model(image)
    text_output = self.text_model(input_ids, attn_mask)
    output = torch.cat([image_output, text_output], dim=-1)
    output = self.fc1(output)
    output = self.fc2(output)
    return output

In [53]:
multimodal_model = MultimodalModel(image_model=image_model, text_model=text_model, output_features=100, image_weight=0.5, text_weight=0.5)
multimodal_model.to(device)

epoch_num = 10
learning_rate = 1e-5
total_step = epoch_num * len(train_loader)

optimizer = AdamW(multimodal_model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*total_step, num_training_steps=total_step)
criterion = nn.CrossEntropyLoss()

In [54]:
for epoch in range(epoch_num):
  running_loss = 0
  for i, data in enumerate(train_loader):
    _, input_ids, attn_mask, image, label = data
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)
    image = image.to(device)
    image = image.float()
    label = label.to(device)

    outputs = multimodal_model(input_ids=input_ids, attn_mask=attn_mask, image=image)
    # print(outputs.shape)
    loss = criterion(outputs, label)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    running_loss += loss.item()
  print('epoch: %d  loss: %.3f' % (epoch+1, running_loss/140))
  running_loss = 0
    
    
correct_num = 0
total_num = 0
with torch.no_grad():
  for data in valid_loader:
    _, input_ids, attn_mask, image, label = data
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)
    image = image.to(device)
    image = image.float()
    label = label.to(device)
    
    outputs = multimodal_model(input_ids=input_ids, attn_mask=attn_mask, image=image)
    _, predicted = torch.max(outputs.data, 1)
    for i in range(len(predicted.tolist())):
      total_num += label.size(0)
      correct_num += (predicted == label).sum().item()

print('Training Accuracy: %.3f%%' % (100 * correct_num / total_num))

epoch: 1  loss: 0.203
epoch: 2  loss: 0.118
epoch: 3  loss: 0.099
epoch: 4  loss: 0.066
epoch: 5  loss: 0.065
epoch: 6  loss: 0.044
epoch: 7  loss: 0.036
epoch: 8  loss: 0.033
epoch: 9  loss: 0.030
epoch: 10  loss: 0.028
Training Accuracy: 60.400%


In [55]:
test_dict = {}
with torch.no_grad():
  for data in test_loader:
    guid, input_ids, attn_mask, image, label = data
    input_ids = input_ids.to(device)
    attn_mask = attn_mask.to(device)
    image = image.to(device)
    image = image.float()
    label = label.to(device)
    
    outputs = multimodal_model(input_ids=input_ids, attn_mask=attn_mask, image=image)
    _, predicted = torch.max(outputs.data, 1)
    predicted = predicted.tolist()
    for i in range(len(predicted)):
      id = guid[i]
      test_dict[id] = predicted[i]

In [29]:
with open('./test_without_label.txt', 'r') as f:
  lines = f.readlines()

f1 = open('./test.txt', 'w')
f1.write(lines[0])

for line in lines[1:]:
  # print(line)
  guid = line.split(',')[0]
  f1.write(guid)
  f1.write(',')
  label = test_dict[guid]
  if label == 0:
    f1.write('positive\n')
  elif label == 1:
    f1.write('neutral\n')
  else:
    f1.write('negative\n')