In [3]:
import json
import re
import torch
import heapq
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import f1_score
import sys
# from google.colab import drive

# drive.mount("/content/drive")


Mounted at /content/drive


In [4]:
def read_json(path):
    data = []
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            data.append(json.loads(line))
    return data


In [5]:
class BaselineData(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = "[SEP]".join(
            [self.data[idx]["hashtag"]] + self.data[idx]["comments"]
        )

        input_ids, attention_mask = self.__convert_to_id__(sentence)

        if self.data[idx].get("attitudes"):
            label = self.__convert_label__(self.data[idx]["attitudes"])
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
                torch.tensor(label),
            )
        else:
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
            )

    def __convert_to_id__(self, sentence):
        ids = self.tokenizer.encode_plus(sentence)
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        onehot_label = [0] * 24
        for i in label:
            onehot_label[self.label2id[i]] = 1
        return onehot_label

    def __padding__(self, sentence):
        sentence = sentence[: self.pad_size]  # 长就截断
        sentence = sentence + [0] * (self.pad_size - len(sentence))  # 短就补充
        return sentence


In [6]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = BertModel.from_pretrained(config.PTM)
        self.bert_config = BertConfig.from_pretrained(config.PTM)
        self.fc = nn.Linear(self.bert_config.hidden_size, self.bert_config.hidden_size)
        self.fc1 = nn.Linear(self.bert_config.hidden_size, config.label_num)
        self.act = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.bert(input_ids=x[0], attention_mask=x[1]).pooler_output
        x = self.sigmoid(self.fc1(self.act(self.fc(x))))
        return x

In [7]:
def train(config, dataset, model, optimizer, valid_dataset):
    max_scores = 0
    for epoch in range(config.epochs):
        with tqdm(total=len(dataset)) as pbar:
            for idx, data in enumerate(dataset):
                x = [data[0].long().to(config.device), data[1].long().to(config.device)]
                y = data[2].float().to(config.device)
                y_hat = model(x)
                loss = F.binary_cross_entropy(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pbar.set_postfix({'loss' : '{:.4f}'.format(loss)})
                pbar.update(1)
        scores = valid(config, valid_dataset, model)
        if scores >= max_scores:
            max_scores = scores
            saved_model = model
    return saved_model

def valid(config, dataset, model):
    true = []
    pred = []
    with torch.no_grad():
        for idx, data in enumerate(dataset):
            x = [data[0].long().to(config.device), data[1].long().to(config.device)]
            y = data[2].float().to(config.device).view(-1, 24).tolist()
            y_hat = model(x).view(-1, 24).tolist()
            true.extend(y)
            pred.extend(y_hat)

    pred = [[1 if i>=0.5 else 0 for i in j] for j in pred]

    micro_f1 = f1_score(pred, true, average='micro')
    macro_f1 = f1_score(pred, true, average='macro')
    print('micro_f1: {:.4f}'.format(micro_f1))
    print('macro_f1: {:.4f}'.format(macro_f1))
    return micro_f1*0.4+macro_f1*0.6

def find_three_emojis(m):
    max_number = heapq.nlargest(3, m)
    max_index = []
    for t in max_number:
        index = m.index(t)
        max_index.append(index)
    return max_index

def generate_test_result(config, dataset, model):
    with torch.no_grad():
        predict = []
        for idx, data in enumerate(dataset):
            x = [data[0].long().to(config.device), data[1].long().to(config.device)]
            predict.extend(model(x).view(-1, 24).tolist())
    with open('/content/drive/MyDrive/FYP_Dataset/submit.txt', 'w', encoding='utf-8') as f:
        for i in range(len(predict)):
            line = []
            index = find_three_emojis(predict[i])
            for j in index:
                line.append(config.id2label[j])
            f.write(' '.join([str(i)]+line))
            f.write('\n')

In [8]:
class Config():
    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        #24
        self.epochs = 4
        #15
        self.PTM = 'bert-base-chinese'
        self.label_num = 24
        self.device = 'cuda:0'
        self.lr = 5e-5
        label_dic = ['[微笑]', '[嘻嘻]', '[笑cry]', '[怒]', '[泪]', '[允悲]', '[憧憬]', '[doge]', '[并不简单]', '[思考]', '[费解]', '[吃惊]', '[拜拜]', '[吃瓜]', '[赞]', '[心]', '[伤心]', '[蜡烛]', '[给力]', '[威武]', '[跪了]', '[中国赞]', '[给你小心心]', '[酸]']
        
        self.id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
        self.label2id = {v: k for k, v in enumerate(label_dic)}

In [9]:
config = Config()
train_data = read_json("/content/drive/My Drive/FYP_Dataset/train.json")
valid_data = read_json("/content/drive/My Drive/FYP_Dataset/valid.json")
test_data = read_json("/content/drive/My Drive/FYP_Dataset/test.json")


In [10]:
#@ with : and @ without :
def clean_at(text):
    at_pattern = re.compile("@\S*:", re.S)
    text = re.sub(at_pattern, "", text)
    at_pattern = re.compile("@\S*", re.S)
    text = re.sub(at_pattern, "", text)
    return text.strip()

#Clear url in comments
def clean_url(text):
    sentences = text.split(' ')
    # handle http:// link
    url_pattern = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b', re.S)
    # handle link without http://
    domain_pattern = re.compile(r'(\b)*(.*?)\.(com|cn)')
    if len(sentences) > 0:
        result = []
        for item in sentences:
            text = re.sub(url_pattern, '', item)
            text = re.sub(domain_pattern,'', text)
            result.append(text)
        return ' '.join(result)
    else:
        return re.sub(url_pattern, '', sentences)

for i in range(len(train_data)):
    for j in range(len(train_data[i]["comments"])):
        text = train_data[i]["comments"][j]
        text_clean_at = clean_at(text)
        text_clean_url = clean_url(text_clean_at)
        train_data[i]["comments"][j] = text_clean_url

In [11]:
tokenizer = BertTokenizer.from_pretrained(config.PTM)

train_dataloader = DataLoader(BaselineData(train_data, tokenizer, config), batch_size=config.batch_size)
valid_dataloader = DataLoader(BaselineData(valid_data, tokenizer, config), batch_size=config.batch_size)
test_dataloader = DataLoader(BaselineData(test_data, tokenizer, config), batch_size=1)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [12]:
model = Model(config).to(config.device)
optimizer = torch.optim.AdamW(model.parameters(), config.lr)

Downloading pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
best_model = train(config, train_dataloader, model, optimizer, valid_dataloader)
generate_test_result(config, test_dataloader, best_model)

  7%|▋         | 22/300 [00:42<08:59,  1.94s/it, loss=0.4027]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 300/300 [10:24<00:00,  2.08s/it, loss=0.2771]


micro_f1: 0.3617
macro_f1: 0.1385


100%|██████████| 300/300 [10:30<00:00,  2.10s/it, loss=0.2245]


micro_f1: 0.4497
macro_f1: 0.2019


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.1862]


micro_f1: 0.4894
macro_f1: 0.2379


100%|██████████| 300/300 [10:30<00:00,  2.10s/it, loss=0.1723]


micro_f1: 0.5188
macro_f1: 0.2635


100%|██████████| 300/300 [10:30<00:00,  2.10s/it, loss=0.1556]


micro_f1: 0.4994
macro_f1: 0.2597


100%|██████████| 300/300 [10:30<00:00,  2.10s/it, loss=0.1304]


micro_f1: 0.5346
macro_f1: 0.2829


100%|██████████| 300/300 [10:32<00:00,  2.11s/it, loss=0.0957]


micro_f1: 0.5348
macro_f1: 0.2944


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0759]


micro_f1: 0.5143
macro_f1: 0.3179


100%|██████████| 300/300 [10:31<00:00,  2.11s/it, loss=0.0594]


micro_f1: 0.5300
macro_f1: 0.3194


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0496]


micro_f1: 0.5401
macro_f1: 0.3246


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0277]


micro_f1: 0.5448
macro_f1: 0.3268


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0273]


micro_f1: 0.5408
macro_f1: 0.3285


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0201]


micro_f1: 0.5360
macro_f1: 0.3401


100%|██████████| 300/300 [10:31<00:00,  2.11s/it, loss=0.0075]


micro_f1: 0.5467
macro_f1: 0.3587


100%|██████████| 300/300 [10:31<00:00,  2.10s/it, loss=0.0052]


micro_f1: 0.5438
macro_f1: 0.3450


In [14]:
label_dic = ['[微笑]', '[嘻嘻]', '[笑cry]', '[怒]', '[泪]', '[允悲]', '[憧憬]', '[doge]', '[并不简单]', '[思考]', '[费解]', '[吃惊]',
                     '[拜拜]',
                     '[吃瓜]', '[赞]', '[心]', '[伤心]', '[蜡烛]', '[给力]', '[威武]', '[跪了]', '[中国赞]', '[给你小心心]', '[酸]']
id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
label2id = {v: k for k, v in enumerate(label_dic)}

def convert_label(fn_result):
    convert_label = []

    for line in open(fn_result, 'r', encoding='utf-8'):
        labellist = line.strip().split(' ')[1:]

        onehot_label = [0] * 24
        for label in labellist:
            onehot_label[label2id[label]] = 1
        convert_label.append(onehot_label)
    return convert_label

def macro_calculation(pred, true):
    macro_f1 = f1_score(pred, true, average='macro')
    return macro_f1

def micro_calculation(pred, true):
    micro_f1 = f1_score(pred, true, average='micro')
    return micro_f1

def evaluate():
    """
    Generate classification_report from given pred and gold tsv files.
    """

    pred_fn = "/content/drive/MyDrive/FYP_Dataset/submit.txt"
    gold_fn = "/content/drive/MyDrive/FYP_Dataset/test_golden.txt"
    print('Loading the datasets ...')
    pred_lbl = convert_label(pred_fn)
    gold_lbl = convert_label(gold_fn)

    print("Evaluating ...")
    try:
        macro_f1 = macro_calculation(pred_lbl, gold_lbl)
        print('macro_f1: {:.4f}'.format(macro_f1))
        micro_f1 = micro_calculation(pred_lbl, gold_lbl)
        print('micro_f1: {:.4f}'.format(micro_f1))
    except Exception as ex:
        print('error:', ex)


In [15]:
evaluate()

Loading the datasets ...
Evaluating ...
macro_f1: 0.3623
micro_f1: 0.5392
