In [19]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import f1_score
# from google.colab import drive

# drive.mount("/content/drive")


In [20]:
def read_json(path):
    data = []
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            data.append(json.loads(line))
    return data


In [21]:
class BaselineData(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = "[SEP]".join(
            [self.data[idx]["hashtag"]] + self.data[idx]["comments"]
        )

        input_ids, attention_mask = self.__convert_to_id__(sentence)

        if self.data[idx].get("attitudes"):
            label = self.__convert_label__(self.data[idx]["attitudes"])
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
                torch.tensor(label),
            )
        else:
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
            )

    def __convert_to_id__(self, sentence):
        ids = self.tokenizer.encode_plus(sentence)
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        onehot_label = [0] * 24
        for i in label:
            onehot_label[self.label2id[i]] = 1
        return onehot_label

    def __padding__(self, sentence):
        sentence = sentence[: self.pad_size]  # 长就截断
        sentence = sentence + [0] * (self.pad_size - len(sentence))  # 短就补充
        return sentence


In [22]:
class Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.bert = BertModel.from_pretrained(config.PTM)
        self.bert_config = BertConfig.from_pretrained(config.PTM)
        self.fc = nn.Linear(self.bert_config.hidden_size, self.bert_config.hidden_size)
        self.fc1 = nn.Linear(self.bert_config.hidden_size, config.label_num)
        self.act = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.bert(input_ids=x[0], attention_mask=x[1]).pooler_output
        x = self.sigmoid(self.fc1(self.act(self.fc(x))))
        return x


In [23]:
def train(config, dataset, model, optimizer, valid_dataset):
    max_scores = 0
    for epoch in range(config.epochs):
        with tqdm(total=len(dataset)) as pbar:
            for idx, data in enumerate(dataset):
                x = [data[0].long().to(config.device), data[1].long().to(config.device)]
                y = data[2].float().to(config.device)
                y_hat = model(x)
                loss = F.binary_cross_entropy(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pbar.set_postfix({"loss": "{:.4f}".format(loss)})
                pbar.update(1)
        scores = valid(config, valid_dataset, model)
        if scores >= max_scores:
            max_scores = scores
            saved_model = model
    return saved_model


def valid(config, dataset, model):
    true = []
    pred = []
    with torch.no_grad():
        for idx, data in enumerate(dataset):
            x = [data[0].long().to(config.device), data[1].long().to(config.device)]
            y = data[2].float().to(config.device).view(-1, 24).tolist()
            y_hat = model(x).view(-1, 24).tolist()
            true.extend(y)
            pred.extend(y_hat)

    pred = [[1 if i >= 0.5 else 0 for i in j] for j in pred]

    micro_f1 = f1_score(pred, true, average="micro")
    macro_f1 = f1_score(pred, true, average="macro")
    print("micro_f1: {:.4f}".format(micro_f1))
    print("micro_f1: {:.4f}".format(macro_f1))
    return micro_f1 * 0.4 + macro_f1 * 0.6


import heapq


def generate_test_result(config, dataset, model):
    with torch.no_grad():
        predict = []
        for idx, data in enumerate(dataset):
            x = [data[0].long().to(config.device), data[1].long().to(config.device)]
            predict.extend(model(x).view(-1, 24).tolist())
    with open("submit.txt", "w", encoding="utf-8") as f:
        for i in range(len(predict)):
            line = []

            max_number = heapq.nlargest(3, predict[i])
            # max_index = []
            for t in max_number:
                index = predict[i].index(t)
                # max_index.append(index)
                line.append(config.id2label[index])

            # for j in range(len(predict[i])):
            #     if predict[i][j] >= 0.5:
            #         line.append(config.id2label[j])

            f.write(" ".join([str(i)] + line))
            f.write("\n")


In [24]:
import heapq

m = [
    0.016138209030032158,
    0.1027437224984169,
    0.007721974980086088,
    0.015482794493436813,
    0.03860097751021385,
    0.01681477390229702,
    0.0018505491316318512,
    0.2727064788341522,
    0.10750023275613785,
    0.10568385571241379,
    0.47227057814598083,
    0.059659115970134735,
    0.02180691622197628,
    0.7891892194747925,
    0.618352472782135,
    0.23794208467006683,
    0.011238433420658112,
    0.11568883806467056,
    0.07733272016048431,
    0.009712048806250095,
    0.011674805544316769,
    0.12361800670623779,
    0.00770289683714509,
    0.055381402373313904,
]
max_number = heapq.nlargest(3, m)
max_index = []
for t in max_number:
    index = m.index(t)
    max_index.append(index)
print(max_number)
print(max_index)


[0.7891892194747925, 0.618352472782135, 0.47227057814598083]
[13, 14, 10]


In [25]:
class Config:
    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        self.epochs = 15
        self.PTM = "bert-base-chinese"
        self.label_num = 24
        self.device = "cuda:0"
        self.lr = 5e-5

        label_dic = [
            "[微笑]",
            "[嘻嘻]",
            "[笑cry]",
            "[怒]",
            "[泪]",
            "[允悲]",
            "[憧憬]",
            "[doge]",
            "[并不简单]",
            "[思考]",
            "[费解]",
            "[吃惊]",
            "[拜拜]",
            "[吃瓜]",
            "[赞]",
            "[心]",
            "[伤心]",
            "[蜡烛]",
            "[给力]",
            "[威武]",
            "[跪了]",
            "[中国赞]",
            "[给你小心心]",
            "[酸]",
        ]

        self.id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
        self.label2id = {v: k for k, v in enumerate(label_dic)}


In [26]:
config = Config()
train_data = read_json("train.json")
valid_data = read_json("valid.json")
test_data = read_json("test.json")

tokenizer = BertTokenizer.from_pretrained(config.PTM)

train_dataloader = DataLoader(
    BaselineData(train_data, tokenizer, config), batch_size=config.batch_size
)
valid_dataloader = DataLoader(
    BaselineData(valid_data, tokenizer, config), batch_size=config.batch_size
)
test_dataloader = DataLoader(BaselineData(test_data, tokenizer, config), batch_size=1)


In [27]:
train_data[8]


{'hashtag': '任嘉伦谭松韵互叫本名',
 'attitudes': ['[笑cry]', '[允悲]', '[心]'],
 'comments': ['晶晶啊晶晶啊',
  '哈哈哈哈哈哈两个人好可爱',
  '突然觉得自己看锦衣之下之前就是个瞎子',
  '喊了四个晶晶啊，因为锦衣之下，因为陆大人今夏，嘉伦松韵成了家人一样的亲切关系，祝福友谊之树长青。@任嘉伦Allen @谭松韵seven @锦衣之下电视剧',
  '国超“晶晶啊，晶晶啊，晶晶啊”，晶晶“哎”，国超“你可以搜一下任嘉伦”，晶晶“我嗖啦”。']}

In [28]:
train_data[37]


{'hashtag': '武汉一餐厅疫情期送8000多份爱心餐',
 'attitudes': ['[心]', '[泪]', '[赞]'],
 'comments': ['餐厅名字，三串肆季  王家墩 送我上去',
  '活该你店火🔥你知道吧',
  '很棒的母亲',
  '总有人竭力温暖这个世界！致敬！可敬的母亲',
  '可敬的妈妈，您的孩子一定以你为榜样的，母亲节快乐❤️']}

In [29]:
train_data[44]


{'hashtag': '近视眼的日常',
 'attitudes': ['[允悲]', '[泪]', '[doge]'],
 'comments': ['脑洞翻唱 【wherever you are→http://t.cn/AiO9oJhZ】 【肥宅的惰性→http://t.cn/AiOSBn6m】【星巴克的约定→http://t.cn/AiWqi9aB】 【吃货肥宅们的日常→http://t.cn/AiWqi9am】【熊孩纸→ http://t.cn/AiF2FclT】【理发店→ http://t.cn/AiF2FclN】【做饭难吃→ http://t.cn/A6hSy9ZY】',
  '你說什麼等等我沒戴眼鏡聽不清',
  '以前做视力检查，问过医生，指视力表的棍子在哪（眯眼',
  '曾经没戴眼镜，以为是叶子，结果徒手抓了只壁虎',
  '  “夜晚的灯光好美啊 看起来像彩灯秀”太真实了吧！！！散光眼表示摘下眼镜 可以被路灯晃晕']}

In [30]:
import re


# @ with : and @ without :
def clean_at(text):
    at_pattern = re.compile("@\S*:", re.S)
    text = re.sub(at_pattern, "", text)
    at_pattern = re.compile("@\S*", re.S)
    text = re.sub(at_pattern, "", text)
    return text.strip()


# Clear url in comments
def clean_url(text):
    sentences = text.split(" ")
    # 处理http://类链接
    url_pattern = re.compile(r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\-)*\b", re.S)
    # 处理无http://类链接
    domain_pattern = re.compile(r"(\b)*(.*?)\.(com|cn)")
    if len(sentences) > 0:
        result = []
        for item in sentences:
            text = re.sub(url_pattern, "", item)
            text = re.sub(domain_pattern, "", text)
            result.append(text)
        return " ".join(result)
    else:
        return re.sub(url_pattern, "", sentences)


for i in range(len(train_data)):
    for j in range(len(train_data[i]["comments"])):
        text = train_data[i]["comments"][j]
        text_clean_at = clean_at(text)
        text_clean_url = clean_url(text_clean_at)
        train_data[i]["comments"][j] = text_clean_url

train_data[44]


{'hashtag': '近视眼的日常',
 'attitudes': ['[允悲]', '[泪]', '[doge]'],
 'comments': ['脑洞翻唱 【wherever you are→】 【肥宅的惰性→】【星巴克的约定→】 【吃货肥宅们的日常→】【熊孩纸→ 】【理发店→ 】【做饭难吃→ 】',
  '你說什麼等等我沒戴眼鏡聽不清',
  '以前做视力检查，问过医生，指视力表的棍子在哪（眯眼',
  '曾经没戴眼镜，以为是叶子，结果徒手抓了只壁虎',
  '“夜晚的灯光好美啊 看起来像彩灯秀”太真实了吧！！！散光眼表示摘下眼镜 可以被路灯晃晕']}

In [31]:
model = Model(config).to(config.device)
optimizer = torch.optim.AdamW(model.parameters(), config.lr)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
best_model = train(config, train_dataloader, model, optimizer, valid_dataloader)
generate_test_result(config, test_dataloader, best_model)


  0%|          | 0/300 [00:00<?, ?it/s]


AssertionError: Torch not compiled with CUDA enabled

In [None]:
import sys

label_dic = [
    "[微笑]",
    "[嘻嘻]",
    "[笑cry]",
    "[怒]",
    "[泪]",
    "[允悲]",
    "[憧憬]",
    "[doge]",
    "[并不简单]",
    "[思考]",
    "[费解]",
    "[吃惊]",
    "[拜拜]",
    "[吃瓜]",
    "[赞]",
    "[心]",
    "[伤心]",
    "[蜡烛]",
    "[给力]",
    "[威武]",
    "[跪了]",
    "[中国赞]",
    "[给你小心心]",
    "[酸]",
]
id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
label2id = {v: k for k, v in enumerate(label_dic)}


def convert_label(fn_result):
    convert_label = []

    for line in open(fn_result, "r", encoding="utf-8"):
        labellist = line.strip().split(" ")[1:]

        onehot_label = [0] * 24
        for label in labellist:
            onehot_label[label2id[label]] = 1
        convert_label.append(onehot_label)
    return convert_label


def score_calculation(pred, true):
    macro_f1 = f1_score(pred, true, average="macro")
    return macro_f1


def main():
    """
    Generate classification_report from given pred and gold tsv files.
    """

    # Usage:
    # python evaluate PREDICTION_FILE GOLD_ANSWER_FILE
    # Example:
    # python evaluate pred.tsv gold.tsv
    print(len(sys.argv))
    print("The list of command line arguments:\n", sys.argv)

    if len(sys.argv) < 3:
        print("Please indicate the prediction and gold tsvs.")
        quit()
    else:
        pred_fn = sys.argv[1]
        gold_fn = sys.argv[2]

    print("Loading the datasets ...")
    pred_lbl = convert_label(pred_fn)
    gold_lbl = convert_label(gold_fn)

    print("Evaluating ...")
    try:
        macro_f1 = score_calculation(pred_lbl, gold_lbl)

        print("macro_f1: {:.4f}".format(macro_f1))
    except Exception as ex:
        print("error:", ex)
