In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import f1_score
# from google.colab import drive

# drive.mount("/content/drive")


  from .autonotebook import tqdm as notebook_tqdm


In [32]:
def read_json(path):
    """Load a JSON Lines file into a list of decoded objects.

    Every non-blank line of the file is parsed on its own with
    ``json.loads``. Blank or whitespace-only lines (for example a trailing
    empty line at the end of the file) are skipped instead of raising
    ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as fp:
        # `line.strip()` is empty for blank lines, which json.loads rejects.
        return [json.loads(line) for line in fp if line.strip()]


# Read the three dataset splits, in order: train, validation, test.
train_data, valid_data, test_data = (
    read_json(split_path)
    for split_path in (
        "data_emotion/train.json",
        "data_emotion/valid.json",
        "data_emotion/test_1.json",
    )
)


In [3]:
class Config:
    """Hyper-parameters and label vocabulary shared by the notebook.

    Attributes:
        pad_size: Fixed length every token sequence is cut or padded to.
        batch_size: Batch size handed to the train/validation DataLoaders.
        epochs: Epoch count (presumably consumed by the training loop).
        PTM: Name of the pretrained checkpoint passed to ``from_pretrained``.
        label_num: Number of labels, derived from the label list below.
        device: ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
        lr: Learning rate (presumably consumed by the optimizer).
        id2label: Mapping from label index to label name.
        label2id: Mapping from label name to label index.
    """

    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        self.epochs = 10
        self.PTM = "bert-base-chinese"
        # Fall back to CPU so the notebook still runs on machines without a GPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.lr = 5e-5

        label_dic = [
            "[微笑]",
            "[嘻嘻]",
            "[笑cry]",
            "[怒]",
            "[泪]",
            "[允悲]",
            "[憧憬]",
            "[doge]",
            "[并不简单]",
            "[思考]",
            "[费解]",
            "[吃惊]",
            "[拜拜]",
            "[吃瓜]",
            "[赞]",
            "[心]",
            "[伤心]",
            "[蜡烛]",
            "[给力]",
            "[威武]",
            "[跪了]",
            "[中国赞]",
            "[给你小心心]",
            "[酸]",
        ]

        # Derived from the list so the count can never drift from the labels.
        self.label_num = len(label_dic)

        # Index <-> name lookups for the labels.
        self.id2label = dict(enumerate(label_dic))
        self.label2id = {name: index for index, name in enumerate(label_dic)}


config = Config()


In [4]:
# Load the tokenizer that belongs to the checkpoint named in config.PTM.
tokenizer = BertTokenizer.from_pretrained(config.PTM)


In [5]:
class BaselineData(Dataset):
    """Map-style dataset that turns raw records into fixed-length model inputs.

    Each record is a dict providing a ``"hashtag"`` string and a
    ``"comments"`` list of strings. A record may also carry an
    ``"attitudes"`` list whose entries are keys of ``config.label2id``.
    """

    def __init__(self, data, tokenizer, config):
        """Store the records and the settings needed to encode them.

        Args:
            data: Indexable collection of record dicts.
            tokenizer: Object exposing ``encode_plus`` that returns a mapping
                with ``"input_ids"`` and ``"attention_mask"`` lists.
            config: Object exposing ``pad_size`` and ``label2id``.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns:
            tuple: ``(input_ids, attention_mask, label)`` tensors when the
            record has a non-empty ``"attitudes"`` list, otherwise
            ``(input_ids, attention_mask)``.
        """
        record = self.data[idx]
        # Hashtag and comments become one text, separated by literal "[SEP]".
        sentence = "[SEP]".join([record["hashtag"]] + record["comments"])

        input_ids, attention_mask = self.__convert_to_id__(sentence)
        tensors = (torch.tensor(input_ids), torch.tensor(attention_mask))

        # NOTE(review): a record whose "attitudes" is an empty list is treated
        # as unlabeled; mixing such records with labeled ones in one batch
        # would yield tuples of different lengths. Confirm this is intended.
        attitudes = record.get("attitudes")
        if attitudes:
            tensors += (torch.tensor(self.__convert_label__(attitudes)),)
        return tensors

    def __convert_to_id__(self, sentence):
        """Tokenize ``sentence`` into ``pad_size``-long ids and attention mask.

        Truncation is delegated to the tokenizer so that an over-long text
        keeps its closing special token instead of being cut off mid-sequence.
        """
        ids = self.tokenizer.encode_plus(
            sentence, truncation=True, max_length=self.pad_size
        )
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        """Return a multi-hot list with a 1 at the index of every given label.

        The vector width follows ``label2id`` rather than a fixed constant, so
        the dataset keeps working if the label set changes.

        Raises:
            KeyError: If a label name is missing from ``label2id``.
        """
        onehot_label = [0] * len(self.label2id)
        for name in label:
            onehot_label[self.label2id[name]] = 1
        return onehot_label

    def __padding__(self, sentence):
        """Cut or right-pad the list ``sentence`` to exactly ``pad_size`` items."""
        sentence = sentence[: self.pad_size]  # too long: truncate
        # Too short: fill with zeros (presumably the tokenizer's [PAD] id;
        # confirm for non-BERT vocabularies).
        sentence = sentence + [0] * (self.pad_size - len(sentence))
        return sentence


In [12]:
# Wrap the training split in the dataset class so single samples can be
# inspected by index. The validation/test counterparts are left disabled.
train_baselinedata = BaselineData(train_data, tokenizer, config)
# valid_baselinedata = BaselineData(valid_data, tokenizer, config)
# test_baselinedata = BaselineData(test_data, tokenizer, config)


In [34]:
# Scratch cell: rebuild the first training sample by hand (same join as
# BaselineData.__getitem__) and look at the raw tokenizer output.
sentence = "[SEP]".join([train_data[0]["hashtag"]] + train_data[0]["comments"])
len(sentence)  # not displayed: a cell only echoes its last expression
ids = tokenizer.encode_plus(sentence)
ids  # last expression, so this is what the cell shows
# Disabled experiments: manual truncation/padding of ids and mask to 500.
# len(ids["input_ids"])
# ids["input_ids"][:500]
# ids["input_ids"] + [0] * (500 - len(ids["input_ids"]))
# len(ids["attention_mask"])
# ids["attention_mask"][:500]
# ids["attention_mask"] + [0] * (500 - len(ids["attention_mask"]))
# train_data[0].get("attitudes")

# torch.tensor(ids["input_ids"])

1969

In [30]:
# Display the encoded tensors produced for the first training record.
train_baselinedata[0]
 

(tensor([ 101, 5908, 5812, 7313,  166, 8118, 8220, 8178,  102, 3342, 2780,  800,
         1922, 1377, 4263,  749, 1449, 1449, 1449, 8024, 3633, 1762, 4692, 8013,
         2769, 6158, 5846, 7835,  749, 8013,  102,  872, 3297, 2358, 8013, 8013,
         8013, 5710, 2552, 5288, 4125, 4306, 8013, 8013, 8013, 2218, 3221,  872,
         8013, 8013, 8013, 8013,  102, 1506, 1506, 1506, 1506, 1506, 8024, 3145,
         1377, 4263, 4638, 3342, 2140, 2140,  102, 3342, 2140, 2140, 1922, 1377,
         4263, 2685,  102, 1745, 4275, 6397, 6389,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  

In [None]:
# Training batches are reshuffled every epoch so the model does not see the
# records in the fixed order of the file; evaluation loaders keep file order.
train_dataloader = DataLoader(
    BaselineData(train_data, tokenizer, config),
    batch_size=config.batch_size,
    shuffle=True,
)
valid_dataloader = DataLoader(
    BaselineData(valid_data, tokenizer, config), batch_size=config.batch_size
)
# The test split is served one record per batch.
test_dataloader = DataLoader(BaselineData(test_data, tokenizer, config), batch_size=1)


In [None]:
import re
import torch

# Demo: remove an "@name:" mention from a comment string.
text = "回复@魏小河才不是乖乖的小盆宇:完全控制以前都不能放松"
# Raw string so "\S" reaches the regex engine unchanged instead of being an
# invalid Python string escape. One substitution pass is enough: re.sub
# already replaces every match, so the repeated identical pass did nothing.
at_pattern = re.compile(r"@\S*:", re.S)
text = at_pattern.sub("", text)
print(text.strip())
print(torch.cuda.is_available())

In [None]:
import json
from transformers import BertConfig, BertModel, BertTokenizer

def read_json(path):
    """Load a JSON Lines file into a list of decoded objects.

    Every non-blank line of the file is parsed on its own with
    ``json.loads``. Blank or whitespace-only lines (for example a trailing
    empty line at the end of the file) are skipped instead of raising
    ``json.JSONDecodeError``.

    Args:
        path: Path of the UTF-8 encoded file to read.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            if not line.strip():
                # json.loads rejects empty input; ignore spacer lines.
                continue
            records.append(json.loads(line))
    return records


# Load the training split from the working directory.
train_data = read_json("train.json")
# NOTE(review): echoing the whole list prints every record; a slice such as
# train_data[:3] would keep the cell output readable.
train_data


In [None]:
# Inspect one raw record (index 1382) before any encoding.
train_data[1382]

In [None]:
# Join hashtag and comments of record 1382 with literal "[SEP]" markers and
# measure the resulting text length in characters.
sentence = "[SEP]".join([train_data[1382]["hashtag"]] + train_data[1382]["comments"])
sentence  # not displayed: a cell only echoes its last expression
len(sentence)

In [None]:
class Config:
    """Hyper-parameters and label vocabulary shared by the notebook.

    Attributes:
        pad_size: Fixed length every token sequence is cut or padded to.
        batch_size: Batch size setting (presumably for the DataLoaders).
        epochs: Epoch count (presumably consumed by the training loop).
        PTM: Name of the pretrained checkpoint passed to ``from_pretrained``.
        label_num: Number of labels, derived from the label list below.
        device: ``"cuda:0"`` when CUDA is available, otherwise ``"cpu"``.
        lr: Learning rate (presumably consumed by the optimizer).
        id2label: Mapping from label index to label name.
        label2id: Mapping from label name to label index.
    """

    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        self.epochs = 1
        self.PTM = "Yuang/unilm-base-chinese-news-sum"
        # Fall back to CPU so the notebook still runs on machines without a GPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.lr = 5e-5

        label_dic = [
            "[微笑]",
            "[嘻嘻]",
            "[笑cry]",
            "[怒]",
            "[泪]",
            "[允悲]",
            "[憧憬]",
            "[doge]",
            "[并不简单]",
            "[思考]",
            "[费解]",
            "[吃惊]",
            "[拜拜]",
            "[吃瓜]",
            "[赞]",
            "[心]",
            "[伤心]",
            "[蜡烛]",
            "[给力]",
            "[威武]",
            "[跪了]",
            "[中国赞]",
            "[给你小心心]",
            "[酸]",
        ]

        # Derived from the list so the count can never drift from the labels.
        self.label_num = len(label_dic)

        # Index <-> name lookups for the labels.
        self.id2label = dict(enumerate(label_dic))
        self.label2id = {name: index for index, name in enumerate(label_dic)}


config = Config()


In [None]:
# Load the tokenizer for the checkpoint named in config.PTM.
# NOTE(review): the checkpoint is loaded through BertTokenizer; presumably
# it ships a BERT-compatible vocabulary. Confirm before relying on it.
tokenizer = BertTokenizer.from_pretrained(config.PTM)

In [None]:
from torch.utils.data import Dataset, DataLoader


class BaselineData(Dataset):
    """Map-style dataset that turns raw records into fixed-length model inputs.

    Each record is a dict providing a ``"hashtag"`` string and a
    ``"comments"`` list of strings. A record may also carry an
    ``"attitudes"`` list whose entries are keys of ``config.label2id``.
    """

    def __init__(self, data, tokenizer, config):
        """Store the records and the settings needed to encode them.

        Args:
            data: Indexable collection of record dicts.
            tokenizer: Object exposing ``encode_plus`` that returns a mapping
                with ``"input_ids"`` and ``"attention_mask"`` lists.
            config: Object exposing ``pad_size`` and ``label2id``.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns:
            tuple: ``(input_ids, attention_mask, label)`` tensors when the
            record has a non-empty ``"attitudes"`` list, otherwise
            ``(input_ids, attention_mask)``.
        """
        record = self.data[idx]
        # Hashtag and comments become one text, separated by literal "[SEP]".
        sentence = "[SEP]".join([record["hashtag"]] + record["comments"])

        input_ids, attention_mask = self.__convert_to_id__(sentence)
        encoded = [torch.tensor(input_ids), torch.tensor(attention_mask)]

        # NOTE(review): a record whose "attitudes" is an empty list is treated
        # as unlabeled; mixing such records with labeled ones in one batch
        # would yield tuples of different lengths. Confirm this is intended.
        attitudes = record.get("attitudes")
        if attitudes:
            encoded.append(torch.tensor(self.__convert_label__(attitudes)))
        return tuple(encoded)

    def __convert_to_id__(self, sentence):
        """Tokenize ``sentence`` into ``pad_size``-long ids and attention mask.

        Truncation is delegated to the tokenizer so that an over-long text
        keeps its closing special token instead of being cut off mid-sequence.
        """
        ids = self.tokenizer.encode_plus(
            sentence, truncation=True, max_length=self.pad_size
        )
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        """Return a multi-hot list with a 1 at the index of every given label.

        The vector width follows ``label2id`` rather than a fixed constant, so
        the dataset keeps working if the label set changes.

        Raises:
            KeyError: If a label name is missing from ``label2id``.
        """
        onehot_label = [0] * len(self.label2id)
        for name in label:
            onehot_label[self.label2id[name]] = 1
        return onehot_label

    def __padding__(self, sentence):
        """Cut or right-pad the list ``sentence`` to exactly ``pad_size`` items."""
        sentence = sentence[: self.pad_size]  # too long: truncate
        # Too short: fill with zeros (presumably the tokenizer's [PAD] id;
        # confirm for non-BERT vocabularies).
        sentence = sentence + [0] * (self.pad_size - len(sentence))
        return sentence


In [None]:
# Scratch check: list repetition yields nine zeros, the same idiom
# BaselineData.__padding__ uses to build its padding.
[0] * 9

In [None]:


# Smoke test: constructing the dataset only stores references to its
# arguments; nothing is tokenized until an item is indexed.
BaselineData(train_data, tokenizer, config)