In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, BertTokenizer
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import f1_score
# from google.colab import drive

# drive.mount("/content/drive")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_json(path):
    data = []
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            data.append(json.loads(line))
    return data


train_data = read_json("data_emotion/train.json")
valid_data = read_json("data_emotion/valid.json")
test_data = read_json("data_emotion/test_1.json")


In [3]:
class Config:
    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        self.epochs = 10
        self.PTM = "bert-base-chinese"
        self.label_num = 24
        self.device = "cuda:0"
        self.lr = 5e-5

        label_dic = [
            "[微笑]",
            "[嘻嘻]",
            "[笑cry]",
            "[怒]",
            "[泪]",
            "[允悲]",
            "[憧憬]",
            "[doge]",
            "[并不简单]",
            "[思考]",
            "[费解]",
            "[吃惊]",
            "[拜拜]",
            "[吃瓜]",
            "[赞]",
            "[心]",
            "[伤心]",
            "[蜡烛]",
            "[给力]",
            "[威武]",
            "[跪了]",
            "[中国赞]",
            "[给你小心心]",
            "[酸]",
        ]

        self.id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
        self.label2id = {v: k for k, v in enumerate(label_dic)}


config = Config()


In [4]:
tokenizer = BertTokenizer.from_pretrained(config.PTM)


In [5]:
class BaselineData(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = "[SEP]".join(
            [self.data[idx]["hashtag"]] + self.data[idx]["comments"]
        )

        input_ids, attention_mask = self.__convert_to_id__(sentence)

        if self.data[idx].get("attitudes"):
            label = self.__convert_label__(self.data[idx]["attitudes"])
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
                torch.tensor(label),
            )
        else:
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
            )

    def __convert_to_id__(self, sentence):
        ids = self.tokenizer.encode_plus(sentence)
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        onehot_label = [0] * 24
        for i in label:
            onehot_label[self.label2id[i]] = 1
        return onehot_label

    def __padding__(self, sentence):
        sentence = sentence[: self.pad_size]  # 长就截断
        sentence = sentence + [0] * (self.pad_size - len(sentence))  # 短就补充
        return sentence


In [17]:
sentence = "".join([train_data[0]["hashtag"]] + train_data[0]["comments"])
len(sentence)
# ids = tokenizer.encode_plus(sentence)
# len(ids['input_ids'])

2391

In [6]:
train_baselinedata = BaselineData(train_data, tokenizer, config)
# valid_baselinedata = BaselineData(valid_data, tokenizer, config)
# test_baselinedata = BaselineData(test_data, tokenizer, config)


In [8]:
train_baselinedata[0]


Token indices sequence length is longer than the specified maximum sequence length for this model (1969 > 512). Running this sequence through the model will result in indexing errors


(tensor([  101,  7716,   756,  6448,   704,  1744,   711,   862,  6677,   679,
          1962,  6639,  4413,   102,  1726,  2157,  2372,  2372,  2101,  2094,
           679,  1962,   720,   102,  7716,   756,  2768,  1216,   749,  8024,
          6432,   784,   720,  6963,  3221,  3633,  4802,  4638,   119,   119,
           119,   119,   119,   119,   119,   119,   102,  4802,  2141,  8024,
          5074,  4413,  3766,  5381,   738,   679,  6121,  8024,  2961,  4413,
          3300,  5381,  2218,  1326,  2154,   102,  2769,   812,   794,  2207,
          4638,  3136,  5509,  7027,  1174,  2692,  6912,  1048,  4684,  2970,
          2190,  2834,  2595,  1103,  4960,  8024,  1072,  1906,  6821,  4905,
          4684,  2970,  2190,  2834,  1103,  4960,  5543,  1213,  4638,  2111,
          2094,  8024,  2523,  3193,  7348,  3667,  2218,  6158,  6848,  3332,
          3322,  1169,  5314,  6814,  4009,  2957,   749,  8024,   800,   812,
          3221,  5439,  2360,  1469,  5143,  5320,  

In [None]:
train_dataloader = DataLoader(
    BaselineData(train_data, tokenizer, config), batch_size=config.batch_size
)
valid_dataloader = DataLoader(
    BaselineData(valid_data, tokenizer, config), batch_size=config.batch_size
)
test_dataloader = DataLoader(BaselineData(test_data, tokenizer, config), batch_size=1)


In [None]:
import re
import torch

text = "回复@魏小河才不是乖乖的小盆宇:完全控制以前都不能放松"
at_pattern = re.compile("@\S*:", re.S)
text = re.sub(at_pattern, "", text)
at_pattern = re.compile("@\S*:", re.S)
text = re.sub(at_pattern, "", text)
print(text.strip())
print(torch.cuda.is_available())

In [1]:
import json
from transformers import BertConfig, BertModel, BertTokenizer

def read_json(path):
    data = []
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            data.append(json.loads(line))
    return data


train_data = read_json("train.json")
train_data


  from .autonotebook import tqdm as notebook_tqdm


[{'hashtag': '蓬莱间xswl',
  'attitudes': ['[憧憬]', '[笑cry]', '[费解]'],
  'comments': ['杨戬他太可爱了呜呜呜，正在看！我被萌鲨了！',
   ' 你最帅！！！芳心纵火犯！！！就是你！！！！',
   ' 哈哈哈哈哈，敲可爱的杨宝宝',
   '杨宝宝太可爱惹',
   '图片评论']},
 {'hashtag': '俄罗斯驻华大使中文悼念逝者',
  'attitudes': ['[心]', '[泪]', '[中国赞]'],
  'comments': ['谢谢俄罗斯兄弟们', '谢谢俄罗斯，致敬英雄，缅怀逝者', '致敬 缅怀  默哀', '致敬', '病毒必败，生命必胜']},
 {'hashtag': '林更新方否认新恋情',
  'attitudes': ['[doge]', '[吃瓜]', '[笑cry]'],
  'comments': ['我怀疑他在帮思聪管理女友',
   '林狗都说了那肯定不是真的',
   '林更新说是假的那肯定是假的了，因为一般他都懒得回应，这个肯定是太离谱了',
   '还停留在王丽坤的时代...',
   '这真是造谣一张嘴啊！本来就是普通朋友关系，照了很多照片 评论配图']},
 {'hashtag': '英国首相约翰逊身体状况好转',
  'attitudes': ['[赞]', '[吃瓜]', '[doge]'],
  'comments': ['垂死病中惊坐起，笑问客从何处来',
   '约翰逊:你们通告是不是都准备好了，不好意思哈',
   '约翰逊：你们准备好的讣告可以删了了',
   '病好了能不能勤洗头。。',
   '特没谱:怎么样金毛，吃了我给你的连花清瘟是不是好多了']},
 {'hashtag': '货车爆胎1500斤小龙虾洒一地',
  'attitudes': ['[doge]', '[泪]', '[允悲]'],
  'comments': ['着急的我嘴角流下泪水',
   '还好没有村民看到 不然又要抢虾了',
   '损失了1000斤虾 才损失了4000块钱 所以爆出来行业内幕价格？',
   '小龙虾集体逃跑事件',
   '这个吓就是废虾的下场']},
 {'hashtag': '奥运会推迟将致日本损失超3.2

In [6]:
train_data[1382]

{'hashtag': '易烊千玺粉丝 王俊凯粉丝',
 'attitudes': ['[微笑]', '[怒]', '[费解]'],
 'comments': ['追星圈层里不乏年纪小的小朋友，熟悉我的人都知道我不吵架不骂人，易烊千玺饭圈氛围就是专注自家不闹事，大粉每天都在引导散粉追星先做人做人留底线勿失初心。但你们看看，王俊凯大粉都是怎么引导的，饭圈文化已经这样，以后我放心我孩子追星吗？我放心她呆在这样一个价值观畸形的圈层里吗？',
  '如果真的有路人，希望你们能看看。恶者抹黑公众人物后拿公益当自保工具，此行为算什么？大家看看黑号ID，猜猜王俊凯粉丝拿黑号对四字弟弟做过什么，相信各位路人有自己的是非判断。圈层虽然混乱，但我信公道仍在人心。',
  '王俊凯粉丝，你们大可以找一万种方式开脱洗白，联动营销号发公益也好，颠倒黑白倒打一耙也好，有事就多担开除粉籍也好，大家不傻，知道这么多黑号抱团不是一两天就能养成的。证据就是证据，你再怎么洗它就在那里。开黑号的是你们，联动营销号的是你们，拿公益洗白的是你们，教小孩子开黑号的也是你们。',
  '路人可能不知道，那个黑号被扒后我们第一时间私下沟通劝她注销，四字粉人美心善做事向来给别人留路，请问全饭圈还有哪家粉丝能做到我们这样？可结果呢？开黑号本身就不对，给你留时间让你处理结果你转交，我已经够善良了，你们还想四字粉怎样善良呢。被开黑号的是四字，请问你们还想四字粉怎么善良。',
  '王俊凯粉丝别想拿公益洗白了，公益被拿来这么用不觉得恶心吗。公益大家都在做，四字弟弟家修桥铺路建医院建小学，但和你们王俊凯粉丝的区别就是他们不会拿公益当遮羞布。大家心存善意做公益，是想让世界变得美好，不是为了所有人都像王俊凯粉丝一样开黑号不讲道理随意抹黑人的。']}

In [9]:
sentence = "[SEP]".join([train_data[1382]["hashtag"]] + train_data[1382]["comments"])
sentence
len(sentence)

682

In [None]:
class Config:
    def __init__(self):
        self.pad_size = 500
        self.batch_size = 24
        self.epochs = 1
        self.PTM = "Yuang/unilm-base-chinese-news-sum"
        self.label_num = 24
        self.device = "cuda:0"
        self.lr = 5e-5

        label_dic = [
            "[微笑]",
            "[嘻嘻]",
            "[笑cry]",
            "[怒]",
            "[泪]",
            "[允悲]",
            "[憧憬]",
            "[doge]",
            "[并不简单]",
            "[思考]",
            "[费解]",
            "[吃惊]",
            "[拜拜]",
            "[吃瓜]",
            "[赞]",
            "[心]",
            "[伤心]",
            "[蜡烛]",
            "[给力]",
            "[威武]",
            "[跪了]",
            "[中国赞]",
            "[给你小心心]",
            "[酸]",
        ]

        self.id2label = {k: v for k, v in enumerate(label_dic)}  # 用于标签的部分
        self.label2id = {v: k for k, v in enumerate(label_dic)}


config = Config()


In [None]:
tokenizer = BertTokenizer.from_pretrained(config.PTM)

In [None]:
from torch.utils.data import Dataset, DataLoader


class BaselineData(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.pad_size = config.pad_size
        self.label2id = config.label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = "[SEP]".join(
            [self.data[idx]["hashtag"]] + self.data[idx]["comments"]
        )

        input_ids, attention_mask = self.__convert_to_id__(sentence)

        if self.data[idx].get("attitudes"):
            label = self.__convert_label__(self.data[idx]["attitudes"])
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
                torch.tensor(label),
            )
        else:
            return (
                torch.tensor(input_ids),
                torch.tensor(attention_mask),
            )

    def __convert_to_id__(self, sentence):
        ids = self.tokenizer.encode_plus(sentence)
        input_ids = self.__padding__(ids["input_ids"])
        attention_mask = self.__padding__(ids["attention_mask"])

        return input_ids, attention_mask

    def __convert_label__(self, label):
        onehot_label = [0] * 24
        for i in label:
            onehot_label[self.label2id[i]] = 1
        return onehot_label

    def __padding__(self, sentence):
        sentence = sentence[: self.pad_size]  # 长就截断
        sentence = sentence + [0] * (self.pad_size - len(sentence))  # 短就补充
        return sentence


In [1]:
[0] * 9

[0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:


BaselineData(train_data, tokenizer, config)