In [1]:
import csv
import torch
import random
from tqdm import tqdm
from transformers import BertModel, BertTokenizer, logging

In [2]:
# Maps each attitude label string (the value found in the dataset's label
# column) to the integer class index used as the classification target.
ATT_TABLE = {
    '高兴': 0, '搞笑': 1, '期待': 2, '肯定': 3, '感动': 4, '悲伤': 5, 
    '愤怒': 6, '厌恶': 7, '担心': 8, '无聊': 9, '警惕': 10, '惊讶': 11, '无所谓': 12,
}

# Prefer CUDA when it is available ...
device = 'cuda' if(torch.cuda.is_available()) else 'cpu'
# ... but NOTE(review): this assignment unconditionally overrides the line
# above, so every tensor created via `device` ends up on the CPU. Looks like a
# leftover debugging override - confirm before removing it.
device = 'cpu'

### 加载模型

In [3]:
logging.set_verbosity_error()   # silence the "unused weights" warning raised while loading the checkpoint

# Load the encoder and its tokenizer from the same checkpoint name so that the
# vocabulary matches the weights. Both objects are module-level globals used by
# the cells below.
# NOTE(review): presumably the warning silenced above comes from the
# checkpoint's own sentiment head, which a bare BertModel does not use - confirm.
pretrain=BertModel.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment')
tokenizer=BertTokenizer.from_pretrained('IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment')

In [4]:
# Smoke test: encode one short sample and push it through the bare encoder.
# The cell's displayed value is the raw model output. No torch.no_grad() is
# used here, which is why the tensors shown in the output carry a grad_fn.
inputs = tokenizer.encode('嘿嘿', return_tensors='pt')
pretrain(inputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.9940, -0.9275, -0.7588,  ..., -0.0946, -1.5457, -1.3777],
         [ 0.2687, -0.8461, -1.0813,  ...,  0.4385, -2.4636, -1.1470],
         [ 0.2276, -0.7463, -0.6477,  ...,  0.5365, -1.5191, -1.0243],
         [ 0.9938, -0.9280, -0.7585,  ..., -0.0955, -1.5450, -1.3811]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9016,  0.3281, -0.1157,  ..., -0.3664, -0.2742,  0.3941]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

### 数据集

In [5]:
def read_from_csv(filename, encoding='utf-8'):
    """Read a CSV file and return all of its rows.

    Args:
        filename: Path of the CSV file to read.
        encoding: Text encoding of the file. Defaults to UTF-8 so that the
            result does not depend on the platform's locale encoding.

    Returns:
        A list of rows in file order; each row is a list of strings. The
        header row, if the file has one, is included as the first element.
    """
    # newline='' hands line-ending handling to the csv module, which keeps
    # line breaks embedded in quoted fields exactly as written in the file.
    with open(filename, 'r', encoding=encoding, newline='') as csvfile:
        return list(csv.reader(csvfile))

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Train/test view over ``[text, label]`` samples taken from CSV rows.

    The samples are shuffled once at construction time and split into a
    training part and a held-out part. The ``train`` attribute selects which
    part ``__len__`` and ``__getitem__`` expose; it starts as ``True``.
    """

    def __init__(self, data, seed=0, train_ratio=0.9):
        """Build the split.

        Args:
            data: Rows in the format
                [['id', 'text', 'label', 'Comments'],
                 [xxx, xxx, xxx, xxx],
                 [xxx, xxx, xxx, xxx],
                 ...]
                The first row is treated as a header and skipped.
            seed: Seed for the shuffle. A fixed default keeps the split
                identical across kernel restarts, so a model reloaded from
                disk is not evaluated on samples it was trained on. Pass
                ``None`` for a different shuffle on every construction.
            train_ratio: Fraction of the samples assigned to the training part.
        """
        super().__init__()

        # Keep only (text, label) for rows that carry a non-blank label. Rows
        # that are too short to have a label column (e.g. blank CSV lines,
        # which parse as []) are skipped instead of raising IndexError.
        samples = [
            [row[1], row[2]]
            for row in data[1:]
            if len(row) > 2 and row[2].strip() != ''
        ]

        # A private generator makes the split reproducible without touching
        # the state of the global `random` module.
        random.Random(seed).shuffle(samples)

        train_len = int(train_ratio * len(samples))
        self.train_data = samples[:train_len]
        self.test_data = samples[train_len:]

        self.train = True

    def _active_split(self):
        """Return the list currently selected by ``self.train``."""
        return self.train_data if self.train else self.test_data

    def __len__(self):
        return len(self._active_split())

    def __getitem__(self, idx):
        return self._active_split()[idx]

# Read every row of the labelled CSV (header row included) and wrap the rows
# in the train/test Dataset; `dataset` is shared by the loaders built below.
data = read_from_csv('./dataset/attitude_classify/all.csv')
dataset = Dataset(data)

In [7]:
def collate_fn(data, max_length=512):
    """Turn a list of ``[text, label]`` samples into a batch of model inputs.

    Each text is split on the literal marker '###' and the resulting pieces
    are handed to the tokenizer as one entry of a batch of text pairs.

    Args:
        data: List of ``[text, label]`` samples as produced by the dataset.
            Every label must be a key of ``ATT_TABLE`` (surrounding
            whitespace is ignored).
            NOTE(review): assumes each text contains exactly one '###' so the
            split yields a two-element pair - confirm against the dataset.
        max_length: Upper bound on the tokenized length; longer inputs are
            truncated so a single over-long sample cannot overflow the encoder.
            NOTE(review): 512 is assumed to be the encoder's maximum sequence
            length - confirm against the checkpoint config.

    Returns:
        A dict with 'input_ids', 'attention_mask', 'token_type_ids' and
        'labels' tensors, all moved to the module-level ``device``.
    """
    text_pairs = [sample[0].split('###') for sample in data]
    # The dataset keeps a row when its *stripped* label is non-empty, so a
    # label may still carry padding whitespace; strip it before the lookup.
    labels = [ATT_TABLE[sample[1].strip()] for sample in data]

    encoded = tokenizer.batch_encode_plus(
        batch_text_or_text_pairs=text_pairs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        return_token_type_ids=True,
    )

    return {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
        'token_type_ids': encoded['token_type_ids'].to(device),
        'labels': torch.tensor(labels, dtype=torch.long).to(device),
    }

# Loader used by train(): batches of 4 samples assembled by collate_fn, with a
# final incomplete batch discarded (drop_last=True). No shuffle argument is
# passed. Which split is served depends on the value of `dataset.train` at
# iteration time, because the dataset switches between its two lists on it.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=4,
                                     collate_fn=collate_fn,
                                     drop_last=True)

### 模型

In [8]:
class RobertaModel(torch.nn.Module):
    """Encoder + dropout + linear head for attitude classification.

    The attribute names (``pretrain``, ``dropout``, ``classifier``,
    ``criterion``) are kept stable so previously saved models still load.
    """

    def __init__(self, encoder=None, num_labels=13, dropout=0.1):
        """Build the classifier.

        Args:
            encoder: Module returning a mapping with a 'pooler_output' entry
                and exposing ``config.hidden_size``. Defaults to the
                module-level ``pretrain`` encoder, as before.
            num_labels: Number of output classes (13 attitude labels by default).
            dropout: Dropout probability applied to the pooled output.
        """
        super().__init__()
        self.pretrain = encoder if encoder is not None else pretrain
        # Size the head from the encoder itself instead of a hard-coded width,
        # so swapping the checkpoint cannot silently mismatch the Linear layer.
        hidden_size = self.pretrain.config.hidden_size
        self.dropout = torch.nn.Dropout(dropout, inplace=False)
        self.classifier = torch.nn.Linear(hidden_size, num_labels)
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels=None, token_type_ids=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional class-index tensor; when given, the
                cross-entropy loss against the logits is computed.
            token_type_ids: Optional segment ids passed to the encoder.

        Returns:
            A dict with 'cls' (the logits produced by the linear head) and
            'loss' (the cross-entropy loss, or None when ``labels`` is None).
        """
        pooled = self.pretrain(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )['pooler_output']
        logits = self.classifier(self.dropout(pooled))

        loss = self.criterion(logits, labels) if labels is not None else None
        return {'loss': loss, 'cls': logits}

# Instantiate the classifier; it wraps the module-level `pretrain` encoder
# loaded earlier, so fine-tuning updates that same encoder object.
model = RobertaModel()

### 训练

In [None]:
def train(model, epoches, lr):
    """Fine-tune ``model`` on the training split served by the global ``loader``.

    Args:
        model: Module whose call returns a dict containing a 'loss' entry
            when given a batch produced by the loader.
        epoches: Number of passes over the loader.
        lr: Learning rate for the AdamW optimizer.

    Returns:
        A tensor of shape (epoches, len(loader)) holding the loss of every
        batch. As a side effect the model is left in eval mode on the CPU.
    """
    # The dataset behind the loader switches between its train and held-out
    # lists on a mutable flag that other cells set to False. Force the training
    # split here so re-running this cell never trains on the test samples.
    loader.dataset.train = True

    lens = len(loader)
    model = model.train().to(device)
    optim = torch.optim.AdamW(model.parameters(), lr=lr)

    losses = torch.zeros((epoches, lens))
    for epoch in range(epoches):
        with tqdm(total=lens, ncols=80) as bar:
            bar.set_description('训练进度-epoch: {}/{}'.format(epoch + 1, epoches))
            for step, batch in enumerate(loader):
                # Clear gradients *before* the backward pass so that gradients
                # left over from an interrupted earlier run cannot leak into
                # the first update.
                optim.zero_grad()
                loss = model(**batch)['loss']
                loss.backward()
                optim.step()

                losses[epoch, step] = loss.item()
                bar.update(1)

            # Show the mean loss of the finished epoch next to the bar.
            bar.set_postfix(loss='{:.4f}'.format(losses[epoch].mean().item()))

    model.eval().cpu()
    return losses

# Fine-tune for 5 epochs with learning rate 2e-5 and keep the per-batch losses.
losses = train(model, 5, 2e-5)
# Persist the whole module object (not only its state_dict).
# NOTE(review): assumes the ./models directory already exists - confirm.
torch.save(model, './models/attitude_classify.model')

### 使用

In [9]:
def classify(model, text):
    """Predict the attitude label of a single text.

    Args:
        model: Classifier whose call accepts ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and returns a dict with
            a 'cls' entry holding the logits.
        text: Input string. If it contains the '###' marker it is split at the
            first occurrence and encoded as a text pair, mirroring how the
            training batches are built; otherwise it is encoded as one segment.

    Returns:
        The key of ``ATT_TABLE`` whose class index has the highest logit.
    """
    model = model.eval()
    # Run on whatever device the model currently lives on.
    target_device = next(model.parameters()).device

    first, marker, second = text.partition('###')
    encoded = tokenizer(
        first,
        second if marker else None,
        add_special_tokens=True,
        truncation=True,          # keep over-long input within the encoder limit
        max_length=512,           # NOTE(review): assumed encoder maximum - confirm
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out = model(
            input_ids=encoded['input_ids'].to(target_device),
            attention_mask=encoded['attention_mask'].to(target_device),
            token_type_ids=encoded['token_type_ids'].to(target_device),
        )['cls']
    res = out.argmax(dim=1).item()

    # Look the label up by its index value rather than by dict position, so
    # the result stays correct even if ATT_TABLE is reordered.
    index_to_label = {index: label for label, index in ATT_TABLE.items()}
    return index_to_label[res]

In [10]:
# Load the fine-tuned model written by the training cell.
# NOTE(review): the file holds a fully pickled module object, so loading it
# executes pickle code - only load files you trust. Recent PyTorch releases
# reportedly default torch.load to weights_only=True, which would reject such
# a file unless weights_only=False is passed - confirm for the installed version.
model = torch.load('./models/attitude_classify.model')

In [11]:
# Switch the shared dataset to its held-out split. This flag is global state:
# every loader built on `dataset` serves the test samples from here on.
dataset.train = False
# Try the classifier on a short sample; the cell displays the predicted label.
classify(model, '呵呵呵呵')

'肯定'

### 测试

In [16]:
def precision(model, dataset):
    """Measure the fraction of held-out samples that ``model`` labels correctly.

    Despite its name, the value computed is plain accuracy
    (correct predictions / evaluated samples).

    Args:
        model: Classifier whose call returns a dict with a 'cls' logits entry
            when given a batch built by ``collate_fn``.
        dataset: Dataset with a boolean ``train`` attribute selecting the
            split. It is switched to the held-out split for the duration of
            the call and restored to its previous value afterwards.

    Returns:
        A 0-dim float tensor holding the accuracy.

    Raises:
        ZeroDivisionError: If the held-out split contains no samples.
    """
    model = model.eval()
    previous_mode = dataset.train
    dataset.train = False
    total = correct = 0
    try:
        # drop_last=False: every held-out sample must count towards the score,
        # including those in a final incomplete batch.
        loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=4, collate_fn=collate_fn, drop_last=False)

        # Evaluation only: skip building the autograd graph.
        with torch.no_grad():
            for inputs in tqdm(loader):
                out = model(**inputs)['cls']
                res = out.argmax(dim=1)
                # Reference labels for this batch
                std = inputs['labels']
                # Count the correct predictions
                correct += (std == res).sum().item()
                total += len(std)
    finally:
        # Do not leave the shared dataset flipped to the test split.
        dataset.train = previous_mode

    return torch.tensor(correct / total)

# Score the reloaded model on the held-out split; the cell displays the result.
precision(model, dataset)

 11%|█▏        | 5/44 [00:04<00:30,  1.26it/s]