# 导包

In [None]:
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import torch.utils.data as Data

from sklearn.model_selection import StratifiedKFold

import numpy as np
import pandas as pd

!pip install transformers
import transformers

from google.colab import drive
from tqdm import tqdm
import csv

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 定义超参数

In [None]:
# Global hyper-parameters and run configuration shared by every cell below.
CFG = {
    'fold_num': 5,  # number of stratified cross-validation folds
    'seed': 2021,  # seed passed to the RNGs and to the fold splitter
    'model': 'hfl/chinese-bert-wwm-ext',  # pretrained checkpoint to fine-tune
    'max_len': 100,  # every text is truncated / padded to this many tokens
    # Learning rate. The previous value (1e-3) is far above the range normally
    # used to fine-tune BERT; the recorded run produced the exact same accuracy
    # on every fold, which suggests the network collapsed to a single class.
    'lr': 2e-5,
    'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    'n_class': 3,  # size of the classification head
    'batch_size': 32,
    'epochs': 1,  # training epochs per fold
}

# 固定种子

In [None]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Imported locally so the function does not depend on the import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['seed']) # fix the random seed using the value configured in CFG

# 读取数据

In [None]:
# Read the training and test CSV files from the mounted Google Drive folder.
train_df=pd.read_csv('/content/drive/MyDrive/data/Emotion_Recognition/nCoV_100k_train.labled.csv')
test_df=pd.read_csv('/content/drive/MyDrive/data/Emotion_Recognition/nCov_10k_test.csv')

# 数据预处理

## 输入转换

In [None]:
# Load the BERT tokenizer that belongs to the pretrained checkpoint named in CFG['model'].
tokenizer=transformers.BertTokenizer.from_pretrained(CFG['model'])

In [None]:
def convert_to_transformers_inputs(instance, tokenizer, max_len):
    """Encode one text into fixed-length BERT inputs.

    Args:
        instance: The text to encode.
        tokenizer: Tokenizer exposing ``encode_plus`` and ``pad_token_id``.
        max_len: Length every returned list is truncated / padded to.

    Returns:
        ``[input_ids, input_masks, input_segments]``, three lists of length
        ``max_len``. Padding positions hold ``tokenizer.pad_token_id`` in the
        ids and ``0`` in the mask and segment lists.
    """
    # `truncation=True` replaces the deprecated `truncation_strategy` keyword
    # (the tokenizer warned that truncation had not been explicitly enabled).
    inputs = tokenizer.encode_plus(
        instance,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )

    input_ids = inputs['input_ids']
    input_masks = inputs['attention_mask']
    input_segments = inputs['token_type_ids']

    # Right-pad everything up to max_len.
    padding_len = max_len - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_len)
    input_masks = input_masks + ([0] * padding_len)
    input_segments = input_segments + ([0] * padding_len)
    return [input_ids, input_masks, input_segments]

In [None]:
def compute_input_arrays(df, col, tokenizer, max_len):
    """Encode every entry of ``df[col]`` into BERT input arrays.

    Each entry is converted to ``str`` and passed through
    ``convert_to_transformers_inputs``.

    Returns:
        A list of three ``int32`` NumPy arrays, in this order: input ids,
        attention masks, segment ids. Each array has one row per entry.
    """
    encoded_rows = [
        convert_to_transformers_inputs(str(text), tokenizer, max_len)
        for text in tqdm(df[col])
    ]

    # Regroup the per-row triples into one array per input kind.
    return [
        np.asarray([row[kind] for row in encoded_rows], dtype=np.int32)
        for kind in range(3)
    ]


In [None]:
# Name of the column holding the text to classify.
input_categories = '微博中文内容'
# Name of the column holding the label.
output_categories = '情感倾向'
# Encode the text column of both data frames into [ids, masks, segments] arrays
# truncated / padded to CFG['max_len'] tokens.
inputs = compute_input_arrays(train_df, input_categories, tokenizer, CFG['max_len'])
test_inputs = compute_input_arrays(test_df, input_categories, tokenizer, CFG['max_len'])

  0%|          | 0/99913 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 99913/99913 [01:20<00:00, 1247.50it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1224.27it/s]


## 输出转换

In [None]:
def compute_output_arrays(df, col):
    """Return the integer labels of ``df[col]`` shifted up by one.

    The column is cast to ``int`` and incremented by 1 so that class indices
    start at 0 (presumably the raw labels start at -1 -- verify against the
    data).

    Returns:
        A NumPy array with one shifted label per row.
    """
    shifted_labels = df[col].astype(int).add(1)
    return np.asarray(shifted_labels)

# Training labels as a NumPy array, shifted by +1 by compute_output_arrays.
outputs = compute_output_arrays(train_df,output_categories)

# BERT 模型


In [None]:
class TextNet(nn.Module):
    """Pretrained BERT encoder topped with one linear classification layer.

    The encoder checkpoint is taken from ``CFG['model']`` and the linear layer
    maps the encoder's hidden size to ``CFG['n_class']`` outputs.
    """

    def __init__(self):
        super().__init__()

        # The config asks the encoder to also return all hidden states.
        config = transformers.BertConfig.from_pretrained(CFG['model'], output_hidden_states=True)
        self.bert_model = transformers.BertModel.from_pretrained(CFG['model'], config=config)

        hidden_size = self.bert_model.config.hidden_size
        self.fc = nn.Linear(hidden_size, CFG['n_class'])

    def forward(self, ids, masks, segments):
        """Return the classification-layer output for a batch of encoded texts.

        Args:
            ids: Token ids.
            masks: Attention masks.
            segments: Token type (segment) ids.
        """
        encoded = self.bert_model(ids, attention_mask=masks, token_type_ids=segments)
        # encoded[0] is (batch size, sequence length, model hidden dimension);
        # keep only the vector at the first position of every sequence.
        sequence_output = encoded[0]
        first_token_embeddings = sequence_output[:, 0, :]

        return self.fc(first_token_embeddings)

# 训练

In [None]:
# Stratified K-fold cross-validation: for every fold, train a fresh model,
# report its accuracy on the held-out part and save its weights.
folds = StratifiedKFold(
    n_splits=CFG['fold_num'], shuffle=True, random_state=CFG['seed']
).split(train_df[input_categories].fillna('-1'), train_df[output_categories].fillna('-1'))

for fold, (train_idx, valid_idx) in enumerate(folds):

    # Stack ids / masks / segments along axis 1 so each sample is (3, max_len).
    # Going through one NumPy array avoids the slow tensor-from-list-of-arrays path.
    train_inputs = torch.from_numpy(np.stack([arr[train_idx] for arr in inputs], axis=1)).long()
    train_outputs = torch.from_numpy(outputs[train_idx]).long()

    valid_inputs = torch.from_numpy(np.stack([arr[valid_idx] for arr in inputs], axis=1)).long()
    valid_outputs = torch.from_numpy(outputs[valid_idx]).long()

    train_dataset = Data.TensorDataset(train_inputs, train_outputs)
    valid_dataset = Data.TensorDataset(valid_inputs, valid_outputs)

    train_loader = Data.DataLoader(train_dataset, CFG['batch_size'], shuffle=True)
    # Accuracy does not depend on sample order, so validation is not shuffled.
    valid_loader = Data.DataLoader(valid_dataset, CFG['batch_size'], shuffle=False)

    model = TextNet().to(CFG['device'])
    criterion = nn.CrossEntropyLoss().to(CFG['device'])
    optimizer = optim.Adam(model.parameters(), lr=CFG['lr'])

    # ---- training ----
    model.train()
    for epoch in range(CFG['epochs']):
        for batch_x, batch_y in tqdm(train_loader):
            batch_x, batch_y = batch_x.to(CFG['device']), batch_y.to(CFG['device'])
            ids, masks, segments = batch_x[:,0,:], batch_x[:,1,:], batch_x[:,2,:]

            pred = model(ids, masks, segments)
            loss = criterion(pred, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # ---- validation ----
    # Counters live outside the epoch loop so they exist even when epochs == 0,
    # and they no longer shadow the builtin `all`.
    correct = 0
    total = 0
    # eval() disables dropout; no_grad() skips building the autograd graph.
    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in tqdm(valid_loader):
            batch_x, batch_y = batch_x.to(CFG['device']), batch_y.to(CFG['device'])
            ids, masks, segments = batch_x[:,0,:], batch_x[:,1,:], batch_x[:,2,:]
            pred_y = model(ids, masks, segments).argmax(dim=1)
            correct += (pred_y == batch_y).sum().item()
            total += batch_y.size(0)
    accuracy = correct / total
    print('fold: ', (fold+1), '| accuracy: %.4f' % accuracy)
    torch.save(model.state_dict(), '/content/drive/MyDrive/data/Emotion_Recognition/tmp/fold_{}.pt'.format(fold))

100%|██████████| 2498/2498 [12:24<00:00,  3.35it/s]
100%|██████████| 625/625 [01:01<00:00, 10.10it/s]


fold:  1 | accuracy: 0.5767


100%|██████████| 2498/2498 [12:25<00:00,  3.35it/s]
100%|██████████| 625/625 [01:01<00:00, 10.09it/s]


fold:  2 | accuracy: 0.5767


100%|██████████| 2498/2498 [12:25<00:00,  3.35it/s]
100%|██████████| 625/625 [01:01<00:00, 10.09it/s]


fold:  3 | accuracy: 0.5767


100%|██████████| 2498/2498 [12:24<00:00,  3.35it/s]
100%|██████████| 625/625 [01:01<00:00, 10.10it/s]


fold:  4 | accuracy: 0.5767


100%|██████████| 2498/2498 [12:24<00:00,  3.35it/s]
100%|██████████| 625/625 [01:01<00:00, 10.10it/s]


fold:  5 | accuracy: 0.5767


# 预测

In [None]:
# Build the test loader without rebinding `test_inputs`, so re-running this
# cell always starts from the same [ids, masks, segments] arrays.
# Stacking along axis 1 gives one (3, max_len) block per sample.
test_tensor = torch.from_numpy(np.stack(test_inputs, axis=1)).long()
test_dataset = Data.TensorDataset(test_tensor)
# Order must be preserved so predictions line up with the test rows.
test_loader = Data.DataLoader(test_dataset, CFG['batch_size'], shuffle=False)


In [None]:

# Instantiate the network on the configured device; the prediction cell loads
# each fold's saved weights into it.
model = TextNet().to(CFG['device'])

In [None]:
# Run each of the trained fold models over the test set, then average their
# outputs and take the arg-max class per sample.
pred_ys = []
for fold in range(CFG['fold_num']):
    fold_outputs = []
    # map_location lets checkpoints saved on GPU load on a CPU-only runtime.
    state_dict = torch.load(
        '/content/drive/MyDrive/data/Emotion_Recognition/tmp/fold_{}.pt'.format(fold),
        map_location=CFG['device'],
    )
    model.load_state_dict(state_dict)
    # Disable dropout so predictions are deterministic.
    model.eval()

    with torch.no_grad():
        for batch_x, in tqdm(test_loader):
            batch_x = batch_x.to(CFG['device'])
            ids, masks, segments = batch_x[:,0,:], batch_x[:,1,:], batch_x[:,2,:]
            output = model(ids, masks, segments)
            fold_outputs.extend(output.cpu().numpy().tolist())
    pred_ys.append(fold_outputs)

# Mean over folds -> (n_samples, n_class); arg-max -> predicted class index.
pred_y = np.mean(pred_ys, axis=0)
pred = np.argmax(pred_y, axis=1)

100%|██████████| 313/313 [00:30<00:00, 10.11it/s]
100%|██████████| 313/313 [00:30<00:00, 10.10it/s]
100%|██████████| 313/313 [00:30<00:00, 10.13it/s]
100%|██████████| 313/313 [00:30<00:00, 10.13it/s]
100%|██████████| 313/313 [00:30<00:00, 10.12it/s]


In [None]:
# Header row and one [position, label] row per prediction; 1 is subtracted
# from each predicted class index.
headers = ['测试数据id', '情感极性']
rows = [[index, label - 1] for index, label in enumerate(pred)]

In [None]:
# Write the header and prediction rows to the result CSV.
# newline='' lets the csv module control line endings itself, and the explicit
# UTF-8 encoding keeps the non-ASCII header independent of the platform default.
with open('/content/drive/MyDrive/data/Emotion_Recognition/tmp/result.csv', 'w', newline='', encoding='utf-8') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)