In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import (
    BertModel,
    BertConfig,
    BertTokenizer,
    AdamW,
    BertForSequenceClassification,
    set_seed,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import os
import sys
import time
import datetime

In [2]:
set_seed(42)

In [3]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cuda:0'

In [19]:
df['cls'] = None
for idx, row in df.iterrows():
    flag = False
    for level_one_tag, level_two_tag_list in level_two_tag.items():
        if row['二级标签'] in level_two_tag_list:
            df['cls'][idx] = level_one_tag
            flag = True
            break
    if flag is False:
        print('二级标签<{}>未找到对应分类'.format(row['二级标签']))

In [52]:
label2id_map = {
    '色情类': 0,
    '广告类': 1,
    '政治类': 2,
    '辱骂类': 3,
    '其他类': 4,
}

In [4]:
df = pd.read_pickle('/data-input/lichunyu/data/5cls.pkl')

In [55]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [6]:
tokenizer = BertTokenizer.from_pretrained('/data-input/lichunyu/bert-base-chinese')

In [56]:
def tokenize_batch(df):
    input_ids = []
    attention_masks = []
    for idx, row in df.iterrows():
        encoded_dict = tokenizer(
                            row['text'], 
                            add_special_tokens = True,
                            truncation='longest_first',
                            max_length = 150,
                            padding = 'max_length',
                            return_attention_mask = True,
                            return_tensors = 'pt',
                       )


        input_ids.append(encoded_dict['input_ids'])

        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df['label'].tolist())

    return input_ids, attention_masks, labels
    

train_input_ids, train_attention_masks, train_labels = tokenize_batch(df_train)
test_input_ids, test_attention_masks, test_labels= tokenize_batch(df_test)

In [58]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

In [59]:
batch_size = 16

train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = batch_size
        )

In [67]:
optimizer = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8
                )

epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [12]:
def flat_accuracy(preds, labels):
    
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return accuracy_score(labels_flat, pred_flat)

def flat_f1(preds, labels):

    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    return f1_score(labels_flat, pred_flat, average='micro')


def format_time(elapsed):    

    elapsed_rounded = int(round((elapsed)))

    return str(datetime.timedelta(seconds=elapsed_rounded))

In [15]:
del(model)
import gc
gc.collect()
torch.cuda.empty_cache()

In [66]:
model = BertForSequenceClassification.from_pretrained(
    '/data-input/lichunyu/bert-base-chinese',
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False,
)

Some weights of the model checkpoint at /data-input/lichunyu/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from th

In [68]:
model.to(device)

training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
    
    print('')
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 16 == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        input_ids = batch[0].to(device).to(torch.int64)
        attention_mask = batch[1].to(device).to(torch.int64)
        labels = batch[2].to(device).to(torch.int64)

        model.zero_grad()

        output = model(input_ids=input_ids,
                             attention_mask=attention_mask, 
                             labels=labels)

        loss = output.loss
        logits = output.logits
        total_train_loss += loss.item()

        loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            

    training_time = format_time(time.time() - t0)

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epcoh took: {:}'.format(training_time))
        
    print('')
    print('Running Validation...')

    t0 = time.time()
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0
    nb_eval_steps = 0

    for batch in test_dataloader:

        input_ids = batch[0].to(device).to(torch.int64)
        attention_mask = batch[1].to(device).to(torch.int64)
        labels = batch[2].to(device).to(torch.int64)
 
        with torch.no_grad():        

            output = model(input_ids,
                           attention_mask=attention_mask,
                           labels=labels)
            loss = output.loss
            logits = output.logits

        total_eval_loss += loss.item()
   
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        total_eval_f1 += flat_f1(logits, label_ids)
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print('  Accuracy: {0:.2f}'.format(avg_val_accuracy))

    avg_val_f1 = total_eval_f1 / len(test_dataloader)
    print('  F1: {0:.2f}'.format(avg_val_f1))

    avg_val_loss = total_eval_loss / len(test_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print('  Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Val_F1' : avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print('')
print('Training complete!')

print('Total training took {:} (h:mm:ss)'.format(format_time(time.time()-total_t0)))


Training...
  Batch    16  of    117.    Elapsed: 0:00:04.
  Batch    32  of    117.    Elapsed: 0:00:09.
  Batch    48  of    117.    Elapsed: 0:00:13.
  Batch    64  of    117.    Elapsed: 0:00:17.
  Batch    80  of    117.    Elapsed: 0:00:22.
  Batch    96  of    117.    Elapsed: 0:00:26.
  Batch   112  of    117.    Elapsed: 0:00:30.

  Average training loss: 0.46
  Training epcoh took: 0:00:32

Running Validation...
  Accuracy: 0.92
  F1: 0.92
  Validation Loss: 0.22
  Validation took: 0:00:03

Training...
  Batch    16  of    117.    Elapsed: 0:00:04.
  Batch    32  of    117.    Elapsed: 0:00:09.
  Batch    48  of    117.    Elapsed: 0:00:13.
  Batch    64  of    117.    Elapsed: 0:00:17.
  Batch    80  of    117.    Elapsed: 0:00:22.
  Batch    96  of    117.    Elapsed: 0:00:26.
  Batch   112  of    117.    Elapsed: 0:00:30.

  Average training loss: 0.08
  Training epcoh took: 0:00:32

Running Validation...
  Accuracy: 0.96
  F1: 0.96
  Validation Loss: 0.15
  Validation to

In [70]:
torch.save(model, '/data-input/lichunyu/models/bert-6301111-96-cls5.pth')

In [73]:
from transformers import WEIGHTS_NAME
output_model_file = os.path.join('/data-input/lichunyu/models/', WEIGHTS_NAME)
torch.save(model.state_dict(), output_model_file)

In [71]:
label2id_map = {
    '色情类': 0,
    '广告类': 1,
    '政治类': 2,
    '辱骂类': 3,
    '其他类': 4
}
id2label = {v: k for k, v in label2id_map.items()}

In [72]:
x = '今天天气不错'
input_ids = []
attention_masks = []

# for i in test:
encoded_dict = tokenizer(
                x, 
                add_special_tokens = True,
                truncation='longest_first',
                max_length = 150,
                padding = 'max_length',
                return_attention_mask = True,
                return_tensors = 'pt',
           )
input_ids.append(encoded_dict['input_ids'])

attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
model.eval()
with torch.no_grad():        

    input_ids = input_ids.to(device).to(torch.int64)
    attention_masks = attention_masks.to(device).to(torch.int64)
    output = model(input_ids=input_ids,
                   attention_mask=attention_masks)

print(output.logits)
print('==========类别==========')
print(id2label[torch.argmax(output.logits).detach().cpu().numpy().tolist()])

tensor([[-1.2235, -1.3230, -0.6740, -0.7727,  3.4490]], device='cuda:0')
其他类


In [48]:
df_train.groupby('label').count()

Unnamed: 0_level_0,text,cls
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,199,199
1,318,318
2,547,547
3,340,340


In [50]:
with open('/data-input/lichunyu/data/chat.txt', 'r') as f:
    chat_dt = f.read().splitlines()
    
chat_dt_500 = chat_dt[:800]
chat_dt_clean = [d for d in chat_dt_500 if len(d) > 6]
len(chat_dt_clean)

582

In [53]:
dt_chat = {
    'text': chat_dt_clean,
    'cls': len(chat_dt_clean)*['其他'],
    'label': len(chat_dt_clean)*[4]
}
df_chat = pd.DataFrame(dt_chat)
df_chat

Unnamed: 0,text,cls,label
0,六人行 第1季 第01集 莫妮卡的新室友,其他,4
1,没什么好说的！ 他不过是我的同事！,其他,4
2,少来了，你和那个人一起出去！,其他,4
3,和你交往的男人一定有问题！,其他,4
4,打住，乔伊，嘴下留德,其他,4
...,...,...,...
577,真的?我不记得女人也会制造精子,其他,4
578,我们都知道那是个极大的挑战,其他,4
579,够了，你们两个，别吵了,其他,4
580,"不，功劳全让她抢了,我也在场",其他,4


In [None]:
df_train

In [54]:
df = pd.concat([df, df_chat])
df

Unnamed: 0,text,cls,label
0,什么烂剧 卧槽,辱骂类,3
1,妈的，颠覆三观的绿，看的想吐?,辱骂类,3
2,怪不得你们爱豆越活越娘炮，玩个游戏用力一点就叫的跟狗似的，还是去玩泥巴去吧,辱骂类,3
3,身边有一婊子弄的我实在无奈!怎么对付这种人kao!,辱骂类,3
4,关你球事？你晓得个锤子。,辱骂类,3
...,...,...,...
577,真的?我不记得女人也会制造精子,其他,4
578,我们都知道那是个极大的挑战,其他,4
579,够了，你们两个，别吵了,其他,4
580,"不，功劳全让她抢了,我也在场",其他,4


In [74]:
!pwd

/
