In [3]:
import pandas as pd
import csv
import codecs
import numpy as np
from sklearn.model_selection import train_test_split
# Load the competition data. Each CSV is parsed once and split by column
# position: column 0 feeds the text lists, column 1 (training file only) feeds
# the label list. Everything is read as str, matching the original behaviour.
DATA_DIR = '../input/hust-ml'

train_df = pd.read_csv(DATA_DIR + '/train1.csv', dtype=str, encoding='utf-8', usecols=[0, 1])
x_train = train_df.iloc[:, 0].tolist()
train_label = train_df.iloc[:, 1].tolist()

test_df = pd.read_csv(DATA_DIR + '/test1.csv', dtype=str, encoding='utf-8', usecols=[0])
x_test = test_df.iloc[:, 0].tolist()

# One label string per line. The context manager closes the handle (the
# previous codecs.open call never did) and the encoding is now explicit
# instead of depending on the platform locale.
with open(DATA_DIR + '/ans.txt', encoding='utf-8') as label_file:
    test_label = [line.strip() for line in label_file]

# Alternative kept for reference: carve a stratified 20% hold-out split out of
# the training file instead of using test1.csv / ans.txt.
# news_text = np.array(pd.read_csv('../input/hust-ml/train1.csv', dtype=str, encoding='utf-8', usecols=[0])).ravel().tolist()
# news_label = np.array(pd.read_csv('../input/hust-ml/train1.csv', dtype=str, encoding='utf-8', usecols=[1])).ravel().tolist()
# x_train, x_test, train_label, test_label =  train_test_split(news_text[:], news_label[:], test_size=0.2, stratify=news_label[:])

# Sanity check: both lines should show the same sample count. Printing the
# length as a 1-tuple gives the same text as np.array(...).shape without
# materialising a fixed-width unicode array of every full document.
print((len(x_test),))
print((len(test_label),))

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re
from transformers import BertTokenizer
# Tokenizer together with its vocabulary, loaded from the local BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('../input/huggingface-bert/bert-base-chinese')

# Encode the training texts first, then the test texts, with identical
# settings: truncation and padding enabled, sequences capped at 512 tokens.
train_encoding, test_encoding = (
    tokenizer(texts, truncation=True, padding=True, max_length=512)
    for texts in (x_train, x_test)
)

In [5]:
# Show how the first training text is split into tokens, and the vocabulary
# ids those tokens map to.
first_tokens = tokenizer.tokenize(x_train[0])
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

In [6]:
# Dataset wrapper
class NewsDataset(Dataset):
    """Index-addressable view over tokenizer output and its labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            per-sample sequence; every value is indexed with the sample index.
        labels: Per-sample labels. Each one is passed through ``int()``, so
            integers and numeric strings are both accepted.
        label_offset: Constant added to every label before it is returned.
            The default of 2 keeps the original behaviour; it presumably maps
            raw labels in -2..1 onto the class ids 0..3 expected by a 4-class
            head (confirm against the label file). ``predict`` must subtract
            the same offset when writing results.
    """

    def __init__(self, encodings, labels, label_offset=2):
        self.encodings = encodings
        self.labels = labels
        self.label_offset = label_offset

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]) + self.label_offset)
        return item

    def __len__(self):
        """Number of samples, defined by the length of the label sequence."""
        return len(self.labels)

# Pair each split's encodings with its labels so a DataLoader can index them.
train_dataset, test_dataset = (
    NewsDataset(train_encoding, train_label),
    NewsDataset(test_encoding, test_label),
)
# Display one training sample as a quick sanity check of the tensor fields.
train_dataset[15]

In [7]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
# Pretrained Chinese BERT with a 4-way classification head, moved to the GPU
# when one is available.
model = BertForSequenceClassification.from_pretrained('../input/huggingface-bert/bert-base-chinese', num_labels=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# From single-sample access to batched access.
# Training batches are shuffled so every epoch sees a different order (the CSV
# order may be correlated with the label). The test loader must stay in file
# order because predict() writes one line per sample in loader order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Optimisation setup
epoches = 3
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # Parameters whose name contains none of the `no_decay` fragments get
    # weight decay. The per-group key must be 'weight_decay'; the previous
    # 'weight_decay_rate' key is not an AdamW option, so it was silently
    # ignored and both groups fell back to the optimizer default.
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.1},

    # Biases and LayerNorm weights are excluded from weight decay.
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optim = AdamW(optimizer_grouped_parameters, lr=2e-5)

# The linear schedule decays the learning rate to zero after `total_steps`
# optimizer steps, so the training loop must not run more than `epoches` epochs.
total_steps = len(train_loader) * epoches
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [8]:
# List a few named parameters with their shapes, grouped by where they sit in
# the flat parameter list: the first five, the next sixteen, and the last four.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, entries in sections:
    print(heading)
    for param_name, param_tensor in entries:
        shape_text = str(tuple(param_tensor.size()))
        print("{:<55} {:>12}".format(param_name, shape_text))

In [9]:
def flat_accuracy(preds, labels):
    """Fraction of rows whose arg-max class equals the corresponding label.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            arg-max along axis 1.
        labels: Array of true class ids; it is flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    matches = np.sum(predicted == expected)
    return matches / expected.shape[0]

In [10]:
import time
import datetime

def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``timedelta``.

    The value is rounded to whole seconds first, so the result looks like
    ``h:mm:ss`` (for example ``'0:01:00'``), with a day prefix for long spans.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [11]:
# Training routine
def train(epoch=None):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Uses the notebook-level ``model``, ``train_loader``, ``optim``,
    ``scheduler`` and ``device``. Progress is printed every 100 batches and
    the mean batch loss is printed at the end.

    Args:
        epoch: Epoch index used only in the log lines. When omitted, the
            notebook-level ``epoch`` loop variable is used if it exists
            (so the existing ``train()`` call keeps working); otherwise 0.
    """
    if epoch is None:
        # Explicit lookup instead of an implicit global read, so calling the
        # function outside the epoch loop no longer raises NameError.
        epoch = globals().get('epoch', 0)

    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]  # with labels supplied, the first output is the loss
        total_train_loss += loss.item()

        # Backward pass; clip the gradient norm to 1.0 before stepping
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update, then advance the learning-rate schedule
        optim.step()
        scheduler.step()

        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))

    # max(..., 1) avoids ZeroDivisionError when the loader is empty
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/max(total_iter, 1)))
    
def validation():
    """Evaluate ``model`` on ``test_dataloader`` and print accuracy, loss and timing.

    Uses the notebook-level ``model``, ``test_dataloader`` and ``device``.
    Accuracy is computed over all samples (correct / total) rather than as a
    mean of per-batch accuracies, which would over-weight a smaller final
    batch. The reported loss is the mean of the per-batch losses.
    """
    model.eval()
    t0 = time.time()
    correct = 0
    seen = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            # Forward pass only; no gradients are needed for evaluation
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        # With labels supplied the outputs start with (loss, logits)
        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()
        predicted = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
        label_ids = labels.to('cpu').numpy().flatten()
        correct += int(np.sum(predicted == label_ids))
        seen += len(label_ids)

    validation_time = format_time(time.time() - t0)
    num_batches = len(test_dataloader)
    # Guard both averages so an empty loader reports 0 instead of raising
    avg_val_accuracy = correct / seen if seen else 0.0
    avg_val_loss = total_eval_loss / num_batches if num_batches else 0.0
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(avg_val_loss))
    print("Validation took: {:}".format(validation_time))
    print("-------------------------------")

def predict(output_path="ans.txt"):
    """Predict a class for every sample in ``test_dataloader`` and write them to a file.

    Uses the notebook-level ``model``, ``test_dataloader`` and ``device``.
    One prediction is written per line, in loader order, after subtracting
    the offset of 2 that ``NewsDataset`` adds to the raw labels.

    Args:
        output_path: File to write; defaults to ``"ans.txt"`` in the working
            directory, as before. An existing file is overwritten.
    """
    model.eval()
    predictions = []
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            # Labels are deliberately not passed: no loss is needed for
            # prediction, and without labels the first output is the logits.
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs[0].detach().cpu().numpy()
        batch_preds = np.argmax(logits, axis=1).flatten()
        predictions.extend(batch_preds.tolist())

    with open(output_path, "w", encoding="utf-8") as f:
        for pred in predictions:
            # Shift class ids back to the raw label range
            f.write(str(pred - 2))
            f.write('\n')
    print("-------------------------------")
    print("predict finished.")

In [12]:
# Run exactly as many epochs as the LR scheduler was configured for. The
# linear schedule reaches a learning rate of zero after
# len(train_loader) * epoches steps, so any extra epoch (the loop previously
# ran 4 while epoches is 3) spends a full pass updating nothing.
for epoch in range(epoches):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    # validation()
    # Rewrites the prediction file after every epoch; the last epoch's output wins.
    predict()