In [2]:
from torch.utils.data import Dataset,DataLoader,random_split
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
from transformers import BertTokenizer,BertModel
from sklearn.metrics import accuracy_score
from torch.utils.tensorboard import SummaryWriter

In [3]:
train_df = pd.read_csv('data/train_dataset.csv',usecols=['text','label'])
print(train_df.shape)
train_df.head()
sentences = list(train_df['text'])
labels =train_df['label'].values

(5000, 2)


In [4]:
def flat_accuracy(preds,labels):
    pred_flat=np.argmax(preds,axis=1).flatten()
    labels_flat=labels.flatten()
    return accuracy_score(labels_flat,pred_flat)

In [5]:
class DataToDataset(Dataset):
    def __init__(self,sentences,labels):
        tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
        max_length=600
        self.encoding=tokenizer(sentences,padding=True,truncation=True,max_length=max_length,return_tensors='pt')
        self.labels=torch.tensor(labels)
        
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self,index):
        return self.encoding['input_ids'][index],self.encoding['attention_mask'][index],self.labels[index]

In [7]:
datasets=DataToDataset(sentences,labels)
train_size=int(len(datasets)*0.8)
test_size=len(datasets)-train_size
train_dataset,val_dataset=random_split(dataset=datasets,lengths=[train_size,test_size])
BATCH_SIZE=32
train_loader=DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=True)

In [8]:
class BertTextClassficationModel(nn.Module):
    def __init__(self):
        super(BertTextClassficationModel,self).__init__()
        self.bert=BertModel.from_pretrained('bert-base-uncased')
        self.classification_head=nn.Sequential(nn.Linear(768,200),nn.ReLU(inplace=True),nn.Linear(200,2),nn.ReLU(inplace=True))
        
    def forward(self,ids,mask):
        out=self.bert(input_ids=ids,attention_mask=mask).last_hidden_state
        out=self.classification_head(out[:,0,:])
        return out

In [9]:
loss_func=nn.CrossEntropyLoss()
model=BertTextClassficationModel()
optimizer=optim.Adam(model.parameters(),lr=0.0001)
device="cpu"
writer=SummaryWriter('./logs')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
epochs=10
for epoch in range(epochs):
    train_loss = 0.0
    train_acc=0.0
    for i,data in enumerate(train_loader):
        print(epoch,i)
        input_ids,attention_mask,labels=[elem.to(device) for elem in data]
        #优化器置零
        optimizer.zero_grad()
        #得到模型的结果
        out=model(input_ids,attention_mask)
        #计算误差
        loss=loss_func(out,labels)
        writer.add_scalar('loss',loss,epoch)
        train_loss += loss.item()
        #误差反向传播
        loss.backward()
        #更新模型参数
        optimizer.step()
        #计算acc 
        out=out.detach().numpy()
        labels=labels.detach().numpy()
        train_acc+=flat_accuracy(out,labels)

    print("train %d/%d epochs Loss:%f, Acc:%f" %(epoch,epochs,train_loss/(i+1),train_acc/(i+1)))

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
0 96
0 97
0 98
0 99
0 100
0 101
0 102
0 103
0 104
0 105
0 106
0 107
0 108
0 109
0 110
0 111
0 112
0 113
0 114
0 115
0 116
0 117
0 118
0 119
0 120
0 121
0 122
0 123
0 124
train 0/3 epochs Loss:0.027788, Acc:0.992000
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 32
1 33
1 34
1 35
1 36
1 37
1 38
1 39
1 40
1 41
1 42
1 43
1 44
1 45
1 46
1 47
1 48
1 49
1 50
1 51
1 52
1 53
1 54
1 55
1 56
1 57
1 58
1 59
1 60
1 61
1 62
1 63
1 64


In [11]:
val_loss=0
val_acc=0
model.eval()
for j,batch in enumerate(val_loader):
    val_input_ids,val_attention_mask,val_labels=[elem.to(device) for elem in batch]
    with torch.no_grad():
        pred=model(val_input_ids,val_attention_mask)
        val_loss+=loss_func(pred,val_labels)
        pred=pred.detach().cpu().numpy()
        val_labels=val_labels.detach().cpu().numpy()
        val_acc+=flat_accuracy(pred,val_labels)
print("evaluate loss:%d, Acc:%d" %(val_loss/len(val_loader),val_acc/len(val_loader)))

evaluate loss:0, Acc:1


In [12]:
writer.close()