In [3]:
# use_cuda
from transformers import BertModel, BertTokenizer
BERT_PATH = './'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
print(tokenizer.tokenize('I have a good time, thank you.'))
bert = BertModel.from_pretrained(BERT_PATH)
print('load bert model over')

['i', 'have', 'a', 'good', 'time', ',', 'thank', 'you', '.']
load bert model over


In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
text = 'I will watch Memento tonight'
bert_input = tokenizer(text, padding='max_length', max_length=10, truncation=True, return_tensors='pt')

print(bert_input['input_ids'])

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tensor([[  101,   146,  1209,  2824,  2508, 26173,  3568,   102,     0,     0]])


In [6]:
print(bert_input['token_type_ids']) 
print(bert_input['attention_mask'])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [7]:
from torch import nn
import torch



class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 7)
        self.relu = nn.ReLU()
    def forward(self, input_id, mask):
        _, pooled_out = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # print(pooled_out.size())
        dropout_output = self.dropout(pooled_out)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [8]:
import numpy as np
import torch.utils.data as data

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
# {'business', 'entertainment', 'politics', 'sport', 'tech'}
labels={'business':0,'entertainment':1,'politics':2,'sport':3,'tech':4}


In [9]:
class Dataset(data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(
            text,
            padding='max_length',
            max_length = 20,
            truncation = True,
            return_tensors="pt"
        ) for text in df['text']]
    def classes(self):
        return self.labels
    
    def __len__(self):
        return len(self.labels)
    
    def get_batch_labels(self, idx):
        return np.array(self.labels[idx])
    
    def get_batch_text(self, idx):
        return self.texts[idx]
    
    def __getitem__(self, idx):
        batch_texts = self.get_batch_text(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


In [19]:
from torch.optim import Adam
import torch.utils
import torch.utils.data
from tqdm import tqdm
import matplotlib.pyplot as plt

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)

    val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    train_loss=[]
    train_acc=[]
    val_loss=[]
    val_acc=[]
    EPOCH=[]

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        train_loss.append(total_loss_train / len(train_data['text']))
        train_acc.append(total_acc_train / len(train_data['text']))
        val_loss.append(total_loss_val / len(val_data['text']))
        val_acc.append(total_acc_val / len(val_data['text']))
        EPOCH.append(epoch_num+1)
        print(
            f'''Epochs: {epoch_num + 1} 
              | Train Loss: {total_loss_train / len(train_data['text']): .3f} 
              | Train Accuracy: {total_acc_train / len(train_data['text']): .3f} 
              | Val Loss: {total_loss_val / len(val_data['text']): .3f} 
              | Val Accuracy: {total_acc_val / len(val_data['text']): .3f}''')
 
    print("saving bert model......")
    torch.save(model.state_dict(),'../bert-base-cased/bert_trained_snips_full.pt')
 
    #画图
    plt.plot(EPOCH,train_loss,'b',label='train_loss')
    plt.plot(EPOCH, train_acc,'g',label='train_acc')
    plt.plot(EPOCH, val_loss, 'r', label='val_loss')
    plt.plot(EPOCH, val_acc, 'c', label='val_acc')
    plt.show()


In [11]:

def evaluate(model, test_data):
    test = Dataset(test_data)
    length=len(test_data['text'])
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
 
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
    print(f'Test Accuracy: {total_acc_test / length: .3f}')

In [12]:
import pandas as pd
file = pd.read_csv('./bbc-text.csv')
file

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [13]:
set(file['category'])

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [14]:
np.random.seed(112)

df_train, df_val, df_test = np.split(file.sample(frac=1, random_state=42), [int(.8*len(file)), int(.9*len(file))])
print(len(df_train), len(df_val), len(df_test))

1780 222 223


  return bound(*args, **kwds)


In [20]:
model = BertClassifier()
EPOCHS = 5
LR = 1e-6
train(model, df_train, df_val, LR, EPOCHS)


  0%|          | 0/56 [00:00<?, ?it/s][A


RuntimeError: expected scalar type Long but found Int