使用BERT模型训练二分类模型

数据集（fake and real news dataset）

In [None]:
#import 相关的库
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

import pandas as pd
import random, time
from babel.dates import format_date, format_datetime, format_time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score

import torch
from torch import Tensor
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

import transformers, os
from transformers import BertModel, AutoModel, AdamW, get_linear_schedule_with_warmup, BertTokenizer, BertForSequenceClassification


In [None]:
# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Release cached, currently unused GPU memory held by PyTorch.
torch.cuda.empty_cache()
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value for reproducibility.
seed_val = 1234
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Load both CSV files and attach the target column:
# rows from True.csv get label 1, rows from Fake.csv get label 0.
df1 = pd.read_csv('D:/ML_data_sql/news/True.csv')
df2 = pd.read_csv('D:/ML_data_sql/news/Fake.csv')
df1['label'] = 1
df2['label'] = 0
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both halves keep their own 0-based index, so every label occurs twice and
# label-based lookups (df.loc[i]) return two rows.
df = pd.concat([df1, df2], ignore_index=True)

#data clean
def cleandata(data, stop_words=None):
    """Clean the ``text`` column of ``data`` in place.

    The following steps are applied to every document, in order:

    1. remove the characters ``#``, ``,``, ``@`` and ``&``;
    2. remove digits;
    3. remove every occurrence of ``www``;
    4. remove URLs (``http`` followed by non-whitespace characters);
    5. remove single ASCII letters that stand between whitespace;
    6. drop stop words and re-join the remaining words with single spaces
       (this also collapses any run of whitespace).

    Args:
        data: DataFrame with a ``text`` column. Missing values are treated
            as empty strings.
        stop_words: Optional iterable of words to drop. When ``None`` the
            NLTK English stop-word list is used. The extra words
            ``u, wa, ha, would, com`` are always dropped as well.

    Returns:
        None. ``data['text']`` is overwritten with the cleaned text.
    """
    # Missing values would otherwise reach ``str.split`` as float NaN.
    text = data['text'].fillna('').astype(str)

    # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would turn these steps into no-ops.
    substitutions = (
        r'[#,@&]',                  # same character set as the original '[#,@,&]'
        r'\d+',
        r'w{3}',
        r'http\S+',
        # Lookarounds leave the surrounding whitespace in place, so runs of
        # single letters ("a b c") are all removed and neighbours never merge.
        r'(?<=\s)[a-zA-Z](?=\s)',
    )
    for pattern in substitutions:
        text = text.str.replace(pattern, '', regex=True)

    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests inside the per-word filter.
    blocked = set(stop_words) | {'u', 'wa', 'ha', 'would', 'com'}

    # split()/join() also normalises whitespace. Whitespace must be kept up
    # to this point, otherwise each document collapses into a single token.
    data['text'] = text.apply(
        lambda doc: " ".join(word for word in doc.split() if word not in blocked)
    )

# Clean the text column in place before splitting.
cleandata(df)

# First hold out 20% of all rows as the test set ...
all_texts, all_labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=seed_val,
    shuffle=True,
)
# ... then hold out 20% of the remaining rows as the validation set.
(X_train_Transformer,
 X_val_Transformer,
 y_train_Transformer,
 y_val_Transformer) = train_test_split(X_train, y_train, test_size=0.20, random_state=seed_val)

In [None]:
# Pretrained checkpoint to load for both tokenizer and model.  TODO
model_name = 'bert-base-cased'

# Data shaping: SEQ_LEN is the chunk size in words for split_sentence and the
# tokenizer's max_length; batch_size / num_workers configure the DataLoaders.
SEQ_LEN = 200
batch_size = 2
num_workers = 3

# Optimisation settings.
epochs = 1
learning_rate = 1e-5  # Controls how large a step is taken when updating model weights during training.
steps_per_epoch = 100  # not referenced by the code visible in this notebook

In [None]:
# Split a text into chunks of at most SEQ_LEN words each
def split_sentence(sentence, seq_len=None):
    """Split ``sentence`` into chunks of at most ``seq_len`` words.

    The text is split on whitespace, grouped into consecutive chunks of
    ``seq_len`` words, and each chunk is re-joined with single spaces. The
    last chunk holds the remaining words, so no word is dropped.

    Args:
        sentence: Text to split.
        seq_len: Maximum number of words per chunk. Defaults to the
            module-level ``SEQ_LEN``.

    Returns:
        ``str()`` of the list of chunks, e.g. ``"['a b', 'c d']"``. An empty
        or whitespace-only input yields ``"['']"``.

    Raises:
        ValueError: If ``seq_len`` is zero.
    """
    if seq_len is None:
        seq_len = SEQ_LEN

    # Split once instead of re-splitting the text for every chunk.
    words = sentence.split()
    chunks = [
        " ".join(words[start:start + seq_len])
        for start in range(0, len(words), seq_len)
    ]
    if not chunks:
        chunks = [""]

    # NOTE(review): the return value is the textual repr of the list
    # (brackets and quotes included), and that is what the tokenizer later
    # receives. Kept because callers expect a str - confirm this is intended.
    return str(chunks)
# Run split_sentence over every document of the train, validation and test
# splits; each result list holds one string per document.
split_train_text = list(map(split_sentence, X_train_Transformer))
split_valid_text = list(map(split_sentence, X_val_Transformer))
split_test_text = list(map(split_sentence, X_test))

In [None]:
# Show the first validation sample together with its length in characters.
first_valid_sample = split_valid_text[0]
print(first_valid_sample, len(first_valid_sample))

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
print('Loading BERT tokenizer...')
# Lower-case the input only for "uncased" checkpoints. A cased checkpoint such
# as 'bert-base-cased' has a case-sensitive vocabulary, so forcing
# do_lower_case=True would feed it text it was not pretrained on.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case='uncased' in model_name)
def encoding(text):
    """Tokenize a collection of texts with the module-level ``tokenizer``.

    Args:
        text: Iterable of strings; it is materialised with ``list()``.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the options
        below (the notebook later reads its ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` entries).
    """
    documents = list(text)
    tokenizer_options = dict(
        max_length=SEQ_LEN,
        # Add '[CLS]' and '[SEP]': BERT requires both, one at the start and
        # one at the end of the sequence.
        add_special_tokens=True,
        return_token_type_ids=True,
        truncation=True,
        padding='longest',
        return_attention_mask=True,
    )
    return tokenizer.batch_encode_plus(documents, **tokenizer_options)

# Tokenize the three splits in the order train, validation, test.
traincoding, validcoding, testcoding = [
    encoding(split_texts)
    for split_texts in (split_train_text, split_valid_text, split_test_text)
]

In [None]:
# List the fields produced by the tokenizer for the training split.
encoding_fields = traincoding.keys()
print(encoding_fields)

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Balanced class weights computed over the labels of the whole dataset.
# `classes` and `y` are passed by keyword: current scikit-learn releases
# accept them as keyword-only arguments and reject the positional form.
label_values = df['label'].to_numpy()
class_wts = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_values),
    y=label_values,
)

# Convert the class weights to a float tensor on the training device so they
# can be handed to the loss function.
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)

In [None]:
#load the train data 
def makeDataloader(coding, transformers, batch_size, num_workers, shuffle=True):
    """Wrap tokenizer output and labels in a ``DataLoader``.

    Args:
        coding: Mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` entries that ``torch.tensor`` can convert.
        transformers: Labels for the samples; must provide ``tolist()``.
            (The name shadows the ``transformers`` module inside this
            function; it is kept so existing keyword callers still work.)
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes.
        shuffle: When True (default) batches are drawn with a
            ``RandomSampler``; when False samples are served in dataset
            order with a ``SequentialSampler``.

    Returns:
        Tuple ``(loader, labels)`` where ``labels`` is the label tensor in
        dataset order. Batches yield
        ``(input_ids, attention_mask, token_type_ids, label)``.
    """
    coding_seq = torch.tensor(coding['input_ids'])
    coding_mask = torch.tensor(coding['attention_mask'])
    coding_token_ids = torch.tensor(coding['token_type_ids'])
    coding_label = torch.tensor(transformers.tolist())

    dataset = TensorDataset(coding_seq, coding_mask, coding_token_ids, coding_label)
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        sampler=sampler,
        pin_memory=True,
    )
    return loader, coding_label


# Only the training loader is shuffled. Validation and test loaders keep
# dataset order so their predictions line up with the returned label tensors
# and are identical from one pass to the next.
traindata, train_label = makeDataloader(traincoding, y_train_Transformer, batch_size, num_workers)
valdata, val_label = makeDataloader(validcoding, y_val_Transformer, batch_size, num_workers, shuffle=False)
testdata, test_label = makeDataloader(testcoding, y_test, batch_size, num_workers, shuffle=False)


# len(loader) is the number of batches; len(loader.dataset) is the number of
# samples, which is what these messages announce.
print('Number of data in the train set', len(traindata.dataset))
print('Number of data in the validation set', len(valdata.dataset))
print('Number of data in the test set', len(testdata.dataset))
# If there is too much data, train on a subset of it.

In [None]:
#load bert model
class BERT_Arch(nn.Module):
    """BERT encoder followed by a small classification head.

    The head is: dropout -> linear (hidden -> hidden) -> batch norm ->
    dropout -> linear (hidden -> n_classes). All layer widths are taken from
    the loaded checkpoint's ``config.hidden_size``.

    Args:
        n_classes: Number of output classes (size of the logits vector).
        freeze_bert: When True, gradients are disabled for every parameter
            of the BERT encoder so only the head is trained.
    """

    def __init__(self, n_classes, freeze_bert=False):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple, which
        # forward() unpacks into two values.
        self.bert = BertModel.from_pretrained(model_name, return_dict=False)

        # Freeze bert layers
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        hidden_size = self.bert.config.hidden_size
        self.bert_drop_1 = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, hidden_size)
        # Sized from the checkpoint rather than hard-coded to 768, so
        # checkpoints with a different hidden size work as well.
        self.bn = nn.BatchNorm1d(hidden_size)
        self.bert_drop_2 = nn.Dropout(0.25)
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the class logits for a batch.

        Args:
            input_ids: Token ids, passed to the encoder unchanged.
            attention_mask: Attention mask, passed to the encoder unchanged.
            token_type_ids: Segment ids, passed to the encoder unchanged.

        Returns:
            Output of the final linear layer (one score per class).
        """
        # Only the second element of the encoder's tuple is used
        # (presumably the pooled output - see the BertModel documentation).
        _, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        output = self.bert_drop_1(output)
        output = self.fc(output)
        output = self.bn(output)
        output = self.bert_drop_2(output)
        output = self.out(output)
        return output

In [None]:
# Training setup: model, loss and optimizer.
print('Downloading the BERT custom model...')
# Binary classification, hence two output classes; Module.to() returns the
# module itself, so construction and device placement can be chained.
model = BERT_Arch(2).to(device)
# Cross-entropy loss weighted by the class weights computed earlier.
cross_entropy = nn.CrossEntropyLoss(weight=weights)
# Plain Adam over all model parameters.
trainable_parameters = model.parameters()
optimizer = optim.Adam(trainable_parameters, lr=learning_rate)

In [None]:
def train():
    print('training... ')
    model.train()
    total_loss, total_accuracy = 0, 0
    # empty list to save model predictions
    total_preds=[]
    # iterate over batches
    for step, batch in enumerate(traindata):
    
        # progress update after every 100 batches.
        if step % 100 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(traindata)))

        if torch.cuda.is_available():
            # push the batch to gpu
            batch = [r.to(device) for r in batch]

        sent_id, mask, token_type_ids, labels = batch
        # clear previously calculated gradients 
        model.zero_grad()        
        # get model predictions for the current batch
        preds = model(sent_id, mask, token_type_ids)
        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)
        # add on to the total loss
        total_loss = total_loss + loss.item()
        # backward pass to calculate the gradients
        loss.backward()
        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update parameters
        optimizer.step()
        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()
        # append the model predictions
        total_preds.append(preds)
        
        torch.cuda.empty_cache()

    # compute the training loss of the epoch
    avg_loss = total_loss / len(traindata)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [None]:
#evaluating the model 
def evaluate():
  
    print("\nEvaluating...")
    #t0 = time.time()
    
    model.eval() # deactivate dropout layers
    total_loss, total_accuracy = 0, 0
    
    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(valdata):
        # Progress update every 100 batches.
        if step % 100 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            #elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(valdata)))

        if torch.cuda.is_available():
            # push the batch to gpu
            batch = [t.to(device) for t in batch]

        sent_id, mask, token_type_ids, labels = batch

        # deactivate autograd
        with torch.no_grad(): # Dont store any previous computations, thus freeing GPU space

            # model predictions
            preds = model(sent_id, mask, token_type_ids)
            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()
            preds = preds.detach().cpu().numpy()
            total_preds.append(preds)

        torch.cuda.empty_cache()
    # compute the validation loss of the epoch
    avg_loss = total_loss / len(valdata) 
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
best_valid_loss = float('inf')

# Empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

# for each epoch perform training and evaluation
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, train_pred = train()
    
    #evaluate model
    valid_loss, val_pred = evaluate()
    
    print('Evaluation done for epoch {}'.format(epoch + 1))
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        print('Saving model...')
        torch.save(model.state_dict(), 'bert_weights.pth') # Save model weight's (you can also save it in .bin format)
   # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    #compute the acc
    train_acc = accuracy_score(train_pred, train_label)
    val_acc = accuracy_score(val_pred, val_label)
    print(f'\nTraining Loss: {train_loss:.3f}, Training Accuracy: {train_acc:.4f}')
    print(f'Validation Loss: {valid_loss:.3f}, Validation Accuracy: {val_acc:.4f}')

In [None]:
# Predict on the test set with the checkpoint saved by the training loop.
print('\nTest Set...')

print('Total batches:', len(testdata))

# The training loop writes its best weights to 'bert_weights.pth'. There is a
# single checkpoint, so it is loaded and evaluated once instead of being
# re-loaded three times as separate "fold" models.
path_model = 'bert_weights.pth'
model.load_state_dict(torch.load(path_model, map_location=device))

# Send the model to the device and put it in evaluation mode.
model.to(device)
model.eval()

batch_logits = []
batch_labels = []

# no_grad() is scoped to this block, so gradient tracking is restored
# afterwards and a later training run still works.
with torch.no_grad():
    for j, test_batch in enumerate(testdata):

        inference_status = 'Batch ' + str(j + 1)
        print(inference_status, end='\r')

        b_input_ids = test_batch[0].to(device)
        b_input_mask = test_batch[1].to(device)
        b_token_type_ids = test_batch[2].to(device)

        # The model returns the logits tensor for the whole batch.
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        token_type_ids=b_token_type_ids)

        batch_logits.append(outputs.detach().cpu().numpy())
        # Collect the labels in the same pass so predictions and ground
        # truth stay aligned whatever sampler the loader uses.
        batch_labels.append(test_batch[3].numpy())

print('\nPrediction complete.')

# Stack the per-batch logits into one (samples, classes) array.
stacked_val_preds = np.concatenate(batch_logits, axis=0)
test_preds = [stacked_val_preds]

# Average over the collected prediction matrices (only one here).
avg_preds = sum(test_preds) / len(test_preds)

# Take the argmax.
# This returns the column index of the max value in each row.
test_predictions = np.argmax(avg_preds, axis=1)

# One true label per test sample, in prediction order.
true_y = [int(label) for label in np.concatenate(batch_labels, axis=0)]

print('Test accuracy: {:.4f}'.format(accuracy_score(true_y, test_predictions)))

# Take a look of the output
print(type(test_predictions))
print(len(test_predictions))
print()
print(test_predictions)

In [None]:
# Visualization: confusion matrix of true vs. predicted test labels.

target_names = ['true_y', 'predicted_y']

data = dict(true_y=true_y, predicted_y=test_predictions)

df_pred_BERT = pd.DataFrame(data, columns=['true_y', 'predicted_y'])

# Cross-tabulate true labels (rows) against predicted labels (columns).
true_column = df_pred_BERT['true_y']
predicted_column = df_pred_BERT['predicted_y']
confusion_matrix = pd.crosstab(
    true_column,
    predicted_column,
    rownames=['True'],
    colnames=['Predicted'],
)

# Draw the counts as an annotated heatmap.
sns.heatmap(confusion_matrix, annot=True)
plt.show()
