In [None]:
import pandas as pd
# Read the transcript table from a CSV file in the working directory and
# print a summary of the resulting frame.
df = pd.read_csv('all_matched_transcript.csv')
df.info()

In [None]:
# Recode label value -1 as 0; every other value in the column is left as is.
# NOTE(review): presumably the raw labels are -1/1, which would make this a
# 0/1 target suitable for the BCE-with-logits loss used later - confirm.
df['label'] = df['label'].replace(-1, 0)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer,BertModel,BertConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertPreTrainedModel
from transformers import AutoConfig, AutoTokenizer
from transformers import logging
logging.set_verbosity_error()

from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Pull the text and label columns out as plain Python lists for the
# tokenizer and the Dataset wrapper.
x_train = train['preprocessed_componenttext'].tolist()
y_train = train['label'].tolist()
x_test = test['preprocessed_componenttext'].tolist()
y_test = test['label'].tolist()

In [None]:
# Load the tokenizer from a local copy of the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained('./autodl-tmp/bert-base-uncased')
# Encode each split in a single call, with truncation capped at 512 tokens and
# padding enabled.
# NOTE(review): padding=True pads to the longest sequence within each call, so
# the train and test encodings may end up with different widths - confirm this
# is acceptable.
train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=512)
test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=512)

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs
    in which ``values`` can be indexed by sample position; ``labels`` is an
    indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is added
        under the key ``'labels'`` as a float scalar tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(float(self.labels[idx]))
        return sample
# Wrap each split's encodings and labels so they can be batched.
train_dataset = NewsDataset(train_encoding, y_train)
test_dataset = NewsDataset(test_encoding, y_test)

# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation keeps the dataset order: validation() returns per-sample outputs,
# and those can only be matched back to y_test if the order is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
class my_bert_model(nn.Module):
    """Binary classifier: a pretrained BERT encoder plus a small MLP head.

    The last-layer hidden state of the first token of every sequence is fed
    through Linear(hidden, 128) -> ReLU -> Linear(128, 32) -> ReLU ->
    Linear(32, 1). The result is one unnormalised logit per sequence, intended
    for a loss that applies the sigmoid itself.

    Args:
        freeze_bert: when True, gradients are disabled for every encoder
            parameter, so only the head can be trained.
        hidden_size: not used; accepted for backward compatibility. The width
            of the head's first layer comes from the loaded configuration.
        pretrained_path: local directory (or model id) holding the BERT
            configuration and weights.
    """

    def __init__(self, freeze_bert=True, hidden_size=768,
                 pretrained_path='./autodl-tmp/bert-base-uncased'):
        super().__init__()
        config = AutoConfig.from_pretrained(pretrained_path)
        # Load the checkpoint weights. Constructing BertModel(config) directly
        # only builds the architecture with randomly initialised parameters,
        # which would then be frozen below and never learn anything.
        self.bert = BertModel.from_pretrained(pretrained_path, config=config)

        # Classification head applied to the first-token representation.
        self.cls_layer1 = nn.Linear(config.hidden_size, 128)
        self.relu1 = nn.ReLU()
        self.ff1 = nn.Linear(128, 32)
        self.relu2 = nn.ReLU()
        self.ff2 = nn.Linear(32, 1)

        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return a tensor of logits with a trailing dimension of size 1.

        ``input_ids`` and ``attention_mask`` are passed to the encoder as is.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Only the final layer is needed, so the per-layer hidden states are
        # neither requested nor stacked.
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        hidden = self.relu1(self.cls_layer1(cls_embedding))
        hidden = self.relu2(self.ff1(hidden))
        return self.ff2(hidden)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the selected device; the trailing literal means "usable".
print(device,'能用')

In [None]:
# Build the classifier with its default arguments and move it to the selected device.
model = my_bert_model().to(device)
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss().to(device)

In [None]:
# Optimise only the parameters that still require gradients; frozen ones are
# filtered out.
# NOTE: the name `optim` rebinds the `torch.optim` alias from the import cell.
# It is kept because train() looks the optimizer up under this name.
optim = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5)

# The linear schedule decays the learning rate to zero after `total_steps`
# optimizer steps, so it must span every epoch of the training loop (which
# runs range(5)). Sizing it for a single epoch leaves the later epochs
# training with a learning rate of zero.
num_epochs = 5
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optim, num_warmup_steps = 0,
                        num_training_steps = total_steps)

In [None]:
def train():
    """Run one pass over ``train_loader``, stepping the optimizer per batch.

    Works on the notebook-level ``model``, ``criterion``, ``optim``,
    ``scheduler`` and ``device``. It also reads the global ``epoch`` for its
    log lines, so it has to be called from inside the epoch loop. A progress
    line is printed every 1000 batches and the mean batch loss at the end.
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0

    for step, batch in enumerate(train_loader, start=1):
        optim.zero_grad()

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        # Reshape the targets to a column so they match the model output.
        targets = batch['labels'].to(device).view(-1, 1)

        predictions = model(ids, attention_mask=mask)
        loss = criterion(predictions, targets)
        running_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        if step % 1000 == 0:
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, step, loss.item(), step/n_batches*100))

    print("Epoch: %d, Average training loss: %.4f"%(epoch, running_loss/n_batches))

In [None]:
def validation():
    """Evaluate the model on ``test_dataloader`` without computing gradients.

    Prints the mean of the per-batch losses.

    Returns:
        A flat list with one entry per evaluated sample, in the order the
        loader yielded them. Each entry is a slice of the model output along
        its first dimension, i.e. a raw logit tensor on ``device``, not a
        thresholded 0/1 label despite the variable name. Apply a sigmoid and
        a threshold to obtain class predictions.
    """
    model.eval()
    total_eval_loss = 0
    predicted_label = []
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Reshape the targets to a column so they match the model output.
            labels = batch['labels'].to(device).view(-1, 1)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Extending a list with a tensor iterates over its first
            # dimension, adding one element per sample in the batch.
            predicted_label.extend(outputs)

            total_eval_loss += criterion(outputs, labels).item()

    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    return predicted_label

In [None]:
# Train for five epochs, running an evaluation pass after each one.
# The loop variable `epoch` is read as a global by train() for its log lines.
for epoch in range(5):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    # `pred` is overwritten each time, so it ends up holding the outputs of
    # the final evaluation pass only.
    pred=validation()