In [1]:
import os 
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [2]:
train = pd.read_csv(Path('data').joinpath('train.csv'))
train = train[['Prompt', 'Answer', 'Target']]
# 统计各特征为null的数量
print(train.isnull().sum())
# 填充nan值
train.loc[train['Answer'].isna(), 'Answer'] = 'NAN'

Prompt     0
Answer    19
Target     0
dtype: int64


In [3]:
text_columns = ['Prompt', 'Answer']

train['Text'] = train[text_columns].agg(' '.join, axis=1)
train['len'] = train['Text'].apply(lambda x: len(x))

train.head()

Unnamed: 0,Prompt,Answer,Target,Text,len
0,[INST] You are an AI assistant that helps peop...,Step-by-step reasoning process:\n1. Randy spen...,0,[INST] You are an AI assistant that helps peop...,1043
1,[INST] You are an AI assistant. You will be gi...,What is the temperature at which hypothermia b...,0,[INST] You are an AI assistant. You will be gi...,2285
2,[INST] You are an AI assistant. You will be gi...,Answer: c) No. \n\nThe hypothesis is false bec...,0,[INST] You are an AI assistant. You will be gi...,1060
3,[INST] You are an AI assistant. User will you ...,Prismatoid,0,[INST] You are an AI assistant. User will you ...,3519
4,[INST] You are an AI assistant. User will you ...,Case B,0,[INST] You are an AI assistant. User will you ...,1449


In [4]:
# 加载bert tokenize模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 5000

# bert使用示例
encoding = tokenizer.encode_plus(
    train['Text'][0],
    add_special_tokens=True,
    max_length=max_len,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
print(encoding['input_ids'].flatten())
print(type(tokenizer))

tensor([  101,  1031, 16021,  ...,     0,     0,     0])
<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>


In [5]:
from typing import List, Dict, Any


class QADataset(Dataset):
    def __init__(self, texts: List, labels: List, tokenizer: Any, max_len: int) -> None:
        super().__init__()
        self.texts = texts
        self.lables = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx) -> Dict:
        text = self.texts[idx]
        label = self.lables[idx]
        
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label).type(torch.LongTensor)
        }

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    train['Text'].tolist(),
    train['Target'].tolist(),
    test_size=0.1,
    random_state=42
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512
train_dataset = QADataset(X_train, y_train, tokenizer, max_len)
test_dataset = QADataset(X_test, y_test, tokenizer, max_len)

# dataset使用示例
train_dataset[0]['input_ids']

tensor([  101,  1031, 16021,  2102,  1033,  2017,  2024,  2019,  9932,  3353,
         1012,  5310,  2097,  2017,  2507,  2017,  1037,  4708,  1012,  2115,
         3125,  2003,  2000,  3143,  1996,  4708,  2004, 11633,  2135,  2004,
         2017,  2064,  1012,  2096,  4488,  1996,  4708,  2228,  3357,  1011,
         2011,  1011,  3357,  1998, 16114,  2115,  4084,  1012,  1031,  1013,
        16021,  2102,  1033,  3335,  5143, 11586,  2098,  2041,  1010,  1998,
         6871,  2000,  6814,  2008,  2016,  2018,  2053,  2051,  2000,  2228,
         2127,  2016,  2001,  2006,  2014,  2126,  2188,  2247,  1996,  4064,
         2346,  2461, 14855, 22426,  2012,  5408,  1051,  1005,  5119,  2008,
         2305,  1012,  2059,  2009,  7537,  2014,  2000,  2131,  2041,  1997,
         2014,  6174, 17980,  1998,  3328,  1012,  2045,  2001,  2019,  6728,
        27581,  4231,  1010,  1996,  6565,  4564, 21439,  2091,  2000,  1996,
         8575,  2020,  2035,  4462,  1998, 21666,  1010,  1998, 

In [7]:
# 创建基于Bert的二分类神经网络

class BertClassifier(nn.Module):
    def __init__(self, n_classes: int) -> None:
       super(BertClassifier, self).__init__()
       # 加载预训练模型
       self.bert = BertModel.from_pretrained('bert-base-uncased')
       # 过拟合预防
       self.dropout = nn.Dropout(p=0.3)
       # 构建输出层
       self.fc = nn.Linear(self.bert.config.hidden_size, n_classes) 
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask= attention_mask)
        pooled_output = outputs[1] # 获取 [CLS] 输出
        output = self.dropout(pooled_output)
        return self.fc(output)

In [8]:
bert = BertModel.from_pretrained('bert-base-uncased')
print(bert.config)

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertClassifier(n_classes=2).to(device)

from torch.optim import AdamW
from tqdm import tqdm

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to(device)


model.train()
for epoch in range(1000):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0:   1%|          | 16/1878 [00:15<29:34,  1.05it/s, loss=0.112] 


KeyboardInterrupt: 