In [1]:
import os 
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# 数据准备

In [2]:
train = pd.read_csv(Path('data').joinpath('train.csv'))
train = train[['Prompt', 'Answer', 'Target']]
# 统计各特征为null的数量
print(train.isnull().sum())
# 填充nan值
train.loc[train['Answer'].isna(), 'Answer'] = 'NAN'

Prompt     0
Answer    19
Target     0
dtype: int64


In [3]:
text_columns = ['Prompt', 'Answer']

train['Text'] = train[text_columns].agg(' '.join, axis=1)
train['len'] = train['Text'].apply(lambda x: len(x))

train.head()

Unnamed: 0,Prompt,Answer,Target,Text,len
0,[INST] You are an AI assistant that helps peop...,Step-by-step reasoning process:\n1. Randy spen...,0,[INST] You are an AI assistant that helps peop...,1043
1,[INST] You are an AI assistant. You will be gi...,What is the temperature at which hypothermia b...,0,[INST] You are an AI assistant. You will be gi...,2285
2,[INST] You are an AI assistant. You will be gi...,Answer: c) No. \n\nThe hypothesis is false bec...,0,[INST] You are an AI assistant. You will be gi...,1060
3,[INST] You are an AI assistant. User will you ...,Prismatoid,0,[INST] You are an AI assistant. User will you ...,3519
4,[INST] You are an AI assistant. User will you ...,Case B,0,[INST] You are an AI assistant. User will you ...,1449


In [6]:
# 加载bert tokenize模型
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.system('export HF_ENDPOINT')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 5000

# bert使用示例
encoding = tokenizer.encode_plus(
    train['Text'][0],
    add_special_tokens=True,
    max_length=max_len,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
print(encoding['input_ids'].flatten())
print(type(tokenizer))

tensor([  101,  1031, 16021,  ...,     0,     0,     0])
<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>


# 创建实验需要用的dataset

In [7]:
from typing import List, Dict, Any


class QADataset(Dataset):
    def __init__(self, texts: List, labels: List, tokenizer: Any, max_len: int) -> None:
        super().__init__()
        self.texts = texts
        self.lables = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx) -> Dict:
        text = self.texts[idx]
        label = self.lables[idx]
        
        encoding = self.tokenizer.encode_plus(
            text, 
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label).type(torch.LongTensor)
        }

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    train['Text'].tolist(),
    train['Target'].tolist(),
    test_size=0.1,
    random_state=42
)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512
train_dataset = QADataset(X_train, y_train, tokenizer, max_len)
test_dataset = QADataset(X_test, y_test, tokenizer, max_len)

# dataset使用示例
train_dataset[0]['input_ids']

tensor([  101,  1031, 16021,  2102,  1033,  2017,  2024,  2019,  9932,  3353,
         1012,  5310,  2097,  2017,  2507,  2017,  1037,  4708,  1012,  2115,
         3125,  2003,  2000,  3143,  1996,  4708,  2004, 11633,  2135,  2004,
         2017,  2064,  1012,  2096,  4488,  1996,  4708,  2228,  3357,  1011,
         2011,  1011,  3357,  1998, 16114,  2115,  4084,  1012,  1031,  1013,
        16021,  2102,  1033,  3335,  5143, 11586,  2098,  2041,  1010,  1998,
         6871,  2000,  6814,  2008,  2016,  2018,  2053,  2051,  2000,  2228,
         2127,  2016,  2001,  2006,  2014,  2126,  2188,  2247,  1996,  4064,
         2346,  2461, 14855, 22426,  2012,  5408,  1051,  1005,  5119,  2008,
         2305,  1012,  2059,  2009,  7537,  2014,  2000,  2131,  2041,  1997,
         2014,  6174, 17980,  1998,  3328,  1012,  2045,  2001,  2019,  6728,
        27581,  4231,  1010,  1996,  6565,  4564, 21439,  2091,  2000,  1996,
         8575,  2020,  2035,  4462,  1998, 21666,  1010,  1998, 

# 构建模型

In [9]:
# 创建基于Bert的二分类神经网络

class BertClassifier(nn.Module):
    def __init__(self, n_classes: int) -> None:
       super(BertClassifier, self).__init__()
       # 加载预训练模型
       self.bert = BertModel.from_pretrained('bert-base-uncased')
       # 过拟合预防
       self.dropout = nn.Dropout(p=0.3)
       # 构建输出层
       self.fc = nn.Linear(self.bert.config.hidden_size, n_classes) 
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask= attention_mask)
        pooled_output = outputs[1] # 获取 [CLS] 输出
        output = self.dropout(pooled_output)
        return self.fc(output)

In [None]:
bert = BertModel.from_pretrained('bert-base-uncased')
print(bert.config)

# 训练模型

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertClassifier(n_classes=2).to(device)

from torch.optim import AdamW
from tqdm import tqdm

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to(device)


model.train()
for epoch in range(5):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 1878/1878 [06:21<00:00,  4.93it/s, loss=0.0166] 
Epoch 1: 100%|██████████| 1878/1878 [06:20<00:00,  4.94it/s, loss=0.0511] 
Epoch 2: 100%|██████████| 1878/1878 [06:20<00:00,  4.94it/s, loss=0.288]  
Epoch 3: 100%|██████████| 1878/1878 [06:22<00:00,  4.91it/s, loss=1.35]    
Epoch 4: 100%|██████████| 1878/1878 [06:29<00:00,  4.82it/s, loss=0.0973]  


# 测试模型

In [36]:
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask)
        outputs = F.softmax(outputs, dim=1)
        pred, _ = torch.max(outputs, dim=1)
        predictions.extend(pred.cpu().numpy())
        
print(predictions)

[0.9997974, 0.99938166, 0.9998555, 0.9991647, 0.9101605, 0.99981874, 0.9998423, 0.9997664, 0.9998012, 0.9998374, 0.99980694, 0.99972016, 0.9998449, 0.9998343, 0.99985874, 0.9998142, 0.9998379, 0.99867404, 0.98295987, 0.9992642, 0.9998449, 0.90840936, 0.9998523, 0.9995541, 0.99983954, 0.8629661, 0.99985003, 0.99983656, 0.99985516, 0.9918059, 0.95497334, 0.8323113, 0.80639964, 0.99984586, 0.99980396, 0.82354903, 0.99979514, 0.99983704, 0.9998504, 0.9998266, 0.9998265, 0.8382818, 0.9941672, 0.9998035, 0.9998235, 0.9997876, 0.99979824, 0.9998368, 0.99960154, 0.7867989, 0.99984777, 0.99977964, 0.9998142, 0.99984324, 0.9918542, 0.9998254, 0.99981505, 0.99985325, 0.99984694, 0.99984896, 0.82277787, 0.9996106, 0.99975806, 0.99982977, 0.99977034, 0.9997874, 0.9998547, 0.9998423, 0.9994511, 0.99983394, 0.9998548, 0.9998435, 0.99969995, 0.99961, 0.9998367, 0.9997918, 0.99983716, 0.99984217, 0.98212445, 0.9983127, 0.99983513, 0.9996081, 0.9997135, 0.9998241, 0.9998323, 0.9998516, 0.9998066, 0.9992

In [None]:
from sklearn.metrics import classification_report

print("Classification Report:\n", classification_report(y_test, predictions))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# 测试赛题

In [33]:
test = pd.read_csv(Path('data').joinpath('test.csv'))

test.loc[test['Answer'].isna(), 'Answer'] = 'NAN'

text = ['Prompt', 'Answer']

test['Text'] = test[text_columns].agg(' '.join, axis=1)

In [35]:
X_test = test['Text'].tolist()
test_id = test['Id']
print(len(X_test))

test_dataset = QADataset(X_test, y_test, tokenizer, max_len=512)

11125


In [37]:
predictions = [1 - pred for pred in predictions]
ans_df = pd.concat([pd.Series(test_id), pd.Series(predictions)], axis=1).rename(columns={0: 'Target'})
print(ans_df)
ans_df.to_csv('dl_predict.csv', index=False)

          Id    Target
0      20568  0.000203
1      17686  0.000618
2      13035  0.000144
3      22646  0.000835
4       5535  0.089840
...      ...       ...
11120  12033  0.095689
11121   8685  0.000180
11122  25654  0.001046
11123   9039  0.000826
11124  26413  0.352951

[11125 rows x 2 columns]
