In [1]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
import tqdm
from tqdm.notebook import tqdm
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, AutoModelForTokenClassification, DataCollatorForTokenClassification
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
from tqdm import tqdm
from keras_preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, TensorDataset
from pytorch_pretrained_bert import BertModel
import torch
from torch import nn
from torch.optim import Adam


2024-06-22 14:56:28.064048: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-22 14:56:28.103616: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# disable progress bars when submitting
def is_interactive():
   return 'SHLVL' not in os.environ

if not is_interactive():
    def nop(it, *a, **k):
        return it

    tqdm = nop

In [3]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [4]:
# 预处理函数
def preprocess(data):
    '''
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data


In [8]:
# 读取数据
df = pd.read_csv('../dataset/train-tagged.csv')

# 过滤 identity_annotator_count 大于 0 的行
df_filtered = df[df['identity_annotator_count'] > 0]

# 提取需要的列
texts = df_filtered['comment_text']
labels = df_filtered[['target', 'male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability']].values
labels = labels * 10

texts = preprocess(texts)

# 使用 Tokenizer 进行 tokenization 和序列化
tokenizer = BertTokenizer.from_pretrained('/root/autodl-tmp/bert-base-uncased')

# 将文本转换为token序列
sequences = [tokenizer.encode(text, add_special_tokens=True) for text in tqdm(texts, desc="Tokenizing")]

print("Tokenization completed.")


Tokenization completed.


In [10]:
from keras_preprocessing.sequence import pad_sequences
# 设置序列的最大长度
maxlen = 220
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# 将数据转换为 Tensor
text_tensor = torch.tensor(padded_sequences, dtype=torch.long, device='cuda:0')
label_tensor = torch.tensor(labels, dtype=torch.float, device='cuda:0')

# 打印 Tensor 的形状以确认
print(text_tensor.shape)
print(label_tensor.shape)


torch.Size([405130, 220])
torch.Size([405130, 25])


In [15]:
print(text_tensor[1])
print(label_tensor[1])

tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [16]:

dataset = TensorDataset(text_tensor, label_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [6]:
class BertFineTuner(nn.Module):
    def __init__(self, bert_model):
        super(BertFineTuner, self).__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(bert_model.config.hidden_size, 25)  # 输出一个标量

    def forward(self, input_ids, attention_mask=None):
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask)
        return self.classifier(pooled_output)

bert_model = BertModel.from_pretrained('/root/autodl-tmp/bert-base-uncased')
# 实例化模型
model = BertFineTuner(bert_model)
model = model.cuda()

# 定义优化器
optimizer = Adam(model.parameters(), lr=2e-5)
loss_fn = nn.MSELoss()


In [20]:
for epoch in range(5):  # 假设训练 3 个 epoch
    epoch_loss = 0
    num_batches = len(dataloader)
    progress_bar = tqdm(dataloader, desc=f"Training Epoch {epoch+1}")
    
    for batch in progress_bar:
        batch_inputs, batch_labels = batch
        batch_inputs = batch_inputs.cuda()
        batch_labels = batch_labels.cuda()

        optimizer.zero_grad()

        outputs = model(batch_inputs)
        loss = loss_fn(outputs, batch_labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
        
    avg_loss = epoch_loss / num_batches
    print(f'Epoch {epoch+1}, Average Loss: {avg_loss}')


Training Epoch 1: 100%|██████████| 12661/12661 [38:14<00:00,  5.52it/s, loss=0.141] 


Epoch 1, Average Loss: 0.28955142467922157


Training Epoch 2: 100%|██████████| 12661/12661 [38:13<00:00,  5.52it/s, loss=0.231] 


Epoch 2, Average Loss: 0.2093714432223446


Training Epoch 3: 100%|██████████| 12661/12661 [38:14<00:00,  5.52it/s, loss=0.0558]


Epoch 3, Average Loss: 0.18723126015418534


Training Epoch 4: 100%|██████████| 12661/12661 [38:14<00:00,  5.52it/s, loss=0.0927]


Epoch 4, Average Loss: 0.16501742694725519


Training Epoch 5: 100%|██████████| 12661/12661 [38:18<00:00,  5.51it/s, loss=0.09]  

Epoch 5, Average Loss: 0.14336694270636025





In [21]:
# 保存微调后的模型
torch.save(model.state_dict(), "/root/autodl-tmp/code/models/finetuned_bert_on_identity.pth")

In [7]:
# 加载训练好的模型
model = BertFineTuner(bert_model)
model.load_state_dict(torch.load('/root/autodl-tmp/code/models/finetuned_bert_on_identity.pth'))
model.eval()
model.cuda()

BertFineTuner(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
           

In [8]:
# 读取 CSV 文件
readdata = pd.read_csv('/root/autodl-tmp/dataset/train-tagged.csv')

# 过滤出 identity_annotator_count 列值为 0 的数据
data_to_predict = readdata[readdata['identity_annotator_count'] == 0]


In [None]:
# print(readdata.head(10))

In [11]:
dt = data_to_predict['comment_text']
dt = preprocess(dt)
tokenizer = BertTokenizer.from_pretrained('/root/autodl-tmp/bert-base-uncased')
to_predict = [tokenizer.encode(text, add_special_tokens=True) for text in tqdm(dt, desc="Tokenizing")]
print("Tokenization completed.")


Tokenization completed.


In [12]:
padded_to_predict = pad_sequences(to_predict, maxlen=220)

# 将数据转换为 Tensor
topredict_tensor = torch.tensor(padded_to_predict, dtype=torch.long, device='cuda:0')

# 打印 Tensor 的形状以确认
print(topredict_tensor.shape)


torch.Size([1399744, 220])


In [13]:
dataset = TensorDataset(topredict_tensor)
dataloader = DataLoader(dataset, batch_size=32)

# 预测函数
all_predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="Predicting"):
        batch_inputs = batch[0]
        outputs = model(batch_inputs)
        predictions = outputs.cpu().numpy()
        all_predictions.extend(predictions)


In [14]:
all_predictions = np.array(all_predictions)
all_predictions = all_predictions / 10 # 为了对应先前的x
predictions_df = pd.DataFrame(all_predictions, columns=['label1', 'male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability'])

# 丢弃第一个标签，也就是预测的target值
predictions_df = predictions_df.drop(columns=['label1'])

# 将预测结果写回原数据
readdata.loc[readdata['identity_annotator_count'] == 0, ['male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability']] = predictions_df.values

# 保存更新后的 CSV 文件
readdata.to_csv('/root/autodl-tmp/dataset/train-tagged.csv', index=False)


In [15]:
process_data = pd.read_csv('/root/autodl-tmp/dataset/train-tagged.csv')


In [16]:
# 将值小于0的替换为0，大于1替换为1
columns_to_replace = ['male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity', 'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability']
process_data[columns_to_replace] = process_data[columns_to_replace] 
process_data[columns_to_replace] = process_data[columns_to_replace].apply(lambda x: x.clip(0, 1))



In [17]:
process_data.to_csv('/root/autodl-tmp/dataset/train-tagged.csv', index=False)


In [18]:
# 查看一下预测好的值
print(process_data.head(10))

       id    target                                       comment_text  \
0   59848  0.000000  This is so cool. It's like, 'would you want yo...   
1   59849  0.000000  Thank you!! This would make my life a lot less...   
2   59852  0.000000  This is such an urgent design problem; kudos t...   
3   59855  0.000000  Is this something I'll be able to install on m...   
4   59856  0.893617               haha you guys are a bunch of losers.   
5   59859  0.666667                               ur a sh*tty comment.   
6   59861  0.457627                        hahahahahahahahhha suck it.   
7   59863  0.000000                                FFFFUUUUUUUUUUUUUUU   
8  239575  0.000000  The ranchers seem motivated by mostly by greed...   
9  239576  0.000000  It was a great show. Not a combo I'd of expect...   

   severe_toxicity   obscene  identity_attack    insult    threat     asian  \
0         0.001123  0.001111         0.000781  0.015544  0.001114  0.001913   
1         0.001383  0.00171