In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the MBTI 500 dataset. Everything below relies on a 'type' column
# holding the MBTI type string of each row.
df = pd.read_csv('/kaggle/input/mbti-personality-types-500-dataset/MBTI 500.csv')

# Drop rows with a missing type before any counting.
df = df.dropna(subset=['type'])

# Number of rows per MBTI type.
mbti_counts = df['type'].value_counts()

# Count how often each letter occurs at each position of the type string.
# Every category keeps an entry for all eight letters (zero when absent) so
# that df_counts has the same 8 x 4 layout as before.
categories = ['I/E', 'N/S', 'T/F', 'J/P']
mbti_letters = ['I', 'E', 'N', 'S', 'T', 'F', 'J', 'P']

counts = {}
for position, category in enumerate(categories):
    # Vectorised replacement for a per-row Python loop.
    letter_counts = df['type'].str[position].value_counts()
    unexpected = letter_counts.index.difference(mbti_letters)
    if len(unexpected) > 0:
        # Fail loudly instead of silently dropping malformed type strings.
        raise KeyError(
            f"unexpected letter(s) {list(unexpected)} at position {position} of 'type'"
        )
    counts[category] = (
        letter_counts.reindex(mbti_letters, fill_value=0).astype(int).to_dict()
    )

# Tabular form of the counts, ready for plotting.
df_counts = pd.DataFrame(counts)

# Rename the label column. df_origin stays bound to the same (renamed) frame
# as df, so both expose 'type_mbti'.
df = df.rename(columns={'type': 'type_mbti'})
df_origin = df

# Series of MBTI type strings.
mbti_type = df['type_mbti']

# Integer id (1-16) for each of the sixteen MBTI types.
label_map_all = {
    'ISTJ': 1, 'ISFJ': 2, 'INFJ': 3, 'INTJ': 4,
    'ISTP': 5, 'ISFP': 6, 'INFP': 7, 'INTP': 8,
    'ESTP': 9, 'ESFP': 10, 'ENFP': 11, 'ENTP': 12,
    'ESTJ': 13, 'ESFJ': 14, 'ENFJ': 15, 'ENTJ': 16
}

# Binary label per dichotomy, derived from the letter at the matching position
# of the type string so that no hand-typed entry can disagree with its key
# (the hand-written S/N table previously had 'INFP': 2).
# Encoding: I=1 / E=0, S=1 / N=0, T=1 / F=0, J=1 / P=0.
label_map_ie = {mbti: int(mbti[0] == 'I') for mbti in label_map_all}
label_map_sn = {mbti: int(mbti[1] == 'S') for mbti in label_map_all}
label_map_tf = {mbti: int(mbti[2] == 'T') for mbti in label_map_all}
label_map_jp = {mbti: int(mbti[3] == 'J') for mbti in label_map_all}

import pandas as pd

# Start from a copy of df so the label columns are added to a separate frame.
new_df = df.copy()

# Map every MBTI type string to its numeric label, one new column per mapping.
mbti_source = df['type_mbti']
label_columns = {
    'mbti_all': label_map_all,
    'mbti_ie': label_map_ie,
    'mbti_sn': label_map_sn,
    'mbti_tf': label_map_tf,
    'mbti_jp': label_map_jp,
}
for column_name, mapping in label_columns.items():
    new_df[column_name] = mbti_source.map(mapping)

# Remove the original string column now that the numeric labels exist.
new_df = new_df.drop(columns=['type_mbti'])

print(new_df)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
import torch.nn as nn
import numpy as np
from tqdm import tqdm  # 导入 tqdm 用于进度条
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
! pip install hf_xet

In [None]:
# Work on the first 10,000 rows of the labelled frame only.
data = new_df.iloc[:10000]

# Hold out 20% of that subset for evaluation, using a fixed seed.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Dataset wrapper around the labelled DataFrame
class MBTIDataset(Dataset):
    """Torch dataset yielding tokenised posts and their four binary labels.

    Args:
        data: DataFrame with a 'posts' text column and the label columns
            listed in ``LABEL_COLUMNS``.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            keyword arguments ``add_special_tokens``, ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` and must return
            a mapping with 'input_ids' and 'attention_mask' tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    # Column order fixes the order of the values in the 'labels' tensor.
    LABEL_COLUMNS = ('mbti_ie', 'mbti_sn', 'mbti_tf', 'mbti_jp')

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text and float label vector for row ``idx``.

        ``idx`` is positional (``iloc``), so the frame's index labels do not
        need to be contiguous.
        """
        row = self.data.iloc[idx]

        # Call the tokenizer directly; `encode_plus` is the deprecated
        # spelling of the same operation.
        encoding = self.tokenizer(
            row['posts'],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        labels = [row[column] for column in self.LABEL_COLUMNS]

        return {
            # The tokenizer returns a leading batch axis; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float),
        }

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits in datasets that pad / truncate to max_len tokens.
max_len = 512
train_dataset, test_dataset = (
    MBTIDataset(split, tokenizer, max_len) for split in (train_data, test_data)
)

# Only the training loader shuffles; both use batches of 16.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [None]:
# Model definition
class MultiTaskBert(torch.nn.Module):
    """BERT encoder with four independent single-logit heads.

    ``forward`` returns one raw logit per head, concatenated along the last
    axis, i.e. a ``[batch_size, 4]`` tensor. No sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Enable gradient checkpointing on the encoder.
        self.bert.gradient_checkpointing_enable()
        self.dropout = torch.nn.Dropout(0.1)
        # One Linear(768 -> 1) head per output.
        self.classifiers = torch.nn.ModuleList([
            torch.nn.Linear(768, 1) for _ in range(4)
        ])

    def forward(self, input_ids, attention_mask):
        """Return the concatenated head logits for a batch.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Mask tensor passed to the BERT encoder.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # [batch_size, 768]
        pooled_output = self.dropout(pooled_output)
        # Each head yields [batch_size, 1]; concatenating gives [batch_size, 4].
        logits = torch.cat(
            [classifier(pooled_output) for classifier in self.classifiers],
            dim=-1,
        )
        return logits

# NOTE: the model is instantiated in the following cell, together with device
# placement; building it here as well loaded the BERT weights twice.

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the model; wrap it for data parallelism when several GPUs are visible.
model = MultiTaskBert()
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"使用 {gpu_count} 块 GPU")
    model = nn.DataParallel(model)
model = model.to(device)

# AdamW over all model parameters.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer, device):
    """Run one training pass over ``data_loader`` and return the mean loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning logits with the same shape as the batch labels.
        data_loader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        The sample-weighted mean BCE-with-logits loss over the epoch as a
        float, or ``nan`` when the loader yields no samples.
    """
    model.train()
    # The loss module is stateless, so build it once rather than per batch.
    loss_fn = torch.nn.BCEWithLogitsLoss()
    total_loss = 0.0
    total_samples = 0

    # tqdm wraps the loader to show a progress bar.
    for batch in tqdm(data_loader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        return float('nan')
    return total_loss / total_samples

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and print per-dimension accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device every batch tensor is moved to.

    Returns:
        A tuple ``(preds, final_labels, accuracies)``: the boolean prediction
        array (sigmoid of the logits thresholded at 0.5), the stacked label
        array, and a list with one accuracy per output column.
    """
    model.eval()
    logit_chunks, label_chunks = [], []

    # Gradients are not needed for evaluation; tqdm shows a progress bar.
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, mask)
            logit_chunks.append(logits.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    # Stack the per-batch arrays along the sample axis.
    final_outputs = np.concatenate(logit_chunks, axis=0)
    final_labels = np.concatenate(label_chunks, axis=0)

    # Turn logits into binary predictions: probability above 0.5 means True.
    probabilities = torch.sigmoid(torch.tensor(final_outputs))
    preds = (probabilities > 0.5).numpy()  # [num_samples, 4]

    # Accuracy of each of the four output columns, printed as it is computed.
    accuracies = []
    for dim, name in enumerate(['E/I', 'N/S', 'T/F', 'J/P']):
        acc = accuracy_score(final_labels[:, dim], preds[:, dim])
        accuracies.append(acc)
        print(f"{name} 准确率: {acc:.4f}")

    return preds, final_labels, accuracies

# Train for a fixed number of epochs, evaluating on the test loader after each.
num_epochs = 3
for epoch in range(num_epochs):
    epoch_number = epoch + 1
    print(f'Epoch {epoch_number}/{num_epochs}')
    train_epoch(model, train_loader, optimizer, device)
    preds, final_labels, accuracies = eval_model(model, test_loader, device)
    print(f"第 {epoch_number} 轮评估完成，准确率: {accuracies}")