In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import os

# Local-only mode (no network): set the TRANSFORMERS_OFFLINE environment flag
# before any tokenizer/model is loaded in the cells below.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

In [26]:
# Set random seeds for reproducibility.
# Seeds the PyTorch and NumPy global RNGs; Python's built-in `random` module
# is not seeded in this cell.
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# Local checkpoint directory; both loads below pass local_files_only=True.
model_path = 'models/google-bert/bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
# NOTE(review): main() builds its own tokenizer and MultiTaskBert, and the
# `model, accuracy = main()` cell rebinds `model`, so this 3-label classifier
# looks like a leftover from the single-task version — confirm before relying on it.
model = AutoModelForSequenceClassification.from_pretrained(
        model_path, 
        num_labels=3,  # assumes a 3-class task; adjust to the actual task
        local_files_only=True,
        ignore_mismatched_sizes=True  # tolerate a classification-head size mismatch
    )
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




In [None]:
# Load data
def load_data(data_dir='output/2025_up_to_month_2'):
    """Read the train/test feature and label CSVs plus the label mapping.

    Args:
        data_dir: Directory containing ``X_train.csv``, ``y_train.csv``,
            ``X_test.csv``, ``y_test.csv`` and ``label_mapping.csv``.
            Defaults to the directory this notebook was written against.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, label_mapping)`` of
        DataFrames.

    Raises:
        ValueError: If a label frame lacks a required column, or if a
            feature frame and its label frame have different row counts.
    """
    def _read(file_name):
        return pd.read_csv(os.path.join(data_dir, file_name))

    X_train = _read('X_train.csv')
    y_train = _read('y_train.csv')
    X_test = _read('X_test.csv')
    y_test = _read('y_test.csv')
    label_mapping = _read('label_mapping.csv')

    # y_* must contain at least `label` and `linked_items`; an optional
    # `item_title` column is picked up later by the caller when present.
    expected_cols = ['label', 'linked_items']
    for name, frame in (('y_train', y_train), ('y_test', y_test)):
        missing = [col for col in expected_cols if col not in frame.columns]
        if missing:
            raise ValueError(f"{name} 缺少必需列: {', '.join(missing)}")

    # Features and labels are paired by row position downstream, so a length
    # mismatch would silently drop or misalign samples.
    for split, features, targets in (('train', X_train, y_train), ('test', X_test, y_test)):
        if len(features) != len(targets):
            raise ValueError(
                f"X_{split} has {len(features)} rows but y_{split} has {len(targets)} rows"
            )

    return X_train, y_train, X_test, y_test, label_mapping

# Dataset for the original single-task setup (kept for compatibility)
class TextDataset(Dataset):
    """Single-task dataset pairing each text with one integer class label.

    Every sample is tokenized on access and returned as a dict holding
    ``input_ids``, ``attention_mask`` and ``labels`` tensors.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        # Pad/truncate every sample to exactly `max_length` tokens.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer returns a leading batch axis; flatten it away.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# CoT multi-task dataset: input is case_title + performed_work, output is two kinds of labels
class CoTDataset(Dataset):
    """Multi-task dataset: one text input, two integer targets per sample.

    The input text joins the first two columns of ``X_df`` with a literal
    ``' [SEP] '`` separator. Each sample is returned as a dict with
    ``input_ids``, ``attention_mask``, ``labels_linked`` and ``labels_item``.

    Args:
        X_df: DataFrame whose first two columns hold the text fields.
        labels_linked: Sequence of integer ids for the main target.
        labels_item: Sequence of integer ids for the auxiliary target.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Fixed token length every sample is padded/truncated to.
    """

    def __init__(self, X_df, labels_linked, labels_item, tokenizer, max_length=128):
        self.texts = (
            self._as_text(X_df.iloc[:, 0]) + ' [SEP] ' + self._as_text(X_df.iloc[:, 1])
        ).tolist()
        # Materialise the labels as plain lists so that __getitem__ indexes them
        # by position even when a pandas Series with a non-default index is given.
        self.labels_linked = list(labels_linked)
        self.labels_item = list(labels_item)
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _as_text(column):
        """Cast a column to str, mapping missing cells to '' instead of 'nan'."""
        return column.astype(object).where(column.notna(), '').astype(str)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label_linked = int(self.labels_linked[idx])
        label_item = int(self.labels_item[idx])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # The tokenizer returns a leading batch axis; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels_linked': torch.tensor(label_linked, dtype=torch.long),
            'labels_item': torch.tensor(label_item, dtype=torch.long)
        }

# Multi-task BERT: predicts linked_items and item_title at the same time
class MultiTaskBert(nn.Module):
    """Shared encoder with two linear classification heads.

    Both heads read the hidden state at sequence position 0 of the encoder's
    last layer, after dropout.

    Args:
        model_path: Local directory of the pretrained encoder checkpoint.
        num_labels_linked: Number of classes for the main head.
        num_labels_item: Number of classes for the auxiliary head.
        item_loss_weight: Weight of the auxiliary loss in the combined loss.
        dropout: Dropout probability applied before both heads.
    """

    def __init__(self, model_path, num_labels_linked, num_labels_item,
                 item_loss_weight=0.3, dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_path, local_files_only=True)
        hidden = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.classifier_linked = nn.Linear(hidden, num_labels_linked)
        self.classifier_item = nn.Linear(hidden, num_labels_item)
        self.item_loss_weight = item_loss_weight

    def forward(self, input_ids, attention_mask, labels_linked=None, labels_item=None):
        """Run the encoder and both heads.

        Returns:
            Dict with ``logits_linked``, ``logits_item`` and ``loss``.
            ``loss`` is ``None`` unless both label tensors are supplied, in
            which case it is ``CE(linked) + item_loss_weight * CE(item)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs.last_hidden_state[:, 0])
        logits_linked = self.classifier_linked(pooled)
        logits_item = self.classifier_item(pooled)

        loss = None
        if labels_linked is not None and labels_item is not None:
            # Functional form avoids building a new loss module on every call.
            main_loss = nn.functional.cross_entropy(logits_linked, labels_linked)
            aux_loss = nn.functional.cross_entropy(logits_item, labels_item)
            loss = main_loss + self.item_loss_weight * aux_loss
        return {
            'loss': loss,
            'logits_linked': logits_linked,
            'logits_item': logits_item
        }

# Training loop (CoT)
def train_model_cot(model, train_loader, val_loader, optimizer, num_epochs=5, device='cuda'):
    """Train the multi-task model and validate after every epoch.

    Args:
        model: Module returning a dict with ``loss`` and ``logits_linked``.
        train_loader: Iterable of training batches (dicts of tensors).
        val_loader: Iterable of validation batches (dicts of tensors).
        optimizer: Optimizer stepping the model parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(train_losses, val_losses, val_acc_linked)``: per-epoch mean
        training loss, mean validation loss and validation accuracy of the
        ``linked`` head.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels_linked', 'labels_item')

    def _on_device(batch):
        # Select the four tensors the model consumes and move them to `device`.
        return {field: batch[field].to(device) for field in batch_fields}

    train_losses, val_losses, val_acc_linked = [], [], []
    model.train()

    for epoch_idx in range(num_epochs):
        epoch_no = epoch_idx + 1

        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f'Epoch {epoch_no}/{num_epochs}'):
            inputs = _on_device(batch)
            optimizer.zero_grad()
            step_loss = model(**inputs)['loss']
            step_loss.backward()
            optimizer.step()
            running_loss += float(step_loss.item())
        mean_train_loss = running_loss / max(len(train_loader), 1)
        train_losses.append(mean_train_loss)

        # ---- validation pass ----
        model.eval()
        val_loss_sum = 0.0
        predicted, expected = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = _on_device(batch)
                result = model(**inputs)
                val_loss_sum += float(result['loss'].item())
                batch_preds = torch.argmax(result['logits_linked'], dim=1)
                predicted.extend(batch_preds.cpu().numpy())
                expected.extend(inputs['labels_linked'].cpu().numpy())
        mean_val_loss = val_loss_sum / max(len(val_loader), 1)
        val_losses.append(mean_val_loss)

        # An empty validation loader reports 0.0 rather than failing.
        if expected:
            epoch_acc = accuracy_score(expected, predicted)
        else:
            epoch_acc = 0.0
        val_acc_linked.append(epoch_acc)

        print(f'Epoch {epoch_no}: Train {mean_train_loss:.4f} | Val {mean_val_loss:.4f} | ValAcc(linked) {epoch_acc:.4f}')

    return train_losses, val_losses, val_acc_linked

# Test-set evaluation (reports the main linked_items task and the auxiliary item_title task)
def evaluate_model_cot(model, test_loader, device='cuda'):
    """Score both heads of the multi-task model on ``test_loader``.

    Prints the accuracy of each head and returns
    ``(acc_linked, acc_item, preds_linked, labels_linked)``; the two lists
    hold the per-sample predictions and targets of the ``linked`` head.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels_linked', 'labels_item')
    predictions = {'linked': [], 'item': []}
    targets = {'linked': [], 'item': []}

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            inputs = {field: batch[field].to(device) for field in batch_fields}
            result = model(**inputs)
            for task in ('linked', 'item'):
                task_preds = torch.argmax(result[f'logits_{task}'], dim=1)
                predictions[task].extend(task_preds.cpu().numpy())
                targets[task].extend(inputs[f'labels_{task}'].cpu().numpy())

    def _accuracy(task):
        # An empty loader reports 0.0 rather than failing.
        if not targets[task]:
            return 0.0
        return accuracy_score(targets[task], predictions[task])

    acc_linked = _accuracy('linked')
    acc_item = _accuracy('item')
    print(f'Test Accuracy (linked_items): {acc_linked:.4f}')
    print(f'Test Accuracy (item_title):  {acc_item:.4f}')
    return acc_linked, acc_item, predictions['linked'], targets['linked']

In [None]:
# NOTE: main() is defined in a later cell of this notebook, so on a fresh
# kernel that cell has to be executed before this one.
# NOTE(review): the stored output under this cell prints "Number of unique
# labels", which the current main() does not print — it appears to come from
# an earlier version of main(); re-run to refresh it.
model, accuracy = main()
print(f"\nFinal test accuracy: {accuracy:.4f}")

Loading data...
Train data shape: (6244, 3)
Train labels shape: (6244, 1)
Test data shape: (7670, 3)
Test labels shape: (7670, 1)
Number of unique labels: 1161

Sample training data:
                                          case_title  \
0                                           制动系统故障报警   
1                                            中央显示屏黑屏   
2                                   购买远程温控后车机上没有安装软件   
3  [CIC][My Car][Vehicle status][PaCC]使用问题-保养提醒/报...   
4                                               底盘异响   

                                      performed_work    month  
0  1. 车辆显示制动系统报警，客户刚提的新车， 电脑测试无相关故障代码\n2.检查车辆版本也是...  2025-01  
1  1车辆进店检查中央显示屏黑屏2检查车辆未发现加装 改装 刷隐藏3此车在其他店断电 强制重启操...  2025-01  
2    客户12月22号购买远程温控，车机端没有推送软件。我们尝试编程，给车辆充电。依旧没有安装...  2025-01  
3  1.客户表示12月8日APP提示有传动系统检查提示，有更换过燃油滤芯，12.9日到山东临沂通...  2025-01  
4  陪同客户试车确认异响存在车辆在低速转向及倒车转向时有异响存在，\n举升车辆检查发现两前横向摆...  2025-01  

Sample training labels:
   linked_items
0           409
1           647
2          1030
3      

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded local tokenizer and model

Training model...


Epoch 1/5: 100%|██████████| 391/391 [59:21<00:00,  9.11s/it]


In [None]:
def _encode_item_titles(y_train, y_test):
    """Encode the auxiliary ``item_title`` targets as integer class ids.

    Returns:
        ``(train_ids, test_ids, num_classes)``. When ``y_train`` has no
        ``item_title`` column the task degrades to a single dummy class.
    """
    if 'item_title' not in y_train.columns:
        return [0] * len(y_train), [0] * len(y_test), 1

    # fillna must run BEFORE the str cast: once cast, NaN is already the
    # literal text 'nan' and there is nothing left to fill.
    train_titles = y_train['item_title'].fillna('').astype(str)
    test_titles = y_test['item_title'].fillna('').astype(str)

    item_le = LabelEncoder()
    train_ids = item_le.fit_transform(train_titles)

    # One dict lookup per row instead of one LabelEncoder.transform call per row.
    # Titles never seen in training fall back to class 0, as before.
    class_to_id = {title: idx for idx, title in enumerate(item_le.classes_)}
    test_ids = [class_to_id.get(title, 0) for title in test_titles]
    return train_ids, test_ids, len(item_le.classes_)


def _plot_top_confusion(labels_linked, preds_linked, top_k=20):
    """Plot and save the confusion matrix of the ``top_k`` most frequent true classes."""
    true_ids = np.asarray(labels_linked, dtype=np.int64)
    if true_ids.size == 0:
        print("No test samples available; skipping confusion matrix.")
        return
    pred_ids = np.asarray(preds_linked, dtype=np.int64)

    counts = np.bincount(true_ids)
    top_classes = np.argsort(counts)[-top_k:]
    top_classes = top_classes[counts[top_classes] > 0]

    # Passing `labels=` makes the rows/columns correspond to exactly these class
    # ids. Indexing an unrestricted matrix with raw ids is wrong whenever some
    # class ids never occur, because its axes are the sorted ids actually present.
    cm_top = confusion_matrix(true_ids, pred_ids, labels=top_classes)

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_top, annot=True, fmt='d', cmap='Blues',
                xticklabels=top_classes, yticklabels=top_classes)
    plt.title('Confusion Matrix (Top 20 Classes, linked_items)')
    plt.xlabel('Predicted'); plt.ylabel('True')
    plt.savefig('bert_confusion_matrix_cot.png'); plt.show()


def main():
    """Train and evaluate the multi-task (CoT) BERT classifier end to end.

    Loads the CSV splits, builds the datasets and model, trains for a fixed
    number of epochs, plots the loss/accuracy curves, evaluates on the test
    split, saves the encoder, tokenizer and classification heads, and plots
    a confusion matrix for the most frequent classes.

    Returns:
        ``(model, acc_linked)``: the trained model and the test accuracy of
        the ``linked_items`` head.

    Raises:
        FileNotFoundError: If the local pretrained model directory is missing.
        ValueError: If a label id does not fit the size of the label mapping.
    """
    # Load data
    print("Loading data...")
    X_train, y_train, X_test, y_test, label_mapping = load_data()

    print(f"Train data shape: {X_train.shape}")
    print(f"Train labels shape: {y_train.shape}")
    print(f"Test data shape: {X_test.shape}")
    print(f"Test labels shape: {y_test.shape}")
    print(f"Number of unique linked_items: {len(label_mapping)}")
    if 'item_title' in y_train.columns:
        print(f"Number of unique item_title (train): {y_train['item_title'].nunique()}")

    # Main-task labels
    train_labels_linked = y_train['label'].astype(int).tolist()
    test_labels_linked = y_test['label'].astype(int).tolist()

    # The main head has one output per row of the label mapping; an id outside
    # that range would otherwise surface as an opaque failure inside the loss.
    num_linked_classes = len(label_mapping)
    max_label = max(train_labels_linked + test_labels_linked, default=-1)
    if max_label >= num_linked_classes:
        raise ValueError(
            f"label id {max_label} is out of range for {num_linked_classes} linked_items classes"
        )

    # Auxiliary-task labels
    y_train_item_ids, y_test_item_ids, num_item_classes = _encode_item_titles(y_train, y_test)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load the local tokenizer and build the multi-task model
    local_model_path = './models/google-bert/bert-base-chinese'
    if not os.path.isdir(local_model_path):
        raise FileNotFoundError(f"未找到本地模型目录: {local_model_path}")
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, local_files_only=True)
    model = MultiTaskBert(local_model_path, num_labels_linked=num_linked_classes, num_labels_item=num_item_classes)
    model = model.to(device)

    # Datasets & loaders (CoTDataset composes case_title + performed_work itself)
    batch_size = 16
    max_length = 128
    train_dataset = CoTDataset(X_train, train_labels_linked, y_train_item_ids, tokenizer, max_length)
    test_dataset = CoTDataset(X_test, test_labels_linked, y_test_item_ids, tokenizer, max_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Train
    # NOTE: the test split doubles as the validation split here, so the
    # per-epoch "validation" numbers are measured on the final test data.
    print("\nTraining model (CoT)...")
    num_epochs = 5
    train_losses, val_losses, val_accuracies = train_model_cot(
        model, train_loader, test_loader, optimizer, num_epochs=num_epochs, device=device
    )

    # Plot curves
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.title('Training/Validation Loss'); plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(val_accuracies, label='Val Acc (linked)')
    plt.xlabel('Epoch'); plt.ylabel('Accuracy'); plt.title('Validation Accuracy (linked)'); plt.legend()
    plt.tight_layout(); plt.savefig('bert_training_history_cot.png'); plt.show()

    # Evaluate
    print("\nEvaluating on test set...")
    acc_linked, acc_item, preds_linked, labels_linked = evaluate_model_cot(model, test_loader, device=device)

    # Save encoder, tokenizer and both classification heads. Saving stays
    # best-effort (a failure must not discard the trained model), but the
    # failure is reported instead of being silently swallowed.
    save_path = 'bert_chinese_classifier_local_cot'
    os.makedirs(save_path, exist_ok=True)
    try:
        model.bert.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        torch.save(
            {
                'classifier_linked': model.classifier_linked.state_dict(),
                'classifier_item': model.classifier_item.state_dict(),
            },
            os.path.join(save_path, 'classification_heads.pt'),
        )
    except Exception as exc:
        print(f"\nWARNING: could not save model to '{save_path}': {exc}")
    else:
        print(f"\nModel backbone and tokenizer saved to '{save_path}'")

    # Confusion matrix for linked_items (top 20 classes)
    _plot_top_confusion(labels_linked, preds_linked, top_k=20)

    print(f"Test Accuracy (linked_items): {acc_linked:.4f}")
    print(f"Test Accuracy (item_title):  {acc_item:.4f}")

    return model, acc_linked
