#transformer解决灾难推文的自然语言处理
source : https://www.kaggle.com/competitions/nlp-getting-started/data?select=train.csv

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from datasets import Dataset
from transformers import DataCollatorWithPadding,BertTokenizer,BertForSequenceClassification,get_scheduler,DistilBertForSequenceClassification
from torch.utils.data import DataLoader
import pandas as pd
from torch.optim import AdamW
from tqdm.auto import tqdm
import torch.nn.utils.prune as prune
from sklearn.metrics import accuracy_score, f1_score

In [2]:
def preprocess_text_with_features(df):
    """将keyword和location拼接到text中"""
    processed_texts = []
    for idx, row in df.iterrows():
        text = row['text']
        keyword = row['keyword'] if pd.notna(row['keyword']) else ""
        location = row['location'] if pd.notna(row['location']) else ""

        # 拼接格式：[keyword] [location] text
        combined_text = f"{keyword} {location} {text}".strip()
        processed_texts.append(combined_text)

    return processed_texts

def complete_data_processing():
    """数据预处理流程"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 读取数据
    data = pd.read_csv('train.csv').drop('id', axis=1)
    test = pd.read_csv('test.csv')
    test_id = test['id']

    # 数据集拆分
    data_train, data_eval = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

    # 文本预处理
    train_texts = preprocess_text_with_features(data_train)
    eval_texts = preprocess_text_with_features(data_eval)
    test_texts = preprocess_text_with_features(test)

    # BERT分词
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_tokens = tokenizer(train_texts, padding=True, truncation=True, max_length=512)
    eval_tokens = tokenizer(eval_texts, padding=True, truncation=True, max_length=512)
    test_tokens = tokenizer(test_texts, padding=True, truncation=True, max_length=512)

    # 创建Dataset
    train_dataset = Dataset.from_dict({
        'input_ids': train_tokens['input_ids'],
        'attention_mask': train_tokens['attention_mask'],
        'labels': data_train['target'].tolist()
    })

    eval_dataset = Dataset.from_dict({
        'input_ids': eval_tokens['input_ids'],
        'attention_mask': eval_tokens['attention_mask'],
        'labels': data_eval['target'].tolist()
    })

    test_dataset = Dataset.from_dict({
        'input_ids': test_tokens['input_ids'],
        'attention_mask': test_tokens['attention_mask']
    })

    print("数据处理完成！")
    print(f"训练集: {len(train_dataset)} 样本")
    print(f"验证集: {len(eval_dataset)} 样本")
    print(f"测试集: {len(test_dataset)} 样本")

    return train_dataset, eval_dataset, test_dataset, test_id

# 使用示例
if __name__ == '__main__':
    train_dataset, eval_dataset, test_dataset, test_id = complete_data_processing()

    # 检查第一个样本
    print("\n第一个训练样本:")
    print(f"input_ids长度: {len(train_dataset[0]['input_ids'])}")
    print(f"标签: {train_dataset[0]['labels']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

数据处理完成！
训练集: 6090 样本
验证集: 1523 样本
测试集: 3263 样本

第一个训练样本:
input_ids长度: 93
标签: 1


In [3]:
# 批量化处理
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator)
eval_dataloader = DataLoader(eval_dataset, batch_size=32, collate_fn=data_collator)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=data_collator)

print(f"训练批次数: {len(train_dataloader)}")
print(f"验证批次数: {len(eval_dataloader)}")
print(f"测试批次数: {len(test_dataloader)}")

训练批次数: 191
验证批次数: 48
测试批次数: 102


In [4]:
#模型构建
#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2) #蒸馏版


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
#训练和评估逻辑
def train_model(model, train_dataloader, eval_dataloader, epochs=3, lr=5e-5, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = epochs * len(train_dataloader)

    # 添加学习率调度器
    scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0, num_training_steps=num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()  # 添加学习率调度
            optimizer.zero_grad()
            progress_bar.update(1)

        print(f"Epoch {epoch+1}/{epochs} finished.")
        evaluate_model(model, eval_dataloader, device)

def evaluate_model(model, dataloader, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        all_preds.extend(predictions.cpu().numpy())
        if 'labels' in batch:
            all_labels.extend(batch['labels'].cpu().numpy())

    if all_labels:
        accuracy = accuracy_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)
        print(f"Evaluation Accuracy: {accuracy:.4f}")
        print(f"Evaluation F1 Score: {f1:.4f}")
        return all_preds  # 修正：确保返回预测结果
    else:
        print("Evaluation completed without ground truth labels.")
        return all_preds

if __name__ == '__main__':
    print("\nStarting Training...")
    train_model(model, train_dataloader, eval_dataloader, epochs=3)

    print("\nEvaluating on Test Set...")
    test_predictions = evaluate_model(model, test_dataloader)

    # 提交文件
    import pandas as pd
    submission = pd.DataFrame({'id': test_id, 'target': test_predictions})
    submission.to_csv('submission.csv', index=False)
    print("Submission saved!")


Starting Training...


  0%|          | 0/573 [00:00<?, ?it/s]

Epoch 1/3 finished.
Evaluation Accuracy: 0.8477
Evaluation F1 Score: 0.8007
Epoch 2/3 finished.
Evaluation Accuracy: 0.8306
Evaluation F1 Score: 0.7978
Epoch 3/3 finished.
Evaluation Accuracy: 0.8319
Evaluation F1 Score: 0.7932

Evaluating on Test Set...
Evaluation completed without ground truth labels.
Submission saved!


In [10]:
#模型信息获取
def get_model_info(model):
   total_params = sum(p.numel() for p in model.parameters())
   model_size_mb = sum(p.numel() * p.element_size() for p in model.parameters()) / 1024 / 1024

   print(f"参数量: {total_params:,}")
   print(f"模型大小: {model_size_mb:.1f} MB")


get_model_info(model)

参数量: 66,955,010
模型大小: 255.4 MB


In [7]:
# 注意力模型剪枝微调比较麻烦，直接量化
quantized_model = torch.quantization.quantize_dynamic(
    model.cpu(), {torch.nn.Linear}, dtype=torch.qint8
)

def quantize_and_check_accuracy():
    """量化并检查准确率"""

    # 量化模型
    quantized_model = torch.quantization.quantize_dynamic(
        model.cpu(), {torch.nn.Linear}, dtype=torch.qint8
    )

    # 评估量化后准确率
    quantized_model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.cpu() for k, v in batch.items()}
            outputs = quantized_model(**batch)
            predictions = torch.argmax(outputs.logits, dim=-1)

            all_preds.extend(predictions.numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    # 计算准确率
    from sklearn.metrics import accuracy_score, f1_score
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f"量化后准确率: {accuracy:.4f}")
    print(f"量化后F1分数: {f1:.4f}")

    return quantized_model

# 使用
quantized_model = quantize_and_check_accuracy()

量化后准确率: 0.8398
量化后F1分数: 0.8013


In [11]:
get_model_info(quantized_model)

参数量: 23,854,080
模型大小: 91.0 MB


In [8]:
def evaluate_quantized_cpu(model, dataloader):
    """专门评估量化模型"""
    # 确保模型完全在CPU上
    model = model.cpu()
    model.eval()

    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            # 确保所有输入都在CPU上
            batch = {k: v.cpu() for k, v in batch.items()}

            try:
                outputs = model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_preds.extend(predictions.numpy())
            except Exception as e:
                print(f"批次处理出错: {e}")
                continue

    return all_preds

# 使用前确保量化模型完全在CPU
quantized_model = quantized_model.cpu()
for param in quantized_model.parameters():
    param.data = param.data.cpu()

# 再评估
test_predictions = evaluate_quantized_cpu(quantized_model, test_dataloader)

In [9]:
# 提交文件
submission2 = pd.DataFrame({'id': test_id, 'target': test_predictions})
submission2.to_csv('submission2.csv', index=False)
print("Submission saved!")

Submission saved!
