# 垃圾邮件分类实验笔记本

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# 数据加载与预处理
def load_spam_dataset(path='../data/raw/emails.csv'):
    """Load the e-mail corpus from a CSV file.

    The file must contain a ``text`` column and a ``label`` column.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays: ``X`` holds the ``text`` values
        as Python strings and ``y`` holds the matching ``label`` values.

    Raises:
        KeyError: If the ``text`` or ``label`` column is missing.
    """
    df = pd.read_csv(path)
    # pandas parses empty cells as NaN (a float). Rows without a text or a
    # label cannot be used for training and a NaN is not valid tokenizer
    # input, so discard them here instead of failing later.
    df = df.dropna(subset=['text', 'label'])
    # Guarantee every text entry is a real string before it reaches a tokenizer.
    X = df['text'].astype(str).values
    y = df['label'].values
    return X, y

# Load the corpus from its default location, then hold out 20% of the
# samples for evaluation; the fixed seed keeps the split reproducible.
X, y = load_spam_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 模型实验
class _LSTMSpamClassifier(nn.Module):
    """Embedding -> LSTM -> linear head that emits raw class logits.

    ``nn.LSTM`` returns ``(output, (h_n, c_n))``, so it cannot be chained
    inside ``nn.Sequential``; this module unpacks that tuple explicitly.
    No softmax is applied because ``nn.CrossEntropyLoss`` expects logits.
    """

    def __init__(self, vocab_size, pad_token_id=None, embed_dim=128,
                 hidden_dim=64, num_labels=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask=None, **_ignored):
        """Return logits of shape ``(batch, num_labels)``.

        The signature accepts the keyword arguments produced by
        ``SpamClassificationExperiment.tokenize_data`` so this model can be
        called the same way as the transformer model; extra keys are ignored.
        """
        hidden_states, _ = self.lstm(self.embedding(input_ids))
        if attention_mask is None:
            last_step = hidden_states[:, -1, :]
        else:
            # Read the state at the last non-padded position of each row.
            # NOTE(review): assumes the tokenizer pads on the right - confirm
            # if the tokenizer's padding side is ever changed.
            last_index = (attention_mask.long().sum(dim=1) - 1).clamp(min=0)
            rows = torch.arange(hidden_states.size(0), device=hidden_states.device)
            last_step = hidden_states[rows, last_index]
        return self.classifier(last_step)


class SpamClassificationExperiment:
    """Train and evaluate several spam classifiers on the same tokenization.

    Attributes:
        tokenizer: DistilBERT tokenizer shared by every model.
        models: Mapping from a display name to an untrained/pretrained model.
    """

    def __init__(self, model_name='distilbert-base-uncased'):
        """Load the tokenizer and build the candidate models.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained`` for both
                the tokenizer and the DistilBERT classifier.
        """
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.models = {
            'DistilBERT': DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2),
            'LSTM': self._create_lstm_model(),
        }

    def _create_lstm_model(self):
        """Build the LSTM baseline sized to the shared tokenizer's vocabulary.

        The embedding table must cover every id the tokenizer can emit;
        a smaller table raises an index error on the first out-of-range id.
        """
        return _LSTMSpamClassifier(
            vocab_size=self.tokenizer.vocab_size,
            pad_token_id=self.tokenizer.pad_token_id,
        )

    def tokenize_data(self, texts):
        """Tokenize ``texts`` into padded, truncated PyTorch tensors."""
        # str() normalises NumPy string scalars into plain Python strings.
        return self.tokenizer(
            [str(text) for text in texts],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

    def _batch_logits(self, model, texts):
        """Run ``model`` on one batch of raw texts and return its logits."""
        inputs = self.tokenize_data(texts).to(device)
        outputs = model(**inputs)
        # Transformer models wrap logits in an output object; the LSTM
        # baseline returns the logits tensor directly.
        return outputs.logits if hasattr(outputs, 'logits') else outputs

    def train_model(self, model_name, X_train, y_train, epochs=3, batch_size=16, lr=2e-5):
        """Train one of ``self.models`` with shuffled mini-batches.

        Args:
            model_name: Key into ``self.models``.
            X_train: Sequence of raw text samples.
            y_train: Sequence of integer class labels aligned with ``X_train``.
            epochs: Number of passes over the training data.
            batch_size: Number of samples per optimisation step.
            lr: AdamW learning rate. NOTE(review): the default is a typical
                fine-tuning rate; the from-scratch LSTM may need a larger one.

        Returns:
            The trained model (the same object stored in ``self.models``).

        Raises:
            KeyError: If ``model_name`` is not a known model.
            ValueError: If ``batch_size`` is not positive or the inputs differ
                in length.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        model = self.models[model_name].to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        texts = list(X_train)
        # CrossEntropyLoss requires integer (long) class indices.
        labels = torch.as_tensor(y_train, dtype=torch.long)
        if len(texts) != labels.shape[0]:
            raise ValueError("X_train and y_train must have the same length")

        model.train()  # enable dropout etc.; evaluate_model switches to eval()
        for _ in range(epochs):
            order = torch.randperm(len(texts)).tolist()
            # Mini-batches keep memory bounded; pushing the whole dataset
            # through the model in one step does not scale.
            for start in range(0, len(order), batch_size):
                batch_idx = order[start:start + batch_size]
                batch_texts = [texts[i] for i in batch_idx]
                batch_labels = labels[batch_idx].to(device)

                optimizer.zero_grad()
                logits = self._batch_logits(model, batch_texts)
                loss = criterion(logits, batch_labels)
                loss.backward()
                optimizer.step()

        return model

    def evaluate_model(self, model, X_test, y_test, batch_size=32):
        """Predict labels for ``X_test`` and summarise the results.

        Args:
            model: A model accepting the tokenizer's keyword tensors.
            X_test: Sequence of raw text samples.
            y_test: Sequence of integer class labels aligned with ``X_test``.
            batch_size: Number of samples per forward pass.

        Returns:
            The text report produced by ``classification_report``.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        model = model.to(device)
        model.eval()
        texts = list(X_test)

        batch_predictions = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                logits = self._batch_logits(model, texts[start:start + batch_size])
                batch_predictions.append(torch.argmax(logits, dim=1).cpu())

        if batch_predictions:
            predictions = torch.cat(batch_predictions)
        else:
            predictions = torch.empty(0, dtype=torch.long)
        labels = torch.as_tensor(y_test, dtype=torch.long)

        return classification_report(labels.numpy(), predictions.numpy())

In [None]:
# 模型对比实验
experiment = SpamClassificationExperiment()

# Train each candidate in turn and keep its evaluation report on the
# held-out split, keyed by model name.
results = {}
for model_name in ('DistilBERT', 'LSTM'):
    trained_model = experiment.train_model(model_name, X_train, y_train)
    results[model_name] = experiment.evaluate_model(trained_model, X_test, y_test)

# Print the collected reports in the order the models were evaluated.
for model_name in results:
    report = results[model_name]
    print(f"{model_name} 模型性能报告:\n{report}\n")