In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import csv
from tqdm import tqdm
import numpy as np


def getData(inputFile):
    """Read a three-column CSV file and return its labels and texts.

    The first line is treated as a header and skipped. Every remaining row
    must have exactly three fields; the first is ignored, the second is the
    text and the third is an integer label. Rows whose text is empty or
    whitespace-only are dropped, as are completely blank lines.

    Args:
        inputFile: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A ``(labels, contents)`` tuple of two equally long lists: ``labels``
        holds the labels as ``int`` and ``contents`` the matching texts.
        Both lists are empty when the file has no data rows.

    Raises:
        ValueError: If a non-blank row does not have exactly three fields or
            its label cannot be converted with ``int()``.
    """
    contents = []
    labels = []
    with open(inputFile, newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(csv_reader, None)
        for row in csv_reader:
            # csv.reader yields [] for a blank line (e.g. trailing newlines),
            # which would otherwise break the three-way unpack below.
            if not row:
                continue
            _, content, label = row
            if len(content.strip()) == 0:
                continue
            contents.append(content)
            labels.append(int(label))
    return labels, contents

# Dataset class
class SpamDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction.

    Args:
        texts: Texts handed to ``tokenizer`` in a single call.
        labels: Per-text labels; ``len(labels)`` defines the dataset length.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding='max_length',
            max_length=max_len)``; its result must expose ``.items()``
            yielding ``(field_name, per_sample_sequence)`` pairs.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict carries one tensor per tokenizer field, followed by the
        sample's label under the ``"labels"`` key.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Preparing the data
SEED = 42
# Seed torch's global RNG so the DataLoader shuffle order and the randomly
# initialised classifier head are repeatable across kernel restarts.
# (Some GPU kernels may still be nondeterministic.)
torch.manual_seed(SEED)

data_path = './DataSet_shuffle.csv'
total_labels, total_contents = getData(data_path)


# Hold out 10% of the samples for validation; the split is fixed by SEED.
train_texts, val_texts, train_labels, val_labels = train_test_split(total_contents, total_labels, test_size=0.1, random_state=SEED)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
val_dataset = SpamDataset(val_texts, val_labels, tokenizer)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only machine just triggers a DataLoader warning.
use_pinned_memory = device.type == "cuda"
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=use_pinned_memory)


# Pretrained BERT encoder with a fresh two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)


optimizer = AdamW(model.parameters(), lr=2e-5)

# Training function
def train(model, dataloader, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes
            ``.loss``.
        dataloader: Iterable of dict batches mapping names to tensors; must
            support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty.
    """
    model.train()

    # Follow the model's own device instead of a notebook-global variable
    # that another cell may have rebound.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Drop gradients left behind by an interrupted or earlier run so they
    # cannot leak into the first update of this epoch.
    optimizer.zero_grad()

    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(target_device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate(model, dataloader):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes
            ``.loss`` and ``.logits``.
        dataloader: Iterable of dict batches mapping names to tensors, each
            containing a ``"labels"`` entry; must support ``len()``.

    Returns:
        An ``(accuracy, mean_loss)`` tuple: accuracy of the arg-max
        predictions against the labels, and the sum of batch losses divided
        by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` is empty and the accuracy
            computation did not already fail on the empty inputs.
    """
    model.eval()

    # Follow the model's own device instead of a notebook-global variable
    # that another cell may have rebound.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds, all_labels = [], []
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(target_device) for k, v in batch.items()}
            outputs = model(**batch)
            # The highest logit along the last axis is the predicted class.
            preds = torch.argmax(outputs.logits, dim=-1).cpu().tolist()
            labels = batch["labels"].cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels)
            total_loss += outputs.loss.item()
    acc = accuracy_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)
    return acc, avg_loss


# Fine-tuning: a fixed number of passes over the training set, each followed
# by a validation pass whose metrics are printed.
EPOCHS = 3
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_loss = train(model, train_loader, optimizer)
    val_metrics = evaluate(model, val_loader)  # (accuracy, mean loss)
    val_acc, val_loss = val_metrics
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

# Persist only the parameters (state dict) as they stand after the last epoch.
trained_weights = model.state_dict()
torch.save(trained_weights, "bert_spam_classifier.pth")
print("Model saved successfully!")




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 931/931 [05:06<00:00,  3.04it/s]
Evaluating: 100%|██████████| 52/52 [00:10<00:00,  4.87it/s]


Train Loss: 0.0735, Validation Loss: 0.0287, Validation Accuracy: 0.9903

Epoch 2/3


Training: 100%|██████████| 931/931 [05:16<00:00,  2.94it/s]
Evaluating: 100%|██████████| 52/52 [00:10<00:00,  4.83it/s]


Train Loss: 0.0164, Validation Loss: 0.0301, Validation Accuracy: 0.9909

Epoch 3/3


Training: 100%|██████████| 931/931 [05:18<00:00,  2.92it/s]
Evaluating: 100%|██████████| 52/52 [00:10<00:00,  4.75it/s]


Train Loss: 0.0076, Validation Loss: 0.0276, Validation Accuracy: 0.9918
Model saved successfully!


In [8]:
import time

# 1. Loading model structure and weights
device = torch.device('cpu')  # this cell runs inference on the CPU

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the parameters (weights) saved during training. map_location lets the
# checkpoint load on the CPU even if training ran on a GPU, which avoids load
# errors on machines without one. It only controls where the loaded weights
# are mapped; it does not decide which device the model later runs its
# forward pass (inference) on.
saved_weights = torch.load("bert_spam_classifier.pth", map_location=device)
model.load_state_dict(saved_weights)
model.to(device)
model.eval()  # Switch the model to evaluation mode

# 2. Defining the test function
def predict(text, model, tokenizer, max_len=64):
    """Classify one text and return the winning class with its probability.

    Also prints the softmax probabilities and the elapsed time of the
    forward pass.

    Args:
        text: Input handed to ``tokenizer``. It must produce exactly one
            row of logits, because the prediction is read with ``.item()``.
        model: Module called as ``model(**inputs)`` whose output exposes
            ``.logits``.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``truncation=True``, ``padding='max_length'`` and
            ``max_length=max_len``; must return a mapping of tensors.
        max_len: Maximum token sequence length; longer inputs are truncated
            and shorter ones padded up to it.

    Returns:
        A ``(pred_label, confidence)`` tuple: the index of the highest
        probability class and that class's softmax probability as a float.
    """
    # Run on the model's own device instead of a notebook-global variable,
    # so inputs and weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Convert the text into the encoded tensors the model expects.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=max_len)
    # Move every encoded tensor onto the model's device.
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # time.time(), so system clock adjustments cannot distort the reading.
    start_time = time.perf_counter()

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        print(f"经过softmax后的probs: {probs}")
        pred_label = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_label].item()

    elapsed_time = time.perf_counter() - start_time
    print(f"Time: {elapsed_time:.4f} s")

    return pred_label, confidence


# 3. Start Testing
# Sample messages to classify; each string in the list is scored on its own.
test_texts = ["Hello, dear customer!Our store is having a big weekend promotion. If you spend over 1,000 US dollars, you can enjoy a 50% discount and receive many exquisite gifts.Additionally, there will be even bigger discounts at the end of this month. If you are interested, please call 010-5555-7777.We look forward to your visit"]

# predict() returns (label, confidence); label 1 is reported as 'Spam' and
# any other value as 'Ham'.
for text in test_texts:
    label, prob = predict(text, model, tokenizer)
    print(f"Text: {text}\n=> Predicted label: {'Spam' if label == 1 else 'Ham'} (Confidence: {prob:.2f})\n")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


经过softmax后的probs: tensor([[1.8160e-04, 9.9982e-01]])
Time: 0.2274 s
Text: Hello, dear customer!Our store is having a big weekend promotion. If you spend over 1,000 US dollars, you can enjoy a 50% discount and receive many exquisite gifts.Additionally, there will be even bigger discounts at the end of this month. If you are interested, please call 010-5555-7777.We look forward to your visit
=> Predicted label: Spam (Confidence: 1.00)

