In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Fix the random seeds so that runs are reproducible
def set_seed(seed=42):
    """Seed PyTorch's random number generators and request deterministic cuDNN.

    Args:
        seed: Integer seed passed to both the CPU and the CUDA generators.
    """
    cudnn = torch.backends.cudnn
    # Disable cuDNN autotuning and ask for deterministic kernels
    cudnn.benchmark = False
    cudnn.deterministic = True

    torch.cuda.manual_seed_all(seed)
    torch.manual_seed(seed)


set_seed()

In [None]:
# A simple feed-forward neural network model
class SimpleNN(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of values produced by the output layer.
        dropout_rate: Probability passed to ``nn.Dropout`` for the hidden
            activations. Defaults to 0.2, the value that used to be hard-coded.
    """

    def __init__(self, input_size, hidden_size, num_classes, dropout_rate=0.2):
        super().__init__()
        # Attribute names are part of the saved state-dict keys, so they are kept as-is.
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the output-layer values for input ``x`` (no softmax is applied)."""
        hidden = self.relu(self.layer1(x))
        # Dropout is applied to the hidden activations only
        hidden = self.dropout(hidden)
        return self.layer2(hidden)

# Build the sample dataset
def create_sample_data():
    """Generate a synthetic binary-classification dataset as PyTorch tensors.

    Draws 1000 samples with 20 features from ``make_classification``, holds out
    20% of them as a test split, standardizes the features, and converts
    everything to tensors.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. Feature tensors are
        ``torch.float32`` and label tensors are ``torch.long``.
    """
    features, labels = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Fit the scaling statistics on the training split only, then apply them to both splits
    scaler = StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)

    # Convert to PyTorch tensors, in the order X_train, X_test, y_train, y_test
    return (
        torch.tensor(train_features, dtype=torch.float32),
        torch.tensor(test_features, dtype=torch.float32),
        torch.tensor(train_labels, dtype=torch.long),
        torch.tensor(test_labels, dtype=torch.long),
    )

In [None]:
# Training function
def train_model(model, criterion, optimizer, X_train, y_train, epochs=50):
    """Train ``model`` on the full training set for ``epochs`` steps.

    Every epoch is a single full-batch update: the whole of ``X_train`` goes
    through the model at once. The model is switched to training mode first.

    Args:
        model: Network to optimize.
        criterion: Loss callable invoked as ``criterion(outputs, y_train)``.
        optimizer: Optimizer that updates the model parameters.
        X_train: Training inputs.
        y_train: Training targets.
        epochs: Number of update steps to run.

    Returns:
        List with one Python float per epoch: the loss computed before that
        epoch's parameter update.
    """
    loss_history = []
    model.train()

    for epoch in range(epochs):
        # Drop gradients from the previous step before computing new ones
        optimizer.zero_grad()

        # Forward pass over the entire training set
        loss = criterion(model(X_train), y_train)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        loss_history.append(loss.item())

        # Report progress every tenth epoch
        if not (epoch + 1) % 10:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    return loss_history

# Evaluation function
def evaluate_model(model, X_test, y_test):
    """Compute the classification accuracy of ``model`` on a test set.

    The model is switched to evaluation mode and is left in that mode when the
    function returns.

    Args:
        model: Network whose output has one score per class along dimension 1.
        X_test: Test inputs.
        y_test: Tensor of true class indices.

    Returns:
        Fraction of samples whose highest-scoring class equals the label, as a
        Python float.

    Raises:
        ZeroDivisionError: If ``y_test`` has no elements along dimension 0.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(X_test)
        # argmax gives the predicted class directly; the legacy `.data` accessor
        # is unnecessary because gradients are already disabled here
        predicted = outputs.argmax(dim=1)
        accuracy = (predicted == y_test).sum().item() / y_test.size(0)
    return accuracy

In [None]:
# Hyperparameters
input_size = 20  # number of input features; create_sample_data generates n_features=20
hidden_size = 50  # width of the hidden layer
num_classes = 2  # number of output classes; create_sample_data generates n_classes=2
learning_rate = 0.01  # passed to the Adam optimizer as lr
epochs = 100  # number of training epochs passed to train_model

In [None]:
# Build the train/test tensors
X_train, X_test, y_train, y_test = create_sample_data()

# Set up the network, the loss function and the optimizer
model = SimpleNN(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train the model and keep the per-epoch loss history for plotting later
print("开始训练模型...")
losses = train_model(model, criterion, optimizer, X_train, y_train, epochs=epochs)

In [None]:
# Measure accuracy of the trained model on the held-out test split
accuracy = evaluate_model(model=model, X_test=X_test, y_test=y_test)
print(f'训练后模型准确率: {accuracy:.2%}')

# 保存示例

In [None]:
# Make sure the directory for saved models exists
model_dir = Path("saved_models")
model_dir.mkdir(exist_ok=True)

# Target files for the three saving strategies demonstrated below
entire_model_path = model_dir / "entire_model.pth"
state_dict_path = model_dir / "model_state_dict.pth"
checkpoint_path = model_dir / "checkpoint.pth"

# 1. Save the entire model object (not recommended for production use)
torch.save(model, entire_model_path)
print(f"已保存整个模型到: {entire_model_path}")

# 2. Save only the model state dict (the recommended approach)
torch.save(model.state_dict(), state_dict_path)
print(f"已保存模型状态字典到: {state_dict_path}")

# 3. Save a checkpoint: model and optimizer state plus extra bookkeeping,
#    which is what is needed to resume training later
checkpoint = {
    'epoch': epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': losses[-1],
    'accuracy': accuracy,
}
torch.save(checkpoint, checkpoint_path)
print(f"已保存检查点到: {checkpoint_path}")

# 加载示例

In [None]:
# Several ways to load a saved model.
# Results are stored under their own names so that `accuracy` and `checkpoint`
# from the earlier cells are not overwritten when this cell runs.

# 1. Load the entire pickled model.
# NOTE: unpickling a whole model can execute arbitrary code contained in the
# file, so only do this with files you created yourself. Recent PyTorch
# releases default to weights_only=True, which rejects full-model pickles,
# hence the explicit weights_only=False here.
print("\n加载整个模型...")
loaded_entire_model = torch.load(entire_model_path, weights_only=False)
loaded_entire_model.eval()
entire_model_accuracy = evaluate_model(loaded_entire_model, X_test, y_test)
print(f'加载整个模型后的准确率: {entire_model_accuracy:.2%}')

# 2. Load the state dict into a fresh model instance.
# weights_only=True restricts unpickling to tensors and plain containers.
print("\n加载状态字典到新模型实例...")
new_model = SimpleNN(input_size, hidden_size, num_classes)
new_model.load_state_dict(torch.load(state_dict_path, weights_only=True))
new_model.eval()
state_dict_accuracy = evaluate_model(new_model, X_test, y_test)
print(f'加载状态字典后的准确率: {state_dict_accuracy:.2%}')

# 3. Load the checkpoint and restore the training state
print("\n加载检查点并恢复训练状态...")
resumed_model = SimpleNN(input_size, hidden_size, num_classes)
resumed_optimizer = optim.Adam(resumed_model.parameters(), lr=learning_rate)

# The checkpoint holds only state dicts and plain numbers, so the restricted
# loader is sufficient
loaded_checkpoint = torch.load(checkpoint_path, weights_only=True)
resumed_model.load_state_dict(loaded_checkpoint['model_state_dict'])
resumed_optimizer.load_state_dict(loaded_checkpoint['optimizer_state_dict'])
resumed_epoch = loaded_checkpoint['epoch']
resumed_loss = loaded_checkpoint['loss']
resumed_accuracy = loaded_checkpoint['accuracy']

print(f"恢复训练:  epoch={resumed_epoch}, loss={resumed_loss:.4f}, accuracy={resumed_accuracy:.2%}")

# 多GPU训练模型的保存和加载（如果可用）

In [None]:


if torch.cuda.device_count() > 1:
    print("\n多GPU训练示例...")
    # Wrap the model with DataParallel
    parallel_model = nn.DataParallel(model)
    # Training... (omitted here)

    # Save the multi-GPU model; the keys of a DataParallel state dict carry a
    # 'module.' prefix
    parallel_model_path = model_dir / "parallel_model.pth"
    torch.save(parallel_model.state_dict(), parallel_model_path)
    print(f"已保存多GPU模型到: {parallel_model_path}")

    # Load the multi-GPU weights into a plain (non-DataParallel) model.
    # Approach 1: rebuild the state dict without the 'module.' prefix.
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(parallel_model_path, weights_only=True)
    prefix = 'module.'
    new_state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    single_gpu_model = SimpleNN(input_size, hidden_size, num_classes)
    single_gpu_model.load_state_dict(new_state_dict)
    print("多GPU模型已成功加载到单GPU环境")

# Plot the training loss curve and write it next to the saved models
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses)
ax.set_title("训练损失曲线")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
fig.savefig(model_dir / "training_loss.png")
print(f"训练损失曲线已保存到: {model_dir / 'training_loss.png'}")
