# 进一步进行模型的训练

In [2]:
# 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# 设置常数
TEST_SIZE = 0.3
RANDOM_STATE = 42

In [4]:
# 导入数据
data_featurized = pd.read_csv('../data/data_training.csv')
data_featurized.drop(columns=['material_id', 'composition', 'crystal_system'], inplace=True)
X = data_featurized.drop(columns=['e_total', 'e_ionic', 'e_electronic'])
y_e_total = data_featurized['e_total']
y_e_ionic = data_featurized['e_ionic']
y_eletronic = data_featurized['e_electronic']
# 特征缩放
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 训练集测试集划分
X_train, X_test, y_train, y_test = train_test_split(X, y_e_total, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [5]:
# 加载数据
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data.to_numpy()
        self.targets = targets.to_numpy().reshape(-1, 1)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        # 根据索引获取数据和目标，并将它们转换为张量
        x = torch.tensor(self.data[index], dtype=torch.float32)
        y = torch.tensor(self.targets[index], dtype=torch.float32)
        return x, y
    
# 创建自定义数据集实例
dataset = CustomDataset(X_train, y_train)   # 注意X_train,y_train都为pandas.DataFrame
# 创建数据加载器
batch_size = 64
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [6]:
# 搭建神经网络
import torch.nn as nn
# 导入optim
import torch.optim as optim

class NN_model(nn.Module):
    def __init__(self):
        super(NN_model, self).__init__()
        
        # 使用Sequential定义神经网络的结构
        self.layers = nn.Sequential(
            nn.Linear(17, 9),
            nn.ReLU(),
            nn.Linear(9, 5),
            nn.ReLU(),
            nn.Linear(5, 1)
        )
        
    def forward(self, x):
        # 前向传播
        return self.layers(x)
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# 定义超参数
input_size = 17
output_size = 1
learning_rate = 0.0013
num_epochs = 10000

# 初始化模型、损失函数和优化器
# 从pth文件读取已有模型
model = NN_model()
# model.load_state_dict(torch.load('NN_model.pth'))
# 使用均方误差作为损失函数
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model.to(device)
criterion.to(device)
# 训练模型
model.train()
for epoch in range(num_epochs):
    # Forward pass
    epoch_loss = 0
    for i, (features, targets) in enumerate(data_loader):
        features = features.to(device)
        targets = targets.to(device)
        outputs = model(features)
        loss = criterion(outputs, targets)
        epoch_loss += loss.item()
        # Backward and optimize
        optimizer.zero_grad()    # 梯度重置
        loss.backward() # 向后传播计算梯度
        optimizer.step() # 通过梯度更新参数

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, MSE: {epoch_loss/len(dataset):.4f}')

# 保存模型
torch.save(model.state_dict(), 'NN_model.pth')

Epoch [100/10000], Loss: 308175454.9398, MSE: 52942.0125


KeyboardInterrupt: 