In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import ast
import matplotlib.pyplot as plt

############### LOCAL IMPORTS ###############
from network_classes import MLP 
from stats_and_plot_functions import mean_squared_error, calc_rhoc, plot_loss_and_r, CustomScaler


# Seed PyTorch's random number generator so repeated runs start from the same state.
torch.manual_seed(42)

# Number of training rows to keep: a small value for testing/debugging,
# or None to keep every row (slicing with [:None] keeps the full frame).
num = 100

# Both CSV files are read from ./data relative to the current working directory.
cwd = os.getcwd()
data_train = pd.read_csv(cwd+'/data/train_data_10k_conjugated_cores.csv')[:num]
data_test = pd.read_csv(cwd+'/data/test_data_500_analogues.csv')


def _fingerprints_to_array(fingerprint_column):
    """Parse every stringified fingerprint into an int32 row and stack the rows into one array."""
    parsed_rows = []
    for fingerprint_text in fingerprint_column:
        parsed_rows.append(np.array(ast.literal_eval(fingerprint_text), dtype=np.int32))
    return np.array(parsed_rows)


print("Formatting data - this may take a moment")

# Training data: the two S1 columns plus the parsed fingerprints.
train_Etddft = data_train['S1_TDDFT'].values
train_Ezindo = data_train['S1_ZINDO'].values
train_fp = _fingerprints_to_array(data_train['fingerprint'].values)
# Network input: the ZINDO value as the first column, followed by the fingerprint entries.
net_input = np.concatenate((train_Ezindo[:, np.newaxis], train_fp), axis=1)

# Testing data (NOT validation data - only use it once the model has been fully trained).
test_Etddft = data_test['E(S1_TDDFT)_analogue'].values
test_Ezindo = data_test['E(S1_ZINDO)_analogue'].values
test_fp = _fingerprints_to_array(data_test['fingerprint'].values)


# Loss function (mean squared error); the model itself is not built in this cell.
criterion = nn.MSELoss()

Formatting data - this may take a moment


In [None]:
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 29 19:44:39 2025

Author: Adam Coxson, PhD student, University of Liverpool
Department of Chemistry, Materials Innovation Factory, Leverhulme Research Centre
Module: network_models.py
Local dependencies: network_classes.py, stats_and_plot_functions.py
For the dML workshop Mar 2025

# https://medium.com/@shashankshankar10/introduction-to-neural-networks-build-a-single-layer-perceptron-in-pytorch-c22d9b412ccf
# Numpy only perceptron: https://sebastianraschka.com/Articles/2015_singlelayer_neurons.html
# https://www.kaggle.com/code/pinocookie/pytorch-simple-mlp

Trains a neural network on 1D regression data and visualises the results.
"""

############## PACKAGE IMPORTS ###############
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

############### LOCAL IMPORTS ###############
from network_classes import SingleLayerPerceptron, MLP_2layer, MLP 
from stats_and_plot_functions import compute_y, sample_x_values, normalize_data, mean_squared_error, plot_train_loss, doubleplot, calc_num_net_parameters


# Seed PyTorch's random number generator (42 for reproducibility).
torch.manual_seed(42)

# True function, per the original author's note:
#   ((0.4*x + 0.5*np.sin(5*x) + np.sin(3*x)) + 10*np.cos(x)*np.exp(-0.1*x)) + 7
# Uncomment the doubleplot call at the end of this section to display it.
x_min = 0
x_max = 30
dx = 0.001
num_samples = 50

# Evenly spaced grid of 300 points across [x_min, x_max] and the matching y values.
x_values = np.linspace(x_min, x_max, 300)
y_values = compute_y(x_values)

# Randomly sampled x values (num_samples of them) and their corresponding y values.
sampled_x = sample_x_values(x_min, x_max, dx, num_samples)
sampled_y = compute_y(sampled_x)
# doubleplot(x_values, y_values, sampled_x, sampled_y,labels=["x","y","True function","50 Random 'training' points",""],lims=[[0,30],[0,20]])
# exit()

# Hyperparameters
input_dim = 1   # We only have one input dimension (the scalar x value)
output_dim = 1  # We only have one output dimension (the scalar y value)
learning_rate = 0.001
num_epochs = 200
batch_size = 32
# Hidden-layer widths. A list/tuple gives one hidden layer per entry; a single
# int (e.g. neurons=5000) is accepted too and is treated as one hidden layer.
neurons=[100,50,20,10]
#neurons=5000
#neurons=[100, 50]
# See list of different activation functions https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity
activation = [nn.SELU(), nn.ReLU(), nn.Sigmoid()][1]
dropout=0.0  # Careful with this as it may currently be broken


# Initialize model
#model = SingleLayerPerceptron(input_dim=1, hidden_dim=neurons, output_dim=1,activation_func=activation)
#model = MLP_2layer(input_dim=1,output_dim=1,neurons=neurons,activation_func=activation)

# Normalise `neurons` to a list of widths so the layer spec can be built for both
# the list form and the scalar form (iterating a bare int would raise TypeError).
hidden_sizes = list(neurons) if isinstance(neurons, (list, tuple)) else [neurons]
# The MLP class takes one (width, activation) pair per hidden layer.
layers = [(width, activation) for width in hidden_sizes]
model = MLP(input_dim,output_dim,layers,dropout_prob=dropout)

# Parameter count for the final report. A list is passed as a copy so the helper
# cannot alter `neurons`; any other value is forwarded unchanged, as before.
nparams = calc_num_net_parameters(
    neurons.copy() if isinstance(neurons, list) else neurons,
    output_size=output_dim,
)

# Training data params
training_samples = 20000
x_min, x_max = 0, 30

# Build the training set as 32-bit floats, which is what the torch.Tensor objects below hold.
x_vals_train = sample_x_values(x_min, x_max, dx=0.00001, num_samples=training_samples).astype(np.float32)
y_vals_train = compute_y(x_vals_train).astype(np.float32)
# normalize_data hands back the normalised array plus two statistics, kept here as mean and std.
x_vals_train_normalized, x_mean, x_std = normalize_data(x_vals_train)
y_vals_train_normalized, y_mean, y_std = normalize_data(y_vals_train)
# Column vectors of shape (N, 1): one scalar feature / target per sample.
X_train = torch.tensor(x_vals_train_normalized.reshape(-1, 1), device='cpu')
y_train = torch.tensor(y_vals_train_normalized.reshape(-1, 1), device='cpu')

# Validation data. In this simple case the true function is sampled on a regular grid;
# realistically it would be taken from shuffled training data, e.g. for k-fold cross validation.
x_vals_valid = np.arange(x_min, x_max + 0.01, 0.1).astype(np.float32)
y_vals_valid = compute_y(x_vals_valid).astype(np.float32)
# DELIBERATELY normalised with the statistics obtained from the TRAINING data.
x_vals_valid_normalised = (x_vals_valid - x_mean) / x_std
y_vals_valid_normalised = (y_vals_valid - y_mean) / y_std
X_valid = torch.tensor(x_vals_valid_normalised.reshape(-1, 1), device='cpu')

# Shuffled mini-batches for training, MSE loss, and an AdamW optimiser over the model's parameters.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=learning_rate)

# Training loop with validation tracking
train_losses = []
valid_losses = []
best_valid_loss, best_train_loss, best_epoch = float("inf"), float("inf"), 0
best_model_state = None  # Independent snapshot of the weights at the best validation epoch
for epoch in range(num_epochs):
    total_loss = 0.0
    model.train()  # Training mode: layers such as dropout use their training behaviour
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        batch_X, batch_y = batch_X.float(), batch_y.float()
        outputs = model(batch_X).squeeze()  # Ensure output shape matches target
        loss = criterion(outputs, batch_y.squeeze())  # Compute loss
        loss.backward()  # Apply backpropagation
        optimizer.step()  # Update weights
        total_loss += loss.item()

    # eval() switches layers such as dropout to inference behaviour; it is the
    # torch.no_grad() context that disables gradient tracking for the forward pass.
    model.eval()
    with torch.no_grad():
        valid_pred_normalized = np.squeeze(model(X_valid).detach().numpy())  # Predict validation data
    train_loss = total_loss / len(train_loader)  # Mean of the per-batch losses
    valid_loss = mean_squared_error(y_vals_valid_normalised, valid_pred_normalized)  # Loss of true norm vs pred norm validation data
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Save the best model based on validation loss
    if valid_loss < best_valid_loss:
        best_epoch = epoch
        best_valid_loss, best_train_loss = valid_loss, train_loss
        # state_dict() returns references to the live parameter tensors, and a
        # shallow .copy() keeps those references, so later optimizer steps would
        # silently overwrite the "best" snapshot. deepcopy makes it independent.
        best_model_state = copy.deepcopy(model.state_dict())

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.5f}, Valid Loss: {valid_loss:.5f}")
print("\nTraining complete.\n")
print("Number of training samples:",training_samples)
print("Neurons:",neurons,"("+str(nparams)+" parameters)","\nNum Epochs:", num_epochs,
      "\nLearning Rate:",learning_rate,"\nBatch Size:",batch_size,"\nActivation:",activation)
# Restore the best model before evaluation
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Best model from Epoch {best_epoch+1} restored with Training Loss: {best_train_loss:.5f} and Validation Loss: {best_valid_loss:.5f}")     
plot_train_loss(train_losses, valid_losses, labels=["Epoch", "MSE Loss", "Training and Validation Loss"])

# Final evaluation using the best model.
# eval() puts layers such as dropout into inference behaviour; no_grad() skips gradient tracking.
model.eval()
with torch.no_grad():
    valid_output = model(X_valid)
valid_pred_normalized = np.squeeze(valid_output.detach().numpy())
# Undo the target normalisation using the statistics taken from the training data.
y_vals_valid_predicted = valid_pred_normalized * y_std + y_mean

"""
####################### VISUALISE AND COMPARE THE TRUE AND PREDICTED DATA #######################

How good was the network at predicting the true function? Try different hyperparameters and samplings of the data.
"""
# Plot the true validation values and the un-normalised predictions; both series are passed with the same x values.
doubleplot(x_vals_valid, y_vals_valid, x_vals_valid, y_vals_valid_predicted, labels=["x","y","True validation data","Pred validation data",""])

    
    
    


In [None]:
# -*- coding: utf-8 -*-
"""
Modified: Complete training pipeline with MLP class compatibility
"""

############## PACKAGE IMPORTS ###############
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import os
import ast
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

############### LOCAL IMPORTS ###############
from network_classes import MLP 
from stats_and_plot_functions import mean_squared_error, calc_rhoc, CustomScaler

# Fix the random seeds (PyTorch and NumPy) for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Hyperparameter configuration
# Each HIDDEN_LAYERS entry is a (width, activation module) pair, passed to MLP as layers_data.
HIDDEN_LAYERS = [
    (512, nn.ReLU()),  # increased number of neurons
    (256, nn.ReLU()),
    (128, nn.ReLU())    # increased number of hidden layers
]
LEARNING_RATE = 0.001
BATCH_SIZE = 256
EPOCHS = 500
VALID_RATIO = 0.2  # fraction of the training data held out for validation (used as test_size)

# Load the data (paths are relative to the current working directory)
cwd = os.getcwd()
data_train = pd.read_csv(cwd+'/data/train_data_10k_conjugated_cores.csv')
data_test = pd.read_csv(cwd+'/data/test_data_500_analogues.csv') 

# Data preprocessing helper
def prepare_data(input_data, target_data, scaler=None, fit=False):
    """Scale the input features and return inputs/targets as float tensors.

    Only ``input_data`` is passed through the scaler; ``target_data`` is
    converted to a tensor unchanged.

    Parameters
    ----------
    input_data : array-like
        Feature matrix handed to ``scaler.transform`` (and to ``scaler.fit``
        when ``fit`` is True).
    target_data : array-like
        Target values, converted to a tensor without scaling.
    scaler : object, optional
        Previously fitted scaler exposing ``transform``. Required when
        ``fit`` is False; ignored (replaced) when ``fit`` is True.
    fit : bool, default False
        When True, a new ``CustomScaler`` is created and fitted on
        ``input_data`` before transforming.

    Returns
    -------
    tuple
        ``(inputs, targets, scaler)`` where ``inputs`` and ``targets`` are
        ``torch.FloatTensor`` objects and ``scaler`` is the scaler that was used.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` is supplied.
    """
    if fit:
        scaler = CustomScaler()
        scaler.fit(input_data)
    elif scaler is None:
        # Without this guard the call below fails with an opaque
        # "'NoneType' object has no attribute 'transform'".
        raise ValueError("prepare_data requires a fitted scaler when fit=False")
    input_scaled = scaler.transform(input_data)
    return torch.FloatTensor(input_scaled), torch.FloatTensor(target_data), scaler

# Extract the training columns
train_Etddft = data_train['S1_TDDFT'].values
train_Ezindo = data_train['S1_ZINDO'].values
# Each fingerprint is stored as a string; parse it into an int32 vector.
train_fp = np.array([
    np.array(ast.literal_eval(fp), dtype=np.int32)
    for fp in data_train['fingerprint'].values
])
# Feature matrix: ZINDO value in the first column, fingerprint entries after it.
train_input = np.concatenate((train_Ezindo[:, np.newaxis], train_fp), axis=1)

# Split into training and validation sets
split_parts = train_test_split(train_input, train_Etddft, test_size=VALID_RATIO, random_state=42)
train_input_split, val_input_split, train_target_split, val_target_split = split_parts

# Standardise the inputs (scaler is fitted on the training split only, then reused)
train_input_norm, train_target_norm, scaler = prepare_data(
    train_input_split,
    train_target_split,
    fit=True,
)
val_input_norm, val_target_norm, _ = prepare_data(
    val_input_split,
    val_target_split,
    scaler=scaler,
)

# Wrap the training tensors in a shuffling PyTorch DataLoader
train_dataset = torch.utils.data.TensorDataset(train_input_norm, train_target_norm)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Initialise the model (arguments follow the MLP class definition)
model = MLP(
    input_dim=train_input.shape[1],   # number of input features = ZINDO energy (1) + fingerprint length
    output_dim=1,                     # single output: the TDDFT energy
    layers_data=HIDDEN_LAYERS,        # hidden-layer configuration
    dropout_prob=0.0,                 # no dropout
)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

# Training loop: one optimisation pass over the loader, then one validation pass, per epoch
train_losses = []
val_losses = []
for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss = 0
    for batch_inputs, batch_targets in train_loader:
        optimizer.zero_grad()
        batch_pred = model(batch_inputs)
        # Targets are reshaped to a column so they line up with the model output.
        batch_loss = criterion(batch_pred, batch_targets.view(-1,1))
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Epoch training loss = mean of the per-batch losses.
    train_losses.append(running_loss / len(train_loader))

    # ---- validation pass (whole validation set in a single forward call) ----
    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(val_input_norm), val_target_norm.view(-1,1)).item()
    val_losses.append(val_loss)

    print(f"Epoch {epoch+1:03d}/{EPOCHS} | Train Loss: {train_losses[-1]:.4f} | Val Loss: {val_losses[-1]:.4f}")

# Test-set preparation: same columns and feature layout as the training data
test_Etddft = data_test['E(S1_TDDFT)_analogue'].values 
test_Ezindo = data_test['E(S1_ZINDO)_analogue'].values
test_fp = np.array([np.array(ast.literal_eval(fp), dtype=np.int32) for fp in data_test['fingerprint'].values])
test_input = np.concatenate((test_Ezindo.reshape(-1,1), test_fp), axis=1)
# Reuse the scaler fitted on the training split. prepare_data already returns
# a float tensor, so no further conversion is needed.
test_input_norm, _, _ = prepare_data(test_input, test_Etddft, scaler=scaler)
test_input_tensor = test_input_norm

# Prediction and evaluation
model.eval()
with torch.no_grad():
    test_pred = model(test_input_tensor).numpy().flatten()
# The network is trained against unscaled targets (prepare_data scales the
# inputs only, and the scaler is fitted on inputs only), so the predictions are
# already in the targets' own units. Applying any target inverse transform here
# would distort them.
test_pred_actual = test_pred

# Metrics on the held-out test set
test_mse = mean_squared_error(test_Etddft, test_pred_actual)
test_rho = calc_rhoc(test_Etddft, test_pred_actual)

# Visualise the results: loss curves on the left, true-vs-predicted scatter on the right
plt.figure(figsize=(12,5))

# Left panel: per-epoch training and validation loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.title("Training/Validation Loss Curve")
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.legend()

# Right panel: test predictions against the true values, with a dashed y = x reference line
plt.subplot(1, 2, 2)
plt.scatter(test_Etddft, test_pred_actual, alpha=0.5)
true_lo, true_hi = min(test_Etddft), max(test_Etddft)
plt.plot([true_lo, true_hi], [true_lo, true_hi], 'r--')
plt.title(f"TDDFT True vs Pred (ρ={test_rho:.3f})")
plt.xlabel("True Values")
plt.ylabel("Predictions")

plt.tight_layout()
plt.savefig("delta_ml_results.png")  # written to the current working directory
plt.show()

# Summary of the test metrics
print(f"\nFinal Results:")
print(f"Test MSE: {test_mse:.5f}")
print(f"Test Correlation: {test_rho:.5f}")