# D-MPNN training for Eint_H2O prediction

## 1. Import

In [None]:
import sys
from matplotlib import pyplot as plt
import pandas as pd

sys.path.append('.')

from mpnn_model import DataManager, MPNNTrainer, MPNNHyperparameterOptimizer
from chemprop import nn

print("All imports successful!")

## 2. Load data

In [None]:
# Read the dataset; the first CSV column is used as the DataFrame index.
df_input = pd.read_csv('dataset_Eint_H2O.csv', index_col=0)

# Sanity-check the load: shape, a preview of the first rows, and the column names.
print("Data loaded successfully! Shape:", df_input.shape)
print("\nFirst few rows:", df_input.head(), sep="\n")
print("\nColumn names:", df_input.columns.tolist(), sep="\n")

## 3. Split data

In [None]:
# Names of the columns in df_input that are handed to the DataManager.
TARGET_COLUMN = 'Eint_H2O'
SMILES_COLUMN = 'CanonSMILES'
CLASS_COLUMN = 'Class_label'

print("Initializing DataManager...")
# Collect the split settings in one place, then build the manager from them.
split_settings = dict(
    df_input=df_input,
    target_label=TARGET_COLUMN,
    smiles_label=SMILES_COLUMN,
    class_label=CLASS_COLUMN,
    ratios=(0.8, 0.1, 0.1),  # 80% train, 10% val, 10% test
    n_splits_cv=10,  # 1 for final training, 10 for hyperparameter optimization
    random_state=123,
)
data_manager = DataManager(**split_settings)

print("Data splitting completed!")
print(f"Number of CV splits: {data_manager.n_splits_cv}")
print("Test set size:", len(data_manager.test_mol_ids))

# The per-class breakdown is only reported when at least one CV split exists.
if data_manager.cv_splits:
    train_ids, val_ids = data_manager.cv_splits[0]
    print(f"First fold - Train: {len(train_ids)}, Val: {len(val_ids)}")

    # Count class labels for the first fold's train/val ids and for the test ids.
    train_classes, val_classes, test_classes = (
        df_input.loc[subset_ids, CLASS_COLUMN].value_counts()
        for subset_ids in (train_ids, val_ids, data_manager.test_mol_ids)
    )

    print("\nClass distribution:")
    for subset_label, class_counts in (
        ("Train", train_classes),
        ("Val", val_classes),
        ("Test", test_classes),
    ):
        print(f"{subset_label}:", class_counts.to_dict())

## 4. Model training (example)

In [None]:
# Fetch the train/validation datasets of the first CV fold.
print("Getting data for fold 0...")
train_dset, val_dset = data_manager.get_data_for_fold(0)

for split_name, split_dset in (("Train", train_dset), ("Validation", val_dset)):
    print(f"{split_name} dataset size: {len(split_dset)}")

# Settings passed to MPNNTrainer as `model_config`.
# NOTE(review): the meaning of each key is defined by mpnn_model.MPNNTrainer,
# which is not visible here -- confirm there before tuning.
model_config = dict(
    batch_size=16,
    depth=4,
    message_hidden_dim=300,
    ffn_hidden_dim=300,
    ffn_num_layers=3,
    dropout=0.2,
    max_lr=1e-3,
    init_lr_ratio=0.5,
    final_init_lr_ratio=0.001,
)

# Create the MPNNTrainer
print("\nInitializing MPNNTrainer...")
metrics = [
    metric_cls()
    for metric_cls in (
        nn.metrics.RMSEMetric,
        nn.metrics.MAEMetric,
        nn.metrics.R2Metric,
    )
]

trainer_settings = dict(
    model_config=model_config,
    metrics=metrics,
    max_epochs=200,
    patience_early_stopping=50,
    enable_progress_bar=True,
)
trainer = MPNNTrainer(**trainer_settings)

# train() returns the value reported below as the final validation loss.
print("Starting training...")
val_loss = trainer.train(train_dset, val_dset)
print(f"Training completed! Final validation loss: {val_loss}")

## 5. Model evaluation

In [None]:
# Molecule ids of the first CV fold and of the held-out test set.
# NOTE(review): train_dset / val_dset are reused from the training cell and are
# presumably the fold-0 datasets matching these ids -- confirm before re-running
# cells out of order.
train_ids, val_ids = data_manager.cv_splits[0]
test_ids = data_manager.test_mol_ids
test_dset, _ = data_manager.get_test_data()

# One entry per split to evaluate: the dataset, a display name, and its row ids.
datasets_config = [
    {'dset': split_dset, 'name': split_name, 'ids': split_ids}
    for split_name, split_dset, split_ids in (
        ('Train', train_dset, train_ids),
        ('Validation', val_dset, val_ids),
        ('Test', test_dset, test_ids),
    )
]

results = {}  # split name -> evaluation output and residuals
dfs = {}      # split name -> DataFrame of true vs. predicted values

for entry in datasets_config:
    name, dset, ids = entry['name'], entry['dset'], entry['ids']

    print(f"\nEvaluating model on {name} set...")

    # Look up the target values for this split's ids in the input table.
    df_subset = data_manager.df_input.loc[ids]
    targets = df_subset[data_manager.target_label].values
    eval_results = trainer.evaluate(dset, targets)

    # Splits with an empty/falsy evaluation result are left out of all outputs.
    if eval_results:
        y_true = eval_results["true_values"]
        y_pred = eval_results["predictions"]

        results[name] = {
            'eval': eval_results,
            'residuals': y_true - y_pred,
        }
        dfs[name] = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}, index=ids)

# Write one sheet per evaluated split.
if dfs:
    with pd.ExcelWriter('evaluation_results.xlsx') as writer:
        for sheet_key, frame in dfs.items():
            frame.to_excel(writer, sheet_name=f'{sheet_key}set')
    print("\nSaved evaluation results to Excel file")


# Parity plot: predicted vs. true values for every evaluated split.
if not results:
    print("\nNo data available for plotting predictions")
else:
    fig, ax = plt.subplots(figsize=(6, 6))
    colors = {'Train': 'blue', 'Validation': 'green', 'Test': 'orange'}
    all_true, all_pred = [], []

    for split_key, split_result in results.items():
        true_vals = split_result['eval']["true_values"]
        pred_vals = split_result['eval']["predictions"]
        ax.scatter(
            true_vals,
            pred_vals,
            alpha=0.6,
            label=f'{split_key} Data',
            color=colors[split_key],
        )
        all_true.extend(true_vals)
        all_pred.extend(pred_vals)

    # Diagonal reference line spanning the full range of true and predicted values.
    min_val = min(min(all_true), min(all_pred))
    max_val = max(max(all_true), max(all_pred))
    diagonal = [min_val, max_val]
    ax.plot(diagonal, diagonal, 'gray', lw=2)

    ax.set_xlabel(f'True {TARGET_COLUMN}')
    ax.set_ylabel(f'Predicted {TARGET_COLUMN}')
    ax.set_title(f'Predictions vs True Values ({TARGET_COLUMN})')
    ax.legend()
    ax.grid(True, alpha=0.5)
    ax.set_aspect('equal', adjustable='box')
    fig.tight_layout()
    plt.show()

## 6. Hyperparameter optimization

In [None]:
# Run the hyperparameter search using the splits held by data_manager.
print("Starting hyperparameter optimization...")

# Metrics handed to the optimizer (RMSE and MAE).
hpo_metrics = [nn.metrics.RMSEMetric(), nn.metrics.MAEMetric()]

# NOTE(review): the resource settings below are hard-coded (10 CPUs and 1 GPU
# per trial) -- confirm they fit the machine this notebook runs on.
hpo_settings = dict(
    data_manager=data_manager,
    chemprop_metrics=hpo_metrics,
    num_samples_tune=10,
    max_epochs_per_trial=50,
    patience_per_trial=10,
    cpus_per_trial=10,
    gpus_per_trial=1,
)
hpo_optimizer = MPNNHyperparameterOptimizer(**hpo_settings)
hpo_optimizer.run_optimization()