In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.preprocessing import StandardScaler
import pickle
import os
import shutil

from rdkit import Chem
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*')

import torch
import torch.nn.functional as F
from torch.nn import Linear, BatchNorm1d, Module, Sequential
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool


In [None]:
print("--- Step 1: Defining the SMILES to Graph Conversion (with Scaling and Correct Paths) ---")

def smiles_to_graph(smiles_string, y_val=0):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Node features (6 per atom, unscaled): atomic number, formal charge,
    hybridization (as its integer enum value), aromatic flag, total number of
    hydrogens and total valence. Every bond is emitted as two directed edges
    (i -> j and j -> i) carrying the bond order as a single edge feature.

    Args:
        smiles_string: SMILES representation of the molecule.
        y_val: Regression target stored on the graph as a 1-element float tensor.

    Returns:
        A ``Data`` object with ``x`` of shape (num_atoms, 6), ``edge_index`` of
        shape (2, num_edges), ``edge_attr`` of shape (num_edges, 1) and ``y`` of
        shape (1,); or ``None`` when RDKit cannot parse the SMILES or the parsed
        molecule contains no atoms.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        return None

    atom_features_list = [
        [
            atom.GetAtomicNum(), atom.GetFormalCharge(), int(atom.GetHybridization()),
            int(atom.GetIsAromatic()), atom.GetTotalNumHs(), atom.GetTotalValence(),
        ]
        for atom in mol.GetAtoms()
    ]
    # A molecule without atoms would yield a malformed 1-D feature tensor that
    # neither the scaler nor the model can consume, so treat it as unparsable.
    if not atom_features_list:
        return None
    x = torch.tensor(atom_features_list, dtype=torch.float)

    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_type = bond.GetBondTypeAsDouble()
        edge_indices.extend([(i, j), (j, i)])
        edge_attrs.extend([[bond_type], [bond_type]])

    # The explicit reshapes keep the tensors 2-D even for molecules without any
    # bond (e.g. a single atom): an empty list would otherwise become a tensor
    # of shape (0,) instead of the (2, 0) / (0, 1) layout expected downstream.
    edge_index = torch.tensor(edge_indices, dtype=torch.long).reshape(-1, 2).t().contiguous()
    edge_attr = torch.tensor(edge_attrs, dtype=torch.float).reshape(-1, 1)

    return Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=torch.tensor([y_val], dtype=torch.float),
    )


--- Step 1: Defining the SMILES to Graph Conversion (with Scaling and Correct Paths) ---


In [None]:
class MeltingPointDataset(Dataset):
    """PyG dataset of molecular graphs built from a CSV with a ``SMILES`` column.

    For the training split (``test=False``) the CSV must also contain a ``Tm``
    column; its natural log is stored as the regression target. For the test
    split the target is a placeholder ``0``.

    Node features are standardised with a scikit-learn ``StandardScaler``. When
    no scaler is supplied for the training split, one is fitted on all atoms of
    the successfully parsed molecules and pickled to ``scaler_path``.

    Args:
        root: Dataset root directory (PyG creates ``raw``/``processed`` below it).
        filename: Name of the CSV file inside ``original_data_path``.
        original_data_path: Directory the CSV is copied from in ``download``.
        test: ``True`` for the unlabeled test split.
        scaler: Pre-fitted scaler to apply; ``None`` to fit (train) or to load
            the one saved at ``scaler_path`` (test).
        scaler_path: Where the fitted scaler is pickled / looked up.
    """

    def __init__(self, root, filename, original_data_path, test=False, scaler=None,
                 scaler_path='../models/gnn_scaler.pkl'):
        # These attributes must exist before the base constructor runs, because
        # it may call download() and process(), which read them.
        self.filename = filename
        self.original_data_path = original_data_path
        self.test = test
        self.scaler = scaler
        self.scaler_path = scaler_path
        super(MeltingPointDataset, self).__init__(root)

    @property
    def raw_file_names(self):
        """Name of the raw CSV expected inside ``self.raw_dir``."""
        return self.filename

    @property
    def processed_file_names(self):
        """Name of the cached graph file; differs per split so both can share a root."""
        return f'{"test" if self.test else "train"}_data.pt'

    def download(self):
        """Copy the source CSV into ``self.raw_dir`` unless it is already there."""
        source_path = os.path.join(self.original_data_path, self.filename)
        dest_path = os.path.join(self.raw_dir, self.filename)
        if not os.path.exists(dest_path):
            print(f"Copying {source_path} to {dest_path}")
            shutil.copy(source_path, dest_path)

    def _fit_and_save_scaler(self, graphs):
        """Fit a StandardScaler on the node features of ``graphs`` and pickle it."""
        if not graphs:
            raise ValueError("Cannot fit the node-feature scaler: no SMILES could be converted to a graph.")
        # float64 keeps the fitted statistics identical to fitting on the raw
        # (integer-valued) atom descriptors.
        node_features = np.vstack([graph.x.numpy() for graph in graphs]).astype(np.float64)
        scaler = StandardScaler().fit(node_features)
        scaler_dir = os.path.dirname(self.scaler_path)
        if scaler_dir:
            os.makedirs(scaler_dir, exist_ok=True)
        with open(self.scaler_path, 'wb') as f:
            pickle.dump(scaler, f)
        return scaler

    def _resolve_scaler(self, graphs):
        """Return the scaler to apply: the supplied one, a freshly fitted one, or the saved one."""
        if self.scaler is not None:
            return self.scaler
        if not self.test:
            return self._fit_and_save_scaler(graphs)
        if os.path.exists(self.scaler_path):
            # SECURITY: pickle.load can execute arbitrary code; only load a
            # scaler file that this project wrote itself.
            with open(self.scaler_path, 'rb') as f:
                return pickle.load(f)
        raise ValueError(
            "A fitted scaler is required to process the test split: pass `scaler=` "
            f"or process the training split first so that {self.scaler_path!r} exists."
        )

    def process(self):
        """Parse every SMILES once, scale the node features and cache the graphs."""
        self.data = pd.read_csv(self.raw_paths[0])

        graphs = []
        for _, row in tqdm(self.data.iterrows(), total=self.data.shape[0], desc="Processing SMILES"):
            # Targets are modelled in log space; the test split has no label.
            y_val = np.log(row['Tm']) if not self.test else 0
            graph = smiles_to_graph(row['SMILES'], y_val)
            if graph is not None:  # unparsable SMILES are skipped
                graphs.append(graph)

        # Reusing the already-built graphs for fitting avoids re-parsing every
        # molecule and keeps a single definition of the atom features.
        self.scaler = self._resolve_scaler(graphs)

        for graph in graphs:
            graph.x = torch.tensor(self.scaler.transform(graph.x.numpy()), dtype=torch.float)

        torch.save(graphs, self.processed_paths[0])

    def _load_graphs(self):
        """Load the cached graph list on first use and memoise it on the instance."""
        if not hasattr(self, 'graphs'):
            # SECURITY: weights_only=False unpickles arbitrary objects; the file
            # is expected to be the one written by process().
            self.graphs = torch.load(self.processed_paths[0], weights_only=False)
        return self.graphs

    def len(self):
        """Number of graphs in the processed file."""
        return len(self._load_graphs())

    def get(self, idx):
        """Return the graph stored at position ``idx``."""
        return self._load_graphs()[idx]


In [None]:
print("Instantiating and processing datasets...")
ORIGINAL_RAW_PATH = '../data/raw'

# Both splits live under the same dataset root and are copied from the same
# source folder; only the file name and the split-specific flags differ.
shared_dataset_args = dict(
    root='../data/processed/gnn_v4',
    original_data_path=ORIGINAL_RAW_PATH,
)

# The training split is built first: its process() step is what fits the
# node-feature scaler and writes it to disk.
train_dataset = MeltingPointDataset(filename='train.csv', **shared_dataset_args)

# Reload that scaler so the test split is standardised with training statistics.
with open('../models/gnn_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

test_dataset = MeltingPointDataset(
    filename='test.csv',
    test=True,
    scaler=scaler,
    **shared_dataset_args,
)

print(f"Datasets created. Number of training graphs: {len(train_dataset)}")
print(f"Number of testing graphs: {len(test_dataset)}")
print(f"Number of node features: {train_dataset.num_node_features}")


Instantiating and processing datasets...
Copying ../data/raw\train.csv to ..\data\processed\gnn_v4\raw\train.csv


Processing...


Fitting Scaler:   0%|          | 0/2662 [00:00<?, ?it/s]

Processing SMILES:   0%|          | 0/2662 [00:00<?, ?it/s]

Copying ../data/raw\test.csv to ..\data\processed\gnn_v4\raw\test.csv


Done!
Processing...


Processing SMILES:   0%|          | 0/666 [00:00<?, ?it/s]

Done!


Datasets created. Number of training graphs: 2662
Number of testing graphs: 666
Number of node features: 6


In [None]:
print("\n--- Step 2: Defining a more Regularized GNN Model ---")

class GCN(torch.nn.Module):
    """Two-layer graph convolutional regressor producing one value per graph.

    Architecture: GCNConv -> ReLU -> BatchNorm -> GCNConv -> ReLU -> BatchNorm
    -> global mean pooling -> dropout -> 2-layer MLP.

    Args:
        num_node_features: Number of input features per node.
        hidden_channels: Width of the first convolution; the second one is
            twice as wide.
        dropout: Dropout probability applied to the pooled graph embedding.
    """

    def __init__(self, num_node_features, hidden_channels=128, dropout=0.4):
        super(GCN, self).__init__()
        # Re-seeds the *global* torch RNG so that weight initialisation is
        # reproducible; note this also resets the RNG for the caller.
        torch.manual_seed(42)
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.bn1 = BatchNorm1d(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels * 2)
        self.bn2 = BatchNorm1d(hidden_channels * 2)
        self.mlp = Sequential(
            Linear(hidden_channels * 2, hidden_channels),
            torch.nn.ReLU(),
            Linear(hidden_channels, 1)
        )
        self.dropout = dropout

    def forward(self, data):
        """Return a 1-D tensor of shape (num_graphs,) with one prediction per graph.

        ``data`` must expose ``x``, ``edge_index`` and ``batch`` (the node-to-graph
        assignment vector).
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index).relu()
        x = self.bn1(x)
        x = self.conv2(x, edge_index).relu()
        x = self.bn2(x)
        x = global_mean_pool(x, batch)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.mlp(x)
        # Drop only the trailing feature dimension. A bare squeeze() would also
        # remove the batch dimension when the batch holds a single graph,
        # yielding a 0-d tensor that no longer lines up with the targets.
        return x.squeeze(-1)



--- Step 2: Defining a more Regularized GNN Model ---


In [None]:
# 3. Stabilized Training and Evaluation Pipeline
print("\n--- Step 3: Setting up the STABILIZED Training Pipeline ---")

# Prefer the GPU whenever one is visible to torch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tunables for the split and the loaders.
TRAIN_FRACTION = 0.85
BATCH_SIZE = 64

# Seed before shuffling so the train/validation split is reproducible.
torch.manual_seed(42)
shuffled_dataset = train_dataset.shuffle()
train_size = int(TRAIN_FRACTION * len(shuffled_dataset))
train_data = shuffled_dataset[:train_size]
val_data = shuffled_dataset[train_size:]

# Only the training loader reshuffles every epoch.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model = GCN(num_node_features=train_dataset.num_node_features).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# The learning rate is multiplied by 0.7 after 5 epochs without improvement of
# the monitored metric, down to a floor of 1e-6.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.7,
    patience=5,
    min_lr=1e-6,
)
# Mean absolute error, computed on the log-transformed targets.
criterion = torch.nn.L1Loss()



--- Step 3: Setting up the STABILIZED Training Pipeline ---
Using device: cuda


In [None]:
from sklearn.metrics import mean_absolute_error

def train():
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and ``device``.
    Gradients are clipped to a global norm of 1.0 before each optimiser step.

    Returns:
        The loss averaged over all graphs in ``train_loader.dataset``.
    """
    model.train()
    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        batch_loss = criterion(model(batch), batch.y)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # criterion returns a per-batch mean, so weight it by the batch size
        # to obtain a correct per-graph average at the end.
        running_loss += batch_loss.item() * batch.num_graphs
    return running_loss / len(train_loader.dataset)

def evaluate(loader):
    """Compute the mean absolute error of ``model`` on ``loader`` in the original target scale.

    Predictions and targets are in log space, so both are exponentiated before
    the error is computed. Uses the module-level ``model`` and ``device``.

    Args:
        loader: Iterable of batches exposing ``to(device)`` and a ``y`` tensor.

    Returns:
        The mean absolute error between ``exp(target)`` and ``exp(prediction)``.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.eval()
    all_preds, all_reals = [], []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            # Flatten to 1-D first: a batch holding a single graph can produce
            # a 0-d output, which cannot be iterated or concatenated.
            all_preds.append(np.exp(out.reshape(-1).cpu().numpy()))
            all_reals.append(np.exp(data.y.reshape(-1).cpu().numpy()))
    if not all_preds:
        raise ValueError("Cannot evaluate on an empty loader.")
    return mean_absolute_error(np.concatenate(all_reals), np.concatenate(all_preds))

# 4. Run the Stabilized Training
print("\n--- Step 4: Starting STABILIZED GNN Model Training ---")

# Early-stopping bookkeeping: training halts once `patience` consecutive
# epochs pass without a new best validation MAE.
best_val_mae = float('inf')
patience_counter = 0
patience = 20

for epoch in range(1, 201):
    loss = train()
    val_mae = evaluate(val_loader)
    # The plateau scheduler is driven by the validation MAE.
    scheduler.step(val_mae)

    if not (val_mae < best_val_mae):
        patience_counter += 1
    else:
        # New best: remember it, checkpoint the weights, reset the counter.
        best_val_mae = val_mae
        torch.save(model.state_dict(), 'best_gnn_model_v2.pt')
        patience_counter = 0

    lr = optimizer.param_groups[0]['lr']
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val MAE: {val_mae:.4f}, Best Val MAE: {best_val_mae:.4f}, LR: {lr:.6f}')

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}.")
        break



--- Step 4: Starting STABILIZED GNN Model Training ---
Epoch: 001, Loss: 4.0308, Val MAE: 408.7085, Best Val MAE: 408.7085, LR: 0.000500
Epoch: 002, Loss: 1.2667, Val MAE: 823.5269, Best Val MAE: 408.7085, LR: 0.000500
Epoch: 003, Loss: 1.0187, Val MAE: 159.0577, Best Val MAE: 159.0577, LR: 0.000500
Epoch: 004, Loss: 1.0274, Val MAE: 128.4084, Best Val MAE: 128.4084, LR: 0.000500
Epoch: 005, Loss: 0.8890, Val MAE: 135.3870, Best Val MAE: 128.4084, LR: 0.000500
Epoch: 006, Loss: 0.8749, Val MAE: 113.2082, Best Val MAE: 113.2082, LR: 0.000500
Epoch: 007, Loss: 0.8564, Val MAE: 157.2214, Best Val MAE: 113.2082, LR: 0.000500
Epoch: 008, Loss: 0.7845, Val MAE: 94.6739, Best Val MAE: 94.6739, LR: 0.000500
Epoch: 009, Loss: 0.7563, Val MAE: 95.7280, Best Val MAE: 94.6739, LR: 0.000500
Epoch: 010, Loss: 0.7496, Val MAE: 112.2653, Best Val MAE: 94.6739, LR: 0.000500
Epoch: 011, Loss: 0.6744, Val MAE: 120.5371, Best Val MAE: 94.6739, LR: 0.000500
Epoch: 012, Loss: 0.6676, Val MAE: 100.6247, Bes