<a href="https://colab.research.google.com/github/abdulwasaeee/NovaMol/blob/main/NovaMol%2BChemBL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================================
# Project: NovaMol 2.0 - A Drug Discovery Engine with ChEMBL
#
# Objective:
# This pipeline specializes in pharmaceutical discovery. It uses the ChEMBL
# database to train a multi-task GNN to predict drug-like properties and
# bioactivity. The goal is to generate novel molecules and evaluate their
# potential as high-quality drug candidates.
# =============================================================================

# --- Step 1: Setup and Installations ---
import subprocess
import sys

def install_packages():
    """Installs required packages, including the ChEMBL client."""
    print("--- Checking and installing dependencies ---")
    standard_packages = [
        "rdkit", "pandas", "scikit-learn", "tqdm", "torch",
        "torchvision", "torchaudio", "selfies", "chembl_webresource_client"
    ]
    for package in standard_packages:
        try:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package])
        except subprocess.CalledProcessError:
            print(f"ERROR: Failed to install {package}. Please try installing it manually.")
            sys.exit(1)
    try:
        import torch
        TORCH_VERSION = torch.__version__.split('+')[0]
        CUDA_VERSION = torch.version.cuda
        CUDA_STR = f"cu{CUDA_VERSION.replace('.', '')}" if CUDA_VERSION else 'cpu'
        print(f"Detected PyTorch {TORCH_VERSION} and device type {CUDA_STR}.")
        PYG_URL = f'https://data.pyg.org/whl/torch-{TORCH_VERSION}+{CUDA_STR}.html'
        pyg_packages = ['torch-scatter', 'torch-sparse', 'torch-cluster', 'torch-geometric']
        for package in pyg_packages:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', package, '-f', PYG_URL])
        print("--- All dependencies installed successfully. ---")
    except Exception as e:
        print(f"ERROR: Failed to install PyG packages: {e}")
        sys.exit(1)

install_packages()

# --- Step 2: Imports and Global Configuration ---
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv, global_add_pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from chembl_webresource_client.new_client import new_client
import selfies as sf
import requests

# --- Configuration for ChEMBL ---
PROPERTIES_TO_PREDICT = ['pchembl_value', 'logp', 'molecular_weight']
N_PROPERTIES = len(PROPERTIES_TO_PREDICT)
N_MOLECULES_GNN = 10000
N_MOLECULES_RNN = 20000
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
N_EPOCHS_GNN = 50
N_EPOCHS_RNN = 30
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Step 3: Define Helper Functions ---

def get_chembl_data(target_id='CHEMBL203', min_pchembl=5.0):
    """Downloads and processes bioactivity data from ChEMBL for a specific target."""
    print(f"--- Downloading data for target {target_id} from ChEMBL ---")
    activity = new_client.activity
    res = activity.filter(target_chembl_id=target_id, standard_type="IC50", pchembl_value__gte=min_pchembl)
    df = pd.DataFrame(res)

    print("these is the head of the dataset: before processing")
    print(df.head())

    df = df[['canonical_smiles', 'pchembl_value']]
    df = df.dropna().drop_duplicates(subset=['canonical_smiles'])
    df['pchembl_value'] = pd.to_numeric(df['pchembl_value'])

    print(f"Downloaded and cleaned {len(df)} unique, active compounds.")

    molecules = [Chem.MolFromSmiles(smi) for smi in df['canonical_smiles']]
    df['logp'] = [Descriptors.MolLogP(m) if m else None for m in molecules]
    df['molecular_weight'] = [Descriptors.MolWt(m) if m else None for m in molecules]

    df.dropna(inplace=True)

    print("these is the head of the dataset: after processing")
    print(df.head())

    df = df.rename(columns={'canonical_smiles': 'smiles'})
    print(f"Processed down to {len(df)} compounds with all required properties.")
    return df

def smiles_to_graph(smiles: str):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None: return None
    atom_features = [[
        atom.GetAtomicNum(), atom.GetFormalCharge(), float(atom.GetHybridization()),
        float(atom.GetIsAromatic()), atom.GetTotalNumHs(), atom.GetTotalValence()
    ] for atom in mol.GetAtoms()]
    x = torch.tensor(atom_features, dtype=torch.float)
    if mol.GetNumBonds() > 0:
        row, col = [], []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            row.extend([start, end]); col.extend([end, start])
        edge_index = torch.tensor([row, col], dtype=torch.long)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
    return Data(x=x, edge_index=edge_index)

def evaluate_multitask_gnn(loader, model, scalers):
    """Evaluates the Multi-Task GNN and returns MAE for each property."""
    model.eval()
    predictions, targets_list = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(DEVICE)
            out = model(batch)
            predictions.append(out.cpu().numpy())
            targets_list.append(batch.y.cpu().numpy())

    predictions = np.vstack(predictions)

    # SHAPE ERROR FIX: Concatenate the list of flat target arrays and then reshape
    targets = np.concatenate(targets_list).reshape(predictions.shape)

    maes = {}
    for i, prop in enumerate(PROPERTIES_TO_PREDICT):
        pred_real = scalers[prop].inverse_transform(predictions[:, i].reshape(-1, 1)).flatten()
        targ_real = scalers[prop].inverse_transform(targets[:, i].reshape(-1, 1)).flatten()
        maes[prop] = np.mean(np.abs(pred_real - targ_real))
    return maes


def analyze_novel_molecule_robust(smiles: str):
    results = {"Complexity_Score": "N/A", "Flags": []}
    try:
        mol = Chem.MolFromSmiles(smiles)
        if not mol: return results
        num_rings = rdMolDescriptors.CalcNumRings(mol)
        num_spiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
        num_bridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
        complexity_score = (num_rings * 1.0 + num_spiro * 2.5 + num_bridgehead * 2.5)
        results["Complexity_Score"] = f"{complexity_score:.2f}"
    except: pass
    return results

def suggest_drug_candidate_potential(predicted_properties: dict, calculated_properties: dict):
    """Provides a detailed, multi-level assessment of a molecule's drug potential."""
    insights, flags = [], []
    pchembl = predicted_properties.get('pchembl_value', 0)
    logp = predicted_properties.get('logp', 99)
    mw = predicted_properties.get('molecular_weight', 999)

    if pchembl >= 8.0: insights.append("Exceptional Potency (Lead Candidate)")
    elif pchembl >= 7.0: insights.append("High Potency (Strong Hit)")
    elif pchembl >= 6.0: insights.append("Good Potency (Promising Hit)")
    elif pchembl >= 5.0: insights.append("Active (Hit, Needs Optimization)")
    else: flags.append("Low Predicted Potency")

    if mw > 500: flags.append("Poor Oral Bioavailability (MW > 500)")
    elif mw < 200: insights.append("Fragment-like Size (Good for Fragment-Based Design)")
    else: insights.append("Ideal Drug-like Size")

    if logp > 5.0: flags.append("Poor Solubility (logP > 5)")
    elif logp < 0: flags.append("Too Polar (Poor Permeability)")
    elif 1.0 <= logp <= 3.5: insights.append("Optimal Solubility Profile")
    else: insights.append("Acceptable Solubility")

    if not flags and "High Potency" in " ".join(insights) and "Ideal" in " ".join(insights):
        final_assessment = "STRONG CANDIDATE: " + ", ".join(insights)
    elif not flags and "Promising" in " ".join(insights):
        final_assessment = "PROMISING CANDIDATE: " + ", ".join(insights)
    else:
        final_assessment = ", ".join(insights)

    if flags:
        final_assessment += " | FLAGS: " + ", ".join(flags)

    return final_assessment if final_assessment else "General Bioactive Compound"

# --- Step 4: Define Model Architectures ---
class MultiTaskGNN(nn.Module):
    def __init__(self, in_dim, hidden_dim=128, out_dim=3):
        super().__init__()
        nn1 = nn.Sequential(nn.Linear(in_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        self.conv1 = GINConv(nn1)
        nn2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
        self.conv2 = GINConv(nn2)
        self.lin1 = nn.Linear(hidden_dim, hidden_dim)
        self.lin2 = nn.Linear(hidden_dim, out_dim)
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()
        x = global_add_pool(x, batch)
        x = self.lin1(x).relu()
        return self.lin2(x)

class SELFIES_RNN(nn.Module):
    def __init__(self, vocab_size, emb_size=128, hidden_size=512, num_layers=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.rnn = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, vocab_size)
    def forward(self, x, hidden=None):
        emb = self.embedding(x); out, hidden = self.rnn(emb, hidden)
        return self.fc(out), hidden

def sample_selfies(model, token2idx, idx2token, max_len=50, temperature=1.0):
    model.eval()
    start_token = '[C]'
    x = torch.tensor([[token2idx[start_token]]], device=DEVICE)
    hidden = None
    tokens = [start_token]
    for _ in range(max_len):
        out, hidden = model(x, hidden)
        probs = F.softmax(out.squeeze() / temperature, dim=-1)
        idx = torch.multinomial(probs, 1).item()
        if idx == 0: break
        tokens.append(idx2token[idx])
        x = torch.tensor([[idx]], device=DEVICE)
    try: return sf.decoder(''.join(tokens))
    except: return None

# =============================================================================
# Main Execution Block
# =============================================================================
if __name__ == "__main__":
    print(f"\nProject starting on device: {DEVICE}\n")
    df_main = get_chembl_data()

    print("\n--- 1. Preparing Data for Multi-Task GNN ---")
    gnn_data_list = []
    subset_df_gnn = df_main.head(N_MOLECULES_GNN)
    for _, row in tqdm(subset_df_gnn.iterrows(), total=subset_df_gnn.shape[0], desc="Creating GNN graphs"):
        graph = smiles_to_graph(row['smiles'])
        if graph:
            graph.y = torch.tensor([row[p] for p in PROPERTIES_TO_PREDICT], dtype=torch.float)
            graph.smiles = row['smiles']
            gnn_data_list.append(graph)

    train_val_data, test_data = train_test_split(gnn_data_list, test_size=0.15, random_state=42)
    train_data, val_data = train_test_split(train_val_data, test_size=0.17, random_state=42)

    scalers = {}
    for i, prop in enumerate(PROPERTIES_TO_PREDICT):
        targets = np.array([d.y[i].item() for d in train_data]).reshape(-1, 1)
        scalers[prop] = StandardScaler().fit(targets)
        for d in train_val_data + test_data:
            d.y[i] = torch.tensor(scalers[prop].transform([[d.y[i].item()]])[0,0])

    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)
    test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)
    print(f"Data split: {len(train_data)} train, {len(val_data)} validation, {len(test_data)} test.")

    print("\n--- 2. Training Multi-Task Predictive GNN ---")
    gnn_model = MultiTaskGNN(in_dim=train_data[0].num_node_features, out_dim=N_PROPERTIES).to(DEVICE)
    optimizer_gnn = torch.optim.Adam(gnn_model.parameters(), lr=LEARNING_RATE)
    loss_fn_gnn = nn.MSELoss()
    best_val_mae_sum = float('inf')
    for epoch in range(1, N_EPOCHS_GNN + 1):
        gnn_model.train()
        for batch in train_loader:
            batch = batch.to(DEVICE); optimizer_gnn.zero_grad()
            out = gnn_model(batch)
            target = batch.y.view_as(out)
            loss = loss_fn_gnn(out, target)
            loss.backward(); optimizer_gnn.step()
        val_maes = evaluate_multitask_gnn(val_loader, gnn_model, scalers)
        current_mae_sum = sum(val_maes.values())
        if current_mae_sum < best_val_mae_sum:
            best_val_mae_sum = current_mae_sum
            torch.save(gnn_model.state_dict(), 'best_gnn_model.pth')
        print(f"GNN Epoch {epoch:02d} | Val MAEs: " + ", ".join([f"{k[:10]}={v:.3f}" for k,v in val_maes.items()]))

    gnn_model.load_state_dict(torch.load('best_gnn_model.pth'))

    print("\n--- 3. Training Generative RNN ---")
    selfies_list = [sf.encoder(smi) for smi in tqdm(df_main['smiles'].head(N_MOLECULES_RNN), desc="Encoding to SELFIES") if smi and sf.encoder(smi)]
    all_tokens = set(t for s in selfies_list for t in sf.split_selfies(s))
    token2idx = {t: i + 1 for i, t in enumerate(sorted(all_tokens))}; token2idx['<PAD>'] = 0
    idx2token = {i: t for t, i in token2idx.items()}; vocab_size = len(token2idx)
    max_len = max(len(list(sf.split_selfies(s))) for s in selfies_list if s)
    selfies_tensor = torch.stack([torch.tensor([token2idx.get(t, 0) for t in list(sf.split_selfies(s))] + [0] * (max_len - len(list(sf.split_selfies(s)))), dtype=torch.long) for s in selfies_list])
    rnn_dataset = torch.utils.data.TensorDataset(selfies_tensor[:, :-1], selfies_tensor[:, 1:])
    rnn_loader = DataLoader(rnn_dataset, batch_size=128, shuffle=True, drop_last=True)
    rnn_model = SELFIES_RNN(vocab_size).to(DEVICE)
    opt_rnn = torch.optim.Adam(rnn_model.parameters(), lr=1e-3)
    for epoch in range(1, N_EPOCHS_RNN + 1):
        rnn_model.train()
        for x, y in rnn_loader:
            # CORRECTED LINE: Move both x and y to the device
            x, y = x.to(DEVICE), y.to(DEVICE)
            opt_rnn.zero_grad()
            out, _ = rnn_model(x)
            loss = F.cross_entropy(out.reshape(-1, vocab_size), y.reshape(-1), ignore_index=0)
            loss.backward()
            opt_rnn.step()
        print(f"RNN Epoch {epoch:02d}, Loss: {loss.item():.4f}")

    print("\n" + "="*80)
    print("                    FINAL DRUG DISCOVERY ANALYSIS")
    print("="*80)

    print("\n--- A. GNN PERFORMANCE ON UNSEEN TEST DATA ---")
    test_maes = evaluate_multitask_gnn(test_loader, gnn_model, scalers)
    for prop, mae in test_maes.items():
        print(f"  - MAE for {prop:<20}: {mae:.4f}")

    print("\n--- B. DRUG CANDIDATE ANALYSIS OF NOVEL MOLECULES ---")
    generated_smiles = [sample_selfies(rnn_model, token2idx, idx2token, temperature=0.95) for _ in tqdm(range(200), desc="Generating Molecules")]
    valid_smiles = [s for s in generated_smiles if s and Chem.MolFromSmiles(s)]
    novel_molecules_smiles = valid_smiles

    if novel_molecules_smiles:
        novel_graphs = [smiles_to_graph(s) for s in novel_molecules_smiles]
        valid_novel_graphs_data = [(smi, g) for smi, g in zip(novel_molecules_smiles, novel_graphs) if g is not None]

        if valid_novel_graphs_data:
            smiles_for_analysis = [smi for smi, g in valid_novel_graphs_data]
            graphs_for_analysis = [g for smi, g in valid_novel_graphs_data]
            predict_loader = DataLoader(graphs_for_analysis, batch_size=len(graphs_for_analysis))
            batch = next(iter(predict_loader)).to(DEVICE)
            preds_scaled_novel = gnn_model(batch).cpu().detach().numpy()

            analysis_results = []
            for i, smiles in enumerate(smiles_for_analysis):
                predicted_props = {prop: scalers[prop].inverse_transform(preds_scaled_novel[i, j].reshape(1, -1))[0,0] for j, prop in enumerate(PROPERTIES_TO_PREDICT)}
                complexity_data = analyze_novel_molecule_robust(smiles)
                assessment = suggest_drug_candidate_potential(predicted_props, complexity_data)

                analysis_results.append([
                    smiles,
                    f"{predicted_props['pchembl_value']:.2f}",
                    f"{predicted_props['logp']:.2f}",
                    f"{predicted_props['molecular_weight']:.2f}",
                    complexity_data["Complexity_Score"],
                    assessment
                ])

            headers = ["Novel SMILES", "Pred. pChEMBL", "Pred. logP", "Pred. MW", "Complexity Score", "Drug Candidate Assessment"]
            df_analysis = pd.DataFrame(analysis_results, columns=headers)
            print(df_analysis.head(15).to_string(index=False))
    else:
        print("No valid novel molecules were generated to analyze.")

    print("\n" + "="*80)
    print("                              PROJECT COMPLETE")
    print("="*80)



--- Checking and installing dependencies ---
Detected PyTorch 2.8.0 and device type cu126.
--- All dependencies installed successfully. ---

Project starting on device: cuda

--- Downloading data for target CHEMBL203 from ChEMBL ---
these is the head of the dataset: before processing
  action_type activity_comment  activity_id activity_properties  \
0        None             None        32260                  []   
1        None             None        32263                  []   
2        None             None        32265                  []   
3        None             None        32267                  []   
4        None             None        32270                  []   

  assay_chembl_id                                  assay_description  \
0    CHEMBL674637  Inhibitory activity towards tyrosine phosphory...   
1    CHEMBL621151  Inhibition of autophosphorylation of human epi...   
2    CHEMBL615325  Inhibition of ligand-induced proliferation in ...   
3    CHEMBL674637  Inhib

Creating GNN graphs: 100%|██████████| 9739/9739 [00:10<00:00, 923.03it/s]


Data split: 6870 train, 1408 validation, 1461 test.

--- 2. Training Multi-Task Predictive GNN ---
GNN Epoch 01 | Val MAEs: pchembl_va=0.860, logp=0.718, molecular_=57.237
GNN Epoch 02 | Val MAEs: pchembl_va=0.927, logp=0.631, molecular_=51.264
GNN Epoch 03 | Val MAEs: pchembl_va=0.824, logp=0.575, molecular_=47.355
GNN Epoch 04 | Val MAEs: pchembl_va=0.938, logp=0.679, molecular_=48.644
GNN Epoch 05 | Val MAEs: pchembl_va=0.799, logp=0.578, molecular_=42.336
GNN Epoch 06 | Val MAEs: pchembl_va=0.841, logp=0.653, molecular_=39.121
GNN Epoch 07 | Val MAEs: pchembl_va=0.789, logp=0.513, molecular_=37.571
GNN Epoch 08 | Val MAEs: pchembl_va=0.748, logp=0.490, molecular_=33.774
GNN Epoch 09 | Val MAEs: pchembl_va=0.757, logp=0.450, molecular_=34.491
GNN Epoch 10 | Val MAEs: pchembl_va=0.898, logp=0.483, molecular_=35.277
GNN Epoch 11 | Val MAEs: pchembl_va=0.720, logp=0.547, molecular_=33.798
GNN Epoch 12 | Val MAEs: pchembl_va=0.773, logp=0.659, molecular_=33.064
GNN Epoch 13 | Val MAEs: 

Encoding to SELFIES: 100%|██████████| 9739/9739 [00:09<00:00, 1053.01it/s]


RNN Epoch 01, Loss: 2.0646
RNN Epoch 02, Loss: 1.5604
RNN Epoch 03, Loss: 1.2316
RNN Epoch 04, Loss: 1.0630
RNN Epoch 05, Loss: 0.9073
RNN Epoch 06, Loss: 0.8361
RNN Epoch 07, Loss: 0.7683
RNN Epoch 08, Loss: 0.7028
RNN Epoch 09, Loss: 0.6762
RNN Epoch 10, Loss: 0.6468
RNN Epoch 11, Loss: 0.5853
RNN Epoch 12, Loss: 0.5824
RNN Epoch 13, Loss: 0.5381
RNN Epoch 14, Loss: 0.5292
RNN Epoch 15, Loss: 0.4640
RNN Epoch 16, Loss: 0.4703
RNN Epoch 17, Loss: 0.4281
RNN Epoch 18, Loss: 0.4276
RNN Epoch 19, Loss: 0.4146
RNN Epoch 20, Loss: 0.4197
RNN Epoch 21, Loss: 0.3632
RNN Epoch 22, Loss: 0.3891
RNN Epoch 23, Loss: 0.3774
RNN Epoch 24, Loss: 0.3621
RNN Epoch 25, Loss: 0.3622
RNN Epoch 26, Loss: 0.3591
RNN Epoch 27, Loss: 0.3360
RNN Epoch 28, Loss: 0.3445
RNN Epoch 29, Loss: 0.3258
RNN Epoch 30, Loss: 0.3271

                    FINAL DRUG DISCOVERY ANALYSIS

--- A. GNN PERFORMANCE ON UNSEEN TEST DATA ---
  - MAE for pchembl_value       : 0.6167
  - MAE for logp                : 0.2194
  - MAE f

Generating Molecules: 100%|██████████| 200/200 [00:06<00:00, 28.95it/s]
[08:59:22] Explicit valence for atom # 23 Br, 3, is greater than permitted
[08:59:22] Explicit valence for atom # 26 Br, 2, is greater than permitted


                                                   Novel SMILES Pred. pChEMBL Pred. logP Pred. MW Complexity Score                                                                                 Drug Candidate Assessment
               C1=CC(=O)NC=CCC=C1NC2=NC(NC3=CC=CC=C3)=NC=C2CC=C          5.77       4.02   391.36             3.00                             Active (Hit, Needs Optimization), Ideal Drug-like Size, Acceptable Solubility
            COC=CC(Cl)=CC(=C)OC=NONCC1=CC=C(Cl)C=C1C=CNC=CC=CCC          4.68       3.91   364.96             1.00                                Ideal Drug-like Size, Acceptable Solubility | FLAGS: Low Predicted Potency
                            CCCCCCNC(=O)COC=C1C(NCC2=CC=CC12)=N          6.76       2.99   420.07             2.00       PROMISING CANDIDATE: Good Potency (Promising Hit), Ideal Drug-like Size, Optimal Solubility Profile
      C1OCCOC=CC=NC=NC1(NC=C2C=C(OCC3=CC=CC=C3OC)C(Cl)=C2C)C=CN          7.74       2.09   425.80             3.00  

In [2]:
# =============================================================================
#                  Final Project Statistical Summary
# =============================================================================
#
# Objective:
# To provide a comprehensive, "full-fledged" summary of the entire project's
# performance, validating both the predictive GNN and the generative RNN
# with key statistics from the training and generation phases.
# =============================================================================

import numpy as np
import pandas as pd

# This cell assumes that all variables from the main pipeline script are in memory.

print("\n" + "="*80)
print("                    NOVA-MOL PROJECT: FINAL STATISTICAL REPORT")
print("="*80)

# --- 1. Dataset & Training Configuration ---
print("\n--- 1. DATASET & TRAINING CONFIGURATION ---")
# Ensure variables exist before printing
if 'DEVICE' in locals():
    print(f"  - AI Target Device:                  {str(DEVICE).upper()}")
if 'train_data' in locals():
    print(f"  - GNN Training Molecules:            {len(train_data)}")
if 'val_data' in locals():
    print(f"  - GNN Validation Molecules:          {len(val_data)}")
if 'test_data' in locals():
    print(f"  - GNN Test Molecules:                {len(test_data)}")
if 'N_EPOCHS_GNN' in locals():
    print(f"  - GNN Training Epochs:               {N_EPOCHS_GNN}")
if 'selfies_list' in locals():
    print(f"  --------------------------------------------------")
    print(f"  - RNN Training Molecules:            {len(selfies_list)}")
if 'N_EPOCHS_RNN' in locals():
    print(f"  - RNN Training Epochs:               {N_EPOCHS_RNN}")


# --- 2. Predictive Model (GNN) Validation ---
print("\n--- 2. PREDICTIVE GNN MODEL VALIDATION ---")
print("The GNN's performance was rigorously validated on the held-out, unseen Test Set.")
print("The Mean Absolute Error (MAE) measures the average error of our predictions.")
print("\n  **Final GNN Performance on Test Set:**")
if 'test_maes' in locals():
    for prop, mae in test_maes.items():
        print(f"    - MAE for {prop:<20}: {mae:.4f}")
else:
    print("    - GNN test performance data not available.")


# --- 3. Generative Model (RNN) Validation ---
print("\n--- 3. GENERATIVE RNN MODEL VALIDATION ---")
print("The RNN's performance is validated by the quality and novelty of the molecules it creates.")

if 'valid_smiles' in locals():
    total_generated = len(valid_smiles)
    num_novel = len(novel_molecules_smiles) # In this script, novel_molecules_smiles is currently all valid_smiles

    print(f"\n  **Generation Statistics:**")
    print(f"    - Total Valid Molecules Generated:   {total_generated}")
    # The novelty check against existing molecules (e.g., PubChem) is not implemented.
    # Reporting 'Novel Molecules Discovered' based on just the valid generated count
    # as the comparison to an existing database is not performed.
    print(f"    - Novel Molecules Reported:          {num_novel} (Note: No external novelty check performed)")

    if num_novel > 0:
        print("\n  **Conclusion:** The RNN has successfully generated valid molecular structures.")
        print("  Further analysis (e.g., external database search for novelty, in vitro testing)")
        print("  is required to confirm true novelty and potential as drug candidates.")
    else:
         print("\n  **Conclusion:** The RNN did not generate any valid molecules in this run.")
else:
    print("\n  **Generation Statistics:**")
    print("    - No valid molecules were generated in this run.")


print("\n" + "="*80)
print("                              END OF REPORT")
print("="*80)


                    NOVA-MOL PROJECT: FINAL STATISTICAL REPORT

--- 1. DATASET & TRAINING CONFIGURATION ---
  - AI Target Device:                  CUDA
  - GNN Training Molecules:            6870
  - GNN Validation Molecules:          1408
  - GNN Test Molecules:                1461
  - GNN Training Epochs:               50
  --------------------------------------------------
  - RNN Training Molecules:            9739
  - RNN Training Epochs:               30

--- 2. PREDICTIVE GNN MODEL VALIDATION ---
The GNN's performance was rigorously validated on the held-out, unseen Test Set.
The Mean Absolute Error (MAE) measures the average error of our predictions.

  **Final GNN Performance on Test Set:**
    - MAE for pchembl_value       : 0.6167
    - MAE for logp                : 0.2194
    - MAE for molecular_weight    : 14.2096

--- 3. GENERATIVE RNN MODEL VALIDATION ---
The RNN's performance is validated by the quality and novelty of the molecules it creates.

  **Generation Statistic

In [3]:
# =============================================================================
#         GNN Performance Spotlight (Predicted vs. Actual)
# =============================================================================
#
# Objective:
# To provide a clear, intuitive proof of the GNN's accuracy, this script
# takes a random sample of molecules from the unseen test set and displays
# their predicted properties side-by-side with the true, known values
# from the dataset.
# =============================================================================

import pandas as pd
import numpy as np
from IPython.display import display, HTML
import torch

# This cell assumes the main script has run and these variables are in memory:
# - test_data: The held-out test set of graph objects.
# - gnn_model: The trained GNN model.
# - scalers: The dictionary of fitted StandardScaler objects.
# - PROPERTIES_TO_PREDICT: List of property names.
# - DEVICE: The active torch device ('cuda' or 'cpu').
# - DataLoader: PyG DataLoader

print("\n" + "="*80)
print("              GNN Performance Spotlight: Predicted vs. Actual Values")
print("="*80)

if 'test_data' not in locals() or not test_data:
    print("\n'test_data' not found. Please ensure the main pipeline has been run successfully.")
elif 'gnn_model' not in locals():
    print("\n'gnn_model' not found. Please ensure the GNN model has been trained.")
elif 'scalers' not in locals():
     print("\n'scalers' not found. Please ensure the scalers were fitted.")
else:
    # --- 1. Take a random sample from the test set ---
    sample_size = min(15, len(test_data))
    sample_indices = np.random.choice(len(test_data), sample_size, replace=False)
    sample_data = [test_data[i] for i in sample_indices]

    sample_loader = DataLoader(sample_data, batch_size=sample_size)

    # --- 2. Predict with the GNN ---
    gnn_model.eval()
    predictions_scaled = []
    actual_scaled = []
    with torch.no_grad():
        for batch in sample_loader:
            batch = batch.to(DEVICE)
            out = gnn_model(batch)
            predictions_scaled.append(out.cpu().numpy())
            # Reshape batch.y to (batch_size, N_PROPERTIES) before appending
            actual_scaled.append(batch.y.cpu().numpy().reshape(-1, len(PROPERTIES_TO_PREDICT)))

    predictions_scaled = np.vstack(predictions_scaled)
    actual_scaled = np.vstack(actual_scaled)


    # --- 3. Un-scale predictions and actual values and build the comparison table ---
    comparison_results = []
    headers = ["SMILES"]
    for prop in PROPERTIES_TO_PREDICT:
        headers.append(f"Pred. {prop}")
        headers.append(f"Actual {prop}")
        headers.append(f"Abs Error ({prop})")

    for i in range(sample_size):
        row_data = [sample_data[i].smiles]
        for j, prop in enumerate(PROPERTIES_TO_PREDICT):
            predicted_real = scalers[prop].inverse_transform(predictions_scaled[i, j].reshape(1, -1))[0,0]
            actual_real = scalers[prop].inverse_transform(actual_scaled[i, j].reshape(1, -1))[0,0]
            error = abs(predicted_real - actual_real)
            row_data.extend([f"{predicted_real:.4f}", f"{actual_real:.4f}", f"{error:.4f}"])
        comparison_results.append(row_data)


    df_comparison = pd.DataFrame(comparison_results, columns=headers)

    # --- 4. Display the final report ---
    print(f"\n--- Comparison Table for {sample_size} Random Molecules from the Unseen Test Set ---")

    # Use display(HTML(...)) for clean notebook formatting
    display(HTML(df_comparison.to_html(index=False, justify='left')))

print("\n" + "="*80)


              GNN Performance Spotlight: Predicted vs. Actual Values

--- Comparison Table for 15 Random Molecules from the Unseen Test Set ---


SMILES,Pred. pchembl_value,Actual pchembl_value,Abs Error (pchembl_value),Pred. logp,Actual logp,Abs Error (logp),Pred. molecular_weight,Actual molecular_weight,Abs Error (molecular_weight)
CCCCCCCCCCNC(=O)COc1cc(O)c2c(=O)cc(-c3ccccc3)oc2c1,5.5812,5.67,0.0889,5.7954,5.8013,0.0059,446.1235,451.563,5.4395
C=CC(=O)N1CC[C@H](n2cc(-c3ccncc3)c(-c3cccc(/C=C/c4ncc(C5CC5)o4)c3)n2)C1,5.3539,8.22,2.8661,5.267,5.6073,0.3403,491.4383,477.568,13.8703
CC(C)(O)Cn1/c(=N/C(=O)c2ccnc(-c3cccnc3)c2)[nH]c2cc(CC(=O)N3CCCCC3)ccc21,7.4393,6.75,0.6893,3.6958,3.4934,0.2024,525.3917,512.614,12.7777
O=C(/C=C/CN1CCCCC1)N1CCOc2c1ccc1ncnc(Nc3ccc(Oc4ccn5ncnc5c4)c(Cl)c3)c21,6.43,6.56,0.13,5.3964,5.6293,0.2329,558.7992,597.079,38.2798
Cc1ccc(C2=NN(C3=NC(=O)CS3)C(c3ccccc3)C2)cc1,6.0426,5.97,0.0726,3.598,3.7755,0.1776,326.627,335.432,8.805
Oc1c(I)cc(I)c2cccnc12,7.5814,5.72,1.8614,2.2287,3.1496,0.9209,387.6527,396.953,9.3003
CCCCOc1ncnc2[nH]c(-c3ccc(N4CCN(C)CC4)cc3)c(-c3cccc(NC(=O)CC)c3)c12,7.3147,8.03,0.7153,5.5099,5.5711,0.0612,521.1715,512.658,8.5135
COc1cc2c(Oc3ccc(NC(=O)c4c(C)n(C)n(-c5ccccc5)c4=O)cc3F)ccnc2cc1OCCCN1CCN(C)CC1,6.7867,5.12,1.6667,5.0627,5.2412,0.1785,657.3498,654.743,2.6068
CCN1CCC(N2C=C(C(Nc3cc(Cl)c4ncc(C#N)c(Nc5ccc(F)c(Cl)c5)c4c3)c3cncnc3)NN2)CC1,8.5598,8.3,0.2598,5.9028,5.8897,0.0132,593.236,619.536,26.3
CCCN(CC#CC(=O)Nc1ccc2ncc(C#N)c(Nc3cccc(Br)c3)c2c1)CCC,6.1682,6.89,0.7218,5.2052,5.6765,0.4713,472.6997,504.432,31.7323



