In [2]:
from dscribe.descriptors import SOAP
from ase import Atoms
import numpy as np

In [1]:
#import dataset 

import os
import pandas as pd

# === Paths ===
BASE_DIR = 'molecular-property-prediction-challenge'
DIPOLE_FILE = os.path.join(BASE_DIR, 'dipole_moments_train.csv')
STRUCTURE_DIR = os.path.join(BASE_DIR, 'structures_train')  #for test can change dir later

# === Load dipole moment labels ===
dipole_df = pd.read_csv(DIPOLE_FILE)

# === Function to load .xyz file and return DataFrame ===
def parse_xyz(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
    
    num_atoms = int(lines[0].strip())
    atom_lines = lines[2:2 + num_atoms]
    
    data = []
    for line in atom_lines:
        parts = line.strip().split()
        atom = parts[0]
        x, y, z = map(float, parts[1:])
        data.append((atom, x, y, z))
        
    return pd.DataFrame(data, columns=['atom', 'x', 'y', 'z'])

# === Load all structure files into a single DataFrame ===
def load_structures(structure_dir):
    all_data = []
    
    for filename in os.listdir(structure_dir):
        if filename.endswith('.xyz'):
            mol_name = filename.replace('.xyz', '')
            filepath = os.path.join(structure_dir, filename)
            df = parse_xyz(filepath)
            df['molecule_name'] = mol_name
            df['atom_index'] = range(len(df))  #keeping track of atom index, this is not informative 
            all_data.append(df)
    
    structures_df = pd.concat(all_data, ignore_index=True)
    return structures_df

# === Load training structure data ===
train_structures = load_structures(STRUCTURE_DIR)

# === Merge dipole moment target with training structures ===
train_df = train_structures.merge(dipole_df, on='molecule_name')

# Implementation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data, Dataset
from torch_geometric.nn import radius_graph
from torch_geometric.loader import DataLoader

import pandas as pd
import numpy as np

In [29]:
class MoleculeDataset(Dataset):
    def __init__(self, df):
        self.molecules = []
        grouped = df.groupby('molecule_name')

        for mol_name, group in grouped:
            group = group.sort_values('atom_index')  # Ensure consistent order

            # Node features: simple atom embedding (e.g., atomic number)
            atom_types = group['atom'].tolist()
            atom_numbers = [self.atom_to_num(a) for a in atom_types]
            x = torch.tensor(atom_numbers, dtype=torch.float).unsqueeze(1)  # shape: [num_atoms, 1]

            # Node positions
            pos = torch.tensor(group[['x', 'y', 'z']].values, dtype=torch.float)

            #####
            # Edge index (fully connected graph for simplicity)
            num_atoms = len(group)
            edges = [(i, j) for i in range(num_atoms) for j in range(num_atoms) if i != j]
            row, col = zip(*edges)  # unzip into two tuples

            row = torch.tensor(row, dtype=torch.long)
            col = torch.tensor(col, dtype=torch.long)

            edge_index = torch.stack([row, col], dim=0)

            # Edge attributes (relative distance vectors)
            pos_i = pos[row]
            pos_j = pos[col]
            edge_attr = pos_j - pos_i  # relative position vectors

            # Target (dipole moment vector)
            target = torch.tensor([group['dipole_moment'].values[0]], dtype=torch.float)

            data = Data(x=x, pos=pos, edge_index=edge_index.t().contiguous(), edge_attr=edge_attr, y=target)
            self.molecules.append(data)

    def __len__(self):
        return len(self.molecules)

    def __getitem__(self, idx):
        return self.molecules[idx]

    def atom_to_num(self, atom):
        periodic_table = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}  # extend as needed
        return periodic_table.get(atom, 0)  # fallback to 0 if unknown


In [50]:
# ----- Step 2: EDIGNN Model -----
import torch
import torch.nn as nn

class EDIGNNLayer(nn.Module):
    def __init__(self, in_channels, edge_dim, hidden_dim):
        super().__init__()
        
        self.in_channels = in_channels
        self.edge_dim = edge_dim
        self.hidden_dim = hidden_dim

        input_dim = 2 * in_channels + edge_dim  # this must match actual input shape
        print(f"Initializing EDIGNNLayer with message_mlp input_dim = {input_dim}")

        self.message_mlp = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        update_input_dim = in_channels + hidden_dim
        print(f"Initializing EDIGNNLayer with update_mlp input_dim = {update_input_dim}")

        self.update_mlp = nn.Sequential(
            nn.Linear(update_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, in_channels)
        )


    def forward(self, x, edge_index, edge_attr):
        edge_index = edge_index.T  # from [num_edges, 2] → [2, num_edges]

        row = edge_index[0]  # source
        col = edge_index[1]  # target / receiver

        h_i = x[row]  # source node features
        h_j = x[col]  # target node features

        # shape check:
        # h_i: [num_edges, in_channels]
        # h_j: [num_edges, in_channels]
        # edge_attr: [num_edges, edge_dim]
        print(f"edge_index shape: {edge_index.shape}")  # should be [2, num_edges]


        print(f"h_i shape: {h_i.shape}")
        print(f"h_j shape: {h_j.shape}")
        print(f"edge_attr shape: {edge_attr.shape}")


        edge_input = torch.cat([h_i, h_j, edge_attr], dim=-1)
        m_ij = self.message_mlp(edge_input)

        # Aggregate messages to the target node (col)
        agg = torch.zeros_like(x)
        agg = agg.index_add(0, col, m_ij)

        # Update node features
        x = self.update_mlp(torch.cat([x, agg], dim=-1))

        return x


class EDIGNN(nn.Module):
    def __init__(self, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Linear(1, hidden_dim)  # atomic number to hidden
        self.gnn1 = EDIGNNLayer(hidden_dim, 1, hidden_dim)
        self.gnn2 = EDIGNNLayer(hidden_dim, 1, hidden_dim)
        self.gnn3 = EDIGNNLayer(hidden_dim, 1, hidden_dim)

        self.readout = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 3)  # x, y, z components of dipole
        )

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        x = self.embedding(x)
        x = self.gnn1(x, edge_index, edge_attr)
        x = self.gnn2(x, edge_index, edge_attr)
        x = self.gnn3(x, edge_index, edge_attr)
        dipole_vectors = self.readout(x)
        mu = dipole_vectors.sum(dim=0)
        return mu

In [51]:
from tqdm import tqdm

# ----- Step 4: Setup -----
# Assuming you already have train_df loaded
full_dataset = MoleculeDataset(train_df)


split = int(0.8 * len(full_dataset))
train_dataset = full_dataset[:split]
val_dataset = full_dataset[split:]
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

model = EDIGNN()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Updated train function with tqdm
def train(model, loader, optimizer, criterion):
    model.train()
    running_loss = 0.0

    for graph in tqdm(loader, desc="Training", leave=False):
        optimizer.zero_grad()
        output = model(graph)
        loss = criterion(output, graph.y)  # access target via graph.y
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(loader)


# Updated evaluate function with tqdm
def evaluate(model, loader):
    model.eval()
    total_error = 0.0
    with torch.no_grad():
        for graph in tqdm(loader, desc="Evaluating", leave=False):
            output = model(graph)
            error = torch.abs(output - graph.y).sum().item()
            total_error += error
    return total_error / len(loader)


# ----- Step 5: Train -----
for epoch in range(20):
    loss = train(model, train_loader, optimizer, criterion)
    val_mae = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss = {loss:.4f}, Val MAE = {val_mae:.4f}")


Initializing EDIGNNLayer with message_mlp input_dim = 129
Initializing EDIGNNLayer with update_mlp input_dim = 128
Initializing EDIGNNLayer with message_mlp input_dim = 129
Initializing EDIGNNLayer with update_mlp input_dim = 128
Initializing EDIGNNLayer with message_mlp input_dim = 129
Initializing EDIGNNLayer with update_mlp input_dim = 128


                                                   

edge_index shape: torch.Size([2, 240])
h_i shape: torch.Size([240, 64])
h_j shape: torch.Size([240, 64])
edge_attr shape: torch.Size([240, 3])




RuntimeError: mat1 and mat2 shapes cannot be multiplied (240x131 and 129x64)

In [23]:
print(train_df.columns.tolist())


['atom', 'x', 'y', 'z', 'molecule_name', 'atom_index', 'dipole_moment']
