<a href="https://colab.research.google.com/github/Bharghavi-2006/India_Open_Hackathon/blob/main/code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
import numpy as np

ModuleNotFoundError: No module named 'torch_geometric'

In [None]:
def df_to_pyg_graph(df, feature_cols, label_col, k=8, impute_zero=True, add_missing_mask=True, seed=42):
    """Turn a tabular DataFrame into a single k-NN graph for node classification.

    Each row of ``df`` becomes one node. Node features are the standardized
    ``feature_cols`` (optionally followed by a 0/1 missing-value indicator per
    feature), edges connect each row to its ``k`` nearest neighbours in the
    standardized feature space, and the rows are randomly split into
    train/val/test masks (70% / 15% / remainder).

    Args:
        df: DataFrame holding the feature and label columns.
        feature_cols: Names of the columns used as node features.
        label_col: Name of the column holding the class label (cast to int).
        k: Number of neighbours per node when building the graph.
        impute_zero: If True, replace missing feature values with 0.0 before
            scaling.
        add_missing_mask: If True, append the missing-value indicator matrix
            to the scaled features.
        seed: Seed for the random train/val/test split.

    Returns:
        A ``(data, scaler)`` tuple: the ``torch_geometric.data.Data`` object
        (with ``x``, ``y``, ``edge_index``, ``train_mask``, ``val_mask`` and
        ``test_mask``) and the fitted ``StandardScaler``.
    """
    X = df[feature_cols].astype(float).copy()
    y = df[label_col].astype(int).values

    # Record where values were missing *before* imputing, so the indicator
    # still carries that information after the NaNs are replaced.
    missing_mask = X.isna().astype(int).values
    if impute_zero:
        X = X.fillna(0.0)

    # NOTE: the scaler is fitted on every row, including the rows that later
    # end up in the validation and test masks.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X.values)

    if add_missing_mask:
        X_all = np.hstack([X_scaled, missing_mask])
    else:
        X_all = X_scaled

    # Neighbours are searched on the scaled features only (without the
    # missing-value indicator columns).
    A = kneighbors_graph(X_scaled, n_neighbors=k, mode="connectivity", include_self=False, n_jobs=-1)
    # k-NN relations are directed; take the element-wise maximum with the
    # transpose so every edge is present in both directions.
    A = A.maximum(A.T)

    row, col = A.nonzero()
    edge_index = torch.tensor(np.vstack([row, col]), dtype=torch.long)

    data = Data(
        x=torch.tensor(X_all, dtype=torch.float),
        y=torch.tensor(y, dtype=torch.long),
        edge_index=edge_index,
    )

    # Seeded shuffle of the node indices, then a 70/15/15 cut.
    rng = np.random.default_rng(seed)
    idx = np.arange(len(y))
    rng.shuffle(idx)
    n = len(idx)
    n_train, n_val = int(0.7 * n), int(0.15 * n)
    train_idx = idx[:n_train]
    val_idx = idx[n_train:n_train + n_val]
    test_idx = idx[n_train + n_val:]

    def _bool_mask(indices):
        # Boolean node mask that is True exactly at the given node indices.
        mask = torch.zeros(n, dtype=torch.bool)
        mask[torch.as_tensor(indices, dtype=torch.long)] = True
        return mask

    data.train_mask = _bool_mask(train_idx)
    data.val_mask = _bool_mask(val_idx)
    data.test_mask = _bool_mask(test_idx)

    return data, scaler


# Backward-compatible alias for the original (misspelled) function name.
df_to_pyg_praph = df_to_pyg_graph

In [None]:
# Feature column names: electrolyte_f0 ... electrolyte_f30 (31 columns).
electrolyte_features = [f"electrolyte_f{i}" for i in range(31)]

# Build the graph and keep the fitted scaler alongside it.
# NOTE(review): relies on `df_electrolyte` and on a helper named
# `df_to_pyg_graph` being defined before this cell runs -- confirm both.
electrolyte_data, electrolyte_scaler = df_to_pyg_graph(
    df_electrolyte,
    electrolyte_features,
    "label",
    k=10,
)

In [None]:
class GCN(nn.Module):
    """Two GCNConv layers followed by a linear classification head.

    Args:
        in_dim: Number of input features per node.
        hidden: Width of both graph-convolution layers.
        dropout: Dropout probability applied after the first layer's ReLU
            (only while the module is in training mode).
        num_classes: Number of output logits per node. Defaults to 2, the
            value that used to be hard-coded.
    """

    def __init__(self, in_dim, hidden=64, dropout=0.3, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden)
        self.conv2 = GCNConv(hidden, hidden)
        self.lin = nn.Linear(hidden, num_classes)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return per-node class logits for features ``x`` on graph ``edge_index``."""
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is tied to self.training so evaluation is deterministic.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        # Raw logits are returned; no softmax is applied here.
        x = self.lin(x)
        return x

In [None]:
# The network's input width is the number of columns in the node feature matrix.
electrolyte_in_dim = electrolyte_data.x.shape[1]

electrolyte_model = GCN(in_dim=electrolyte_in_dim, hidden=96, dropout=0.4)

# Use CUDA when it is available, otherwise stay on the CPU, and move both the
# graph and the model to that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
electrolyte_data = electrolyte_data.to(device)
electrolyte_model = electrolyte_model.to(device)

In [None]:
def train_gnn(data, model, lr=1e-3, weight_decay=5e-4, epochs=200, early_stop_patience=20):
    """Train ``model`` full-batch on ``data`` with early stopping on validation accuracy.

    Every epoch performs one Adam step on the cross-entropy loss over the
    nodes selected by ``data.train_mask``, then measures accuracy on
    ``data.val_mask``. The weights with the best validation accuracy seen so
    far are kept and restored before the final evaluation.

    Args:
        data: Object exposing ``x``, ``edge_index``, ``y``, ``train_mask``,
            ``val_mask`` and ``test_mask``.
        model: Module called as ``model(data.x, data.edge_index)`` that
            returns per-node logits.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        epochs: Maximum number of training epochs.
        early_stop_patience: Stop after this many consecutive epochs without
            a strict improvement in validation accuracy. Values of 0 or less
            stop at the first epoch that does not improve.

    Returns:
        ``(best_val_acc, test_acc, logits)`` where ``logits`` are the outputs
        for all nodes computed after restoring the best weights (or the last
        weights if validation accuracy never rose above 0.0).
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_acc, best_state = 0.0, None
    # Consecutive epochs without improvement. Counting up and comparing with
    # >= means a zero or negative patience cannot step past the stop
    # condition the way an `== 0` countdown would.
    stale_epochs = 0

    def accuracy(logits, y):
        return (logits.argmax(dim=-1) == y).float().mean().item()

    for _ in range(epochs):
        model.train()
        opt.zero_grad()
        out = model(data.x, data.edge_index)
        loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        opt.step()

        model.eval()
        with torch.no_grad():
            logits = model(data.x, data.edge_index)
            val_acc = accuracy(logits[data.val_mask], data.y[data.val_mask])

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Snapshot on the CPU as clones so later optimizer steps cannot
            # modify the saved weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            stale_epochs = 0
        else:
            stale_epochs += 1
            if stale_epochs >= early_stop_patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)
        test_acc = accuracy(logits[data.test_mask], data.y[data.test_mask])

    return best_val_acc, test_acc, logits

In [None]:
# Train for up to 300 epochs and keep the best validation accuracy, the test
# accuracy and the final logits for all nodes.
electrolyte_val, electrolyte_test, electrolyte_logits = train_gnn(electrolyte_data, electrolyte_model, lr=1e-3, epochs=300)

# The `:.3f` format spec must sit inside the braces; outside them it is
# printed literally after the unrounded value.
# NOTE(review): the label says "Organic" while every variable here is named
# "electrolyte" -- confirm the intended label.
print(f"Organic GNN - best val acc: {electrolyte_val:.3f}, test acc: {electrolyte_test:.3f}")