In [1]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Location of the training-set CSV, given relative to the notebook's working directory.
canonical_file="../dataset/canonical_trainset.csv"
# Read the table; a later cell uses its "SMILES" and "Label" columns.
canonical_smiles_df=pd.read_csv(canonical_file)

# get 2D graph_data

In [3]:
import os
import sys
import nbimporter

# Put the parent of the current working directory on the import path.
project_root = os.path.join(os.getcwd(), '..')
sys.path.append(project_root)

# NOTE(review): presumably the `nbimporter` import above is what allows this
# notebook-backed module to be imported -- confirm.
from datapreparation import Data_Processing_2D as nb2


# Only the first 50 SMILES/label pairs are preprocessed here.
graph_data = nb2.preprocess_smiles_with_labels(canonical_smiles_df["SMILES"][:50].values, canonical_smiles_df["Label"][:50].values)

# Convert node features and adjacency matrices to PyTorch Geometric graphs

In [4]:
import torch
from torch_geometric.data import Data

def get_torch_graph_data_list(graph_data):
    """Turn preprocessed graph records into ``torch_geometric`` ``Data`` objects.

    Args:
        graph_data: Iterable of mappings, each providing the keys
            ``'adjacency_matrix'``, ``'nodes_features'`` and ``'label'``.

    Returns:
        list: One ``Data`` object per record, in input order, with float node
        features ``x``, long ``edge_index`` and a one-element long label ``y``.
    """
    def _as_data(record):
        # Coordinates of the non-zero adjacency entries become the edge list.
        # NOTE(review): assumes `nonzero()` yields (rows, cols) as for a NumPy
        # array, giving a 2 x num_edges index -- confirm against the preprocessor.
        nonzero_coords = np.array(record['adjacency_matrix'].nonzero())
        connectivity = torch.tensor(nonzero_coords, dtype=torch.long)

        node_matrix = torch.tensor(record['nodes_features'], dtype=torch.float)
        target = torch.tensor([record['label']], dtype=torch.long)
        return Data(x=node_matrix, edge_index=connectivity, y=target)

    return [_as_data(record) for record in graph_data]


# Split dataset and build data loaders

In [5]:
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader

# Convert every preprocessed graph record into a torch_geometric Data object.
torch_graph_data_list=get_torch_graph_data_list(graph_data)
# Hold out 20% of the graphs; the fixed random_state keeps the split the same across runs.
graph_data_train, graph_data_test = train_test_split(torch_graph_data_list, test_size=0.2, random_state=42)

# Mini-batches of 4 graphs; only the training loader shuffles.
train_loader = DataLoader(graph_data_train, batch_size=4, shuffle=True)
test_loader = DataLoader(graph_data_test, batch_size=4, shuffle=False)


# Build model

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.loader import DataLoader


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for graph-level classification.

    Node features go through two ``GCNConv`` layers (each followed by ReLU),
    are averaged per graph with ``global_mean_pool`` and are finally mapped to
    class logits by a linear layer.

    Args:
        num_node_features: Size of each input node feature vector.
        hidden_channels: Output width of the first convolution. Defaults to 16.
        embedding_dim: Output width of the second convolution, which is also
            the size of the pooled per-graph embedding. Defaults to 32.
        num_classes: Number of logits produced by the final layer. Defaults to 2.
    """

    def __init__(self, num_node_features, hidden_channels=16, embedding_dim=32, num_classes=2):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved checkpoints still load.
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, data, return_feature=False):
        """Compute class logits, or the pooled graph embedding.

        Args:
            data: Batch object exposing ``x``, ``edge_index`` and ``batch``.
            return_feature: When true, return the pooled embedding (one row per
                graph) instead of passing it through the final linear layer.

        Returns:
            Tensor of logits, or of pooled embeddings when ``return_feature``
            is true.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        # Collapse node embeddings to a single vector per graph.
        x = global_mean_pool(x, batch)

        if return_feature:
            return x

        return self.fc(x)
    

# train and evaluate model

In [7]:
# Seed torch so weight initialisation and training-loader shuffling are
# repeatable; 42 mirrors the random_state used for the train/test split.
torch.manual_seed(42)

# Read the input width from the data rather than hard-coding it (it used to be
# the literal 12), so the model always matches what preprocessing produced.
num_node_features = torch_graph_data_list[0].x.size(-1)
model = GCN(num_node_features=num_node_features)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()


def _run_epoch(model, loader, criterion, optimizer=None):
    """Make one pass over ``loader``, training when an optimizer is supplied.

    Args:
        model: Callable module mapping a batch to logits of shape
            ``(num_graphs, num_classes)``.
        loader: Iterable of batches, each exposing integer class labels ``y``.
        criterion: Loss called as ``criterion(logits, one_hot_float_targets)``.
        optimizer: Optimizer to step after each batch, or ``None`` to evaluate
            without gradient tracking.

    Returns:
        tuple: ``(mean of the per-batch losses, fraction of correct predictions)``.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    is_training = optimizer is not None
    if is_training:
        model.train()
    else:
        model.eval()

    loss_sum, num_correct, num_samples, num_batches = 0.0, 0, 0, 0
    with torch.set_grad_enabled(is_training):
        for data in loader:
            if is_training:
                optimizer.zero_grad()
            output = model(data)
            # Size the one-hot target from the logits so any number of classes
            # works, and match the logits' dtype.
            target = F.one_hot(data.y, num_classes=output.size(-1)).to(output.dtype)
            loss = criterion(output, target)
            if is_training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            # Predictions come from the logits computed before the optimizer step.
            num_correct += (output.argmax(dim=1) == data.y).sum().item()
            num_samples += data.y.size(0)
            num_batches += 1

    if num_batches == 0:
        raise ValueError("loader yielded no batches")
    return loss_sum / num_batches, num_correct / num_samples


def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one epoch.

    Args:
        model: Module to train; it is switched to training mode.
        loader: Iterable of training batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function applied to logits and one-hot float targets.

    Returns:
        tuple: ``(mean per-batch loss, accuracy)`` over the epoch.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    return _run_epoch(model, loader, criterion, optimizer=optimizer)


def validate_epoch(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating its weights.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        loader: Iterable of evaluation batches.
        criterion: Loss function applied to logits and one-hot float targets.

    Returns:
        tuple: ``(mean per-batch loss, accuracy)`` over the loader.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    return _run_epoch(model, loader, criterion)

# Start below any attainable accuracy so the first epoch always writes a
# checkpoint. Starting at 0 with the strict `>` below would save nothing if
# validation accuracy stayed at 0, and the later checkpoint load would then
# fail or pick up a stale file from an earlier run.
best_val_accuracy = float('-inf')
for epoch in range(10):
    train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, criterion)
    # NOTE: the held-out test loader doubles as the validation set here.
    val_loss, val_accuracy = validate_epoch(model, test_loader, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
    # Keep only the weights from the best-scoring epoch (first one wins ties).
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best2_model.pth')


Epoch 1, Train Loss: 0.4060, Train Acc: 0.9500, Val Loss: 0.0684, Val Acc: 1.0000
Epoch 2, Train Loss: 0.2127, Train Acc: 0.9500, Val Loss: 0.0397, Val Acc: 1.0000
Epoch 3, Train Loss: 0.2073, Train Acc: 0.9500, Val Loss: 0.0503, Val Acc: 1.0000
Epoch 4, Train Loss: 0.2054, Train Acc: 0.9500, Val Loss: 0.0418, Val Acc: 1.0000
Epoch 5, Train Loss: 0.2074, Train Acc: 0.9500, Val Loss: 0.0556, Val Acc: 1.0000
Epoch 6, Train Loss: 0.2102, Train Acc: 0.9500, Val Loss: 0.0459, Val Acc: 1.0000
Epoch 7, Train Loss: 0.2081, Train Acc: 0.9500, Val Loss: 0.0424, Val Acc: 1.0000
Epoch 8, Train Loss: 0.1991, Train Acc: 0.9500, Val Loss: 0.0666, Val Acc: 1.0000
Epoch 9, Train Loss: 0.2104, Train Acc: 0.9500, Val Loss: 0.0775, Val Acc: 1.0000
Epoch 10, Train Loss: 0.2025, Train Acc: 0.9500, Val Loss: 0.0472, Val Acc: 1.0000


# extract features

In [8]:
def extract_features_2d(model, loader):
    """Collect pooled graph embeddings and labels for every batch in ``loader``.

    The model is put in evaluation mode and called with ``return_feature=True``
    under ``torch.no_grad()``.

    Args:
        model: Module whose forward accepts ``(batch, return_feature=True)``.
        loader: Iterable of batches, each exposing labels as ``y``.

    Returns:
        tuple: ``(features, labels)`` as NumPy arrays, concatenated along the
        first axis in loader order.
    """
    model.eval()
    with torch.no_grad():
        per_batch = [
            (
                model(batch, return_feature=True).detach().cpu().numpy(),
                batch.y.cpu().numpy(),
            )
            for batch in loader
        ]

    feature_chunks = [embedding for embedding, _ in per_batch]
    label_chunks = [target for _, target in per_batch]
    return np.concatenate(feature_chunks, axis=0), np.concatenate(label_chunks, axis=0)

In [9]:
# Restore the weights checkpointed by the training loop (best validation accuracy).
model.load_state_dict(torch.load('best2_model.pth'))
# Unshuffled loader over the full graph list (train and test together), so the
# extracted rows keep the list's order.
all_loader = DataLoader(torch_graph_data_list, batch_size=4, shuffle=False)
# Pooled per-graph embeddings and their labels.
all_features, all_labels = extract_features_2d(model, all_loader)

In [10]:
# Sanity check: expect one embedding row and one label per graph.
print(all_features.shape)
print(all_labels.shape)

(50, 32)
(50,)
