In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%pip install torch_geometric
%pip install rdkit

In [5]:
from torch_geometric.data import Batch
import torch

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import networkx as nx
import ast

# for molecules
from rdkit import Chem
from rdkit.Chem import Draw

In [7]:
from sklearn.model_selection import train_test_split
from torch_geometric.data import Dataset, DataLoader
from torch_geometric.utils.convert import from_networkx, to_networkx

In [270]:
df = pd.read_csv('molecule_training.csv')
df['toxicity'] = df['target'].apply(lambda x: 'toxic' if x == 1 else 'non-toxic')

In [None]:
df = df.drop(1629) # Drop a row where we could not get features
df = df.reset_index()
df.drop(columns=["index"], inplace=True)
df.head()

In [274]:
# Define a function to convert the graph string to a networkx graph object
def parse_graph(graph_string):
    # Parse the string as a list of tuples
    edges = ast.literal_eval(graph_string)
    # Create a new empty graph
    graph = nx.Graph()
    # Add edges to the graph
    for edge in edges:
        # Get the node labels and edge weight from the tuple
        node1, node2 = edge[0]
        weight = edge[1]
        # Add the edge to the graph
        graph.add_edge(node1, node2, weight=weight)
    return graph

# Convert the graph strings to networkx graph objects
df["Graph"] = df["Graph"].str.replace(";", ",")
df['graph_obj'] = df['Graph'].apply(parse_graph)

# use the molecules libraries
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)

# Extract number of nodes and edges
df['num_nodes'] = df['graph_obj'].apply(lambda x: x.number_of_nodes())
df['num_edges'] = df['graph_obj'].apply(lambda x: x.number_of_edges())

# Create two sub-datasets: one for toxic molecules and one for non-toxic molecules
toxic_df = df[df['target'] == 1]
non_toxic_df = df[df['target'] == 0]

In [380]:
df.loc[df['graph_obj']==None]

Unnamed: 0,level_0,Maximum Degree,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,inchi_key,Graph,smiles,target,toxicity,graph_obj,mol,num_nodes,num_edges


In [348]:
def mat_features(df,num_nodes):
  max_degree=df['Maximum Degree'].values
  min_degree=df['Minimum Degree'].values
  mol_weight=df['Molecular Weight'].values
  mol_h_donors=df['Number of H-Bond Donors'].values
  mol_nb_rings=df['Number of Rings'].values
  mol_nb_bonds=df['Number of Rotatable Bonds'].values
  mol_polar_surface=df['Polar Surface Area'].values
  x=[([max_degree,min_degree,mol_weight,mol_h_donors,mol_nb_rings,mol_nb_bonds,mol_polar_surface])]*num_nodes
  return np.squeeze(np.array(x)) 

In [349]:
X_G, X_test_G, y_G, y_test_G = train_test_split(df, df['target'], test_size=0.2,  stratify=df['target'])
X_train_G, X_val_G, y_train_G, y_val_G = train_test_split(X_G, y_G, test_size=0.2, stratify=y_G)

labels_train= list(y_train_G)
labels_test = list(y_test_G)
labels_val = list(y_val_G)

In [385]:
def assign_label_x(df,df_graph_obj,labels) : 
  
  X_Graph=[]
  true_labels=[]
  for graph in df_graph_obj['graph_obj'] : 
    nb_nodes=graph.number_of_nodes()
    if nb_nodes!=0 : 
      df_row=df.loc[df['graph_obj']==graph]
      label=df_row["target"].values
      graph=from_networkx(graph)
      graph["label"]=int(label)
      true_labels.append(label)
      x=mat_features(df_row,nb_nodes)
      graph['x']=x
      X_Graph.append(graph)
  return X_Graph,true_labels

In [None]:
X_TRAIN_GRAPH, y_train = assign_label_x(df, X_train_G, labels_train)
X_VAL_GRAPH, y_val = assign_label_x(df, X_val_G, labels_val)
X_TEST_GRAPH, y_test = assign_label_x(df, X_test_G, labels_test)

In [362]:
len(y_test)

1490

In [365]:
BATCH_SIZE=1
train_loader=DataLoader(X_TRAIN_GRAPH, batch_size=BATCH_SIZE)
val_loader=DataLoader(X_VAL_GRAPH,batch_size=BATCH_SIZE )
test_loader=DataLoader(X_TEST_GRAPH,batch_size=BATCH_SIZE)



In [352]:
import torch.nn as nn
from torch_geometric.nn import GCNConv
class BasicGraphModel(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.graphconv1 = GCNConv(input_size, hidden_size)
        self.graphconv2 = GCNConv(hidden_size, hidden_size)
        self.graphconv3 = GCNConv(hidden_size, output_size)

        self.elu = nn.ELU()
        self.softmax = nn.Softmax(dim=-1)
        

    def forward(self, x, edge_index):
        x = self.graphconv1(x, edge_index)
        x = self.graphconv2(x, edge_index)
        x = self.elu(x)
        x = self.graphconv3(x, edge_index)
        x = self.softmax(x)
        return x

In [366]:
import torch.nn.functional as F
from sklearn.metrics import f1_score

def train(model, loss_fcn, optimizer, train_dataloader, val_dataloader, num_epochs):
    model = model.double()
    model.train()

    for epoch in range(num_epochs):
        losses = []
        for i, batch in enumerate(train_dataloader):

              x_train=torch.Tensor(batch.x).double()
              edge_ind=batch.edge_index
              output = model(x_train, edge_ind)
              output=output.mean(1)
              labels=batch.label
              loss = loss_fcn(output, labels)
              optimizer.zero_grad()
              loss.backward()
              optimizer.step()
              losses.append(loss.item())

        loss_data = np.mean(losses)
        print("Epoch {} | Loss: {:.4f}".format(epoch, loss_data))
        y_preds_val=test(model, loss_fcn, val_dataloader)


def test(model, loss_fcn, dataloader,):
    scores=0
    y_preds, y_true= [],[]
    mean_scores=0

    #####
    for i, batch in enumerate(dataloader): 
        labels=batch.label
        score, pred = evaluate(model, batch, labels, loss_fcn)
        scores+=score
        y_preds.append(pred)
        y_true.append(int(labels))
   
    mean_scores = scores/(i+1)
    print("Accuracy score: {:.4f}".format(mean_scores))
    print('F1-score: ' , (f1_score(y_preds,y_true)),"\n")
    return y_preds


def evaluate(model, batched_graph, labels, loss_fcn):
    model.eval()
    with torch.no_grad():
        score=0
        x_eval=torch.Tensor(batched_graph.x).double()
        edge_ind=batched_graph.edge_index
        output = model(x_eval, edge_ind)
        output=output.mean(1)
        loss = loss_fcn(output, labels)
        predict = output.argmax(dim=1)
        if int(labels)==int(predict) : 
          score = 1 
    return score, int(predict)

In [354]:
n_features = 7
n_classes = 2
hidden_size = 64

model = BasicGraphModel( 
                        input_size=n_features,
                        hidden_size=hidden_size, 
                        output_size=n_classes
                        )
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

loss_fcn = torch.nn.CrossEntropyLoss()



In [355]:
# Train and test
train(model, loss_fcn, optimizer, train_loader, val_loader, num_epochs=10)

Epoch 0 | Loss: 0.8151
Accuracy score: 0.9589
F1-score:  0.0 

Epoch 1 | Loss: 0.5770
Accuracy score: 0.9589
F1-score:  0.0 

Epoch 2 | Loss: 0.4496
Accuracy score: 0.9581
F1-score:  0.0 

Epoch 3 | Loss: 0.3602
Accuracy score: 0.9581
F1-score:  0.0 

Epoch 4 | Loss: 0.3028
Accuracy score: 0.9572
F1-score:  0.0 

Epoch 5 | Loss: 0.2497
Accuracy score: 0.9581
F1-score:  0.0 

Epoch 6 | Loss: 0.2143
Accuracy score: 0.9581
F1-score:  0.0 

Epoch 7 | Loss: 0.1889
Accuracy score: 0.9564
F1-score:  0.0 

Epoch 8 | Loss: 0.1898
Accuracy score: 0.9581
F1-score:  0.0 

Epoch 9 | Loss: 0.1731
Accuracy score: 0.9581
F1-score:  0.0 



In [364]:
test_loader

<torch_geometric.deprecation.DataLoader at 0x7fe331e7c340>

In [368]:
y_preds_test=test(model, loss_fcn, test_loader)

Accuracy score: 0.9591
F1-score:  0.0 



In [377]:
f1_score(list(y_test),y_preds_test)

0.0

In [None]:
y_preds_test 

In [195]:
len(y_preds_test)

1492

Split equally 0 and 1 

In [339]:
new_X_TRAIN_GRAPH=[]

In [340]:
compteur_1=0
compteur_0=0
for i in range (len(X_TRAIN_GRAPH)) : 
  if X_TRAIN_GRAPH[i]['label']==1 : 
    new_X_TRAIN_GRAPH.append(X_TRAIN_GRAPH[i])
    compteur_1+=1
  else : 
    if compteur_0 !=compteur_1 :
      new_X_TRAIN_GRAPH.append(X_TRAIN_GRAPH[i])
      compteur_0+=1

In [341]:
new_train_loader=DataLoader(new_X_TRAIN_GRAPH, batch_size=BATCH_SIZE)



In [342]:
n_features = 7
n_classes = 2
hidden_size = 64

model = BasicGraphModel( 
                        input_size=n_features,
                        hidden_size=hidden_size, 
                        output_size=n_classes
                        )
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

loss_fcn = torch.nn.CrossEntropyLoss()



In [343]:
# Train and test
train(model, loss_fcn, optimizer ,new_train_loader, val_loader, num_epochs=10)

Epoch 0 | Loss: 7.5102
Accuracy score: 0.5243
F1-score:  0.04705882352941176 

Epoch 1 | Loss: 1.4557
Accuracy score: 0.4153
F1-score:  0.0743691899070385 

Epoch 2 | Loss: 1.0077
Accuracy score: 0.3666
F1-score:  0.08262454434993924 

Epoch 3 | Loss: 0.8682
Accuracy score: 0.3599
F1-score:  0.0970414201183432 



KeyboardInterrupt: ignored

In [231]:
y_preds_test=test(model, loss_fcn, test_loader)

Accuracy score: 0.3633
F1-score:  0.06679764243614932 

