In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Draw
import networkx as nx
from torch_geometric.datasets import MoleculeNet
import matplotlib.pyplot as plt

import torch
import os
import pandas as pd
import numpy as np
import json,pickle
from collections import OrderedDict
from rdkit import Chem
from rdkit.Chem import MolFromSmiles
import networkx as nx

from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric import torch_geometric
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import torch
import torch.nn as nn

import Data_Preprocessing.Graph_Data as gd
from Data_Preprocessing.Graph_Data import Molecule_data
from models.AttentiveFPModel import AttentiveFP

In [2]:
# Load the OMDB CSV; later cells read its 'SMILE', 'CODID' and 'bgs' columns.
df = pd.read_csv(filepath_or_buffer='Data_Preprocessing/omdb_smile_data_set.csv')

In [3]:
# Pull the three columns used downstream out of the frame in one unpacking step.
smiles, codIds, band_gap = df['SMILE'], df['CODID'], df['bgs']

In [4]:
# Hold out 20% of the rows for validation.  A fixed random_state keeps the split
# identical across "Restart Kernel -> Run All" executions; without it every run
# produces a different partition.
X_train, X_val, y_train, y_val = train_test_split(
    smiles, band_gap, test_size=0.2, random_state=42
)
# Targets become plain NumPy arrays; the SMILES inputs stay as returned by the split.
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

In [5]:
# Training split.  Only the root folder and dataset name are supplied; targets,
# graphs and SMILES are all passed as None.
# NOTE(review): presumably Molecule_data then reads an already-processed file
# under 'data' -- confirm in Data_Preprocessing/Graph_Data.py.
train_data = Molecule_data(
    root='data',
    dataset='train_data_set',
    y=None,
    smile_graph=None,
    smiles=None,
)

In [6]:
# Validation split.  Only the root folder and dataset name are supplied; targets,
# graphs and SMILES are all passed as None.
# NOTE(review): presumably Molecule_data then reads an already-processed file
# under 'data' -- confirm in Data_Preprocessing/Graph_Data.py.
val_data = Molecule_data(
    root='data',
    dataset='val_data_set',
    y=None,
    smile_graph=None,
    smiles=None,
)

In [7]:
# Test split.  Only the root folder and dataset name are supplied; targets,
# graphs and SMILES are all passed as None.
# NOTE(review): presumably Molecule_data then reads an already-processed file
# under 'data' -- confirm in Data_Preprocessing/Graph_Data.py.
test_data = Molecule_data(
    root='data',
    dataset='test_data_set',
    y=None,
    smile_graph=None,
    smiles=None,
)

In [8]:
# Show the number of items in each split, one per line: train, validation, test.
print(len(train_data), len(val_data), len(test_data), sep='\n')

7314
1057
2101


In [9]:
# Device selection.
# GPU index 6 is preferred, but it is only used when that many GPUs are actually
# visible; otherwise fall back to GPU 0, and to the CPU when CUDA is unavailable.
# Requesting a non-existent "cuda:6" would otherwise fail at the first `.to(device)`.
PREFERRED_GPU_INDEX = 6

if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0
    device = f"cuda:{gpu_index}"
else:
    device = "cpu"

# Print the device that was actually chosen so the notebook output records it.
print(device)

cuda:6


In [10]:
# training function at each epoch
def train(model, device, train_loader, optimizer, epoch, loss_fn):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: network invoked as ``model(x, edge_index, batch)``; it is put in
            training mode before the pass.
        device: device every batch (and its targets) is moved to.
        train_loader: iterable of batches exposing ``to``, ``x``, ``edge_index``,
            ``batch`` and ``y``.
        optimizer: optimizer whose gradients are cleared and which is stepped
            once per batch.
        epoch: accepted for interface compatibility; not used in the body.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.

    Returns:
        The plain (unweighted) average of the per-batch loss values, as computed
        by ``np.average``.
    """
    model.train()
    batch_losses = []
    for batch in train_loader:
        batch = batch.to(device)
        # Targets are reshaped to a column so they line up with the model output.
        target = batch.y.view(-1, 1).float().to(device)

        optimizer.zero_grad()
        prediction = model(batch.x.float(), batch.edge_index, batch.batch)
        batch_loss = loss_fn(prediction, target)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return np.average(np.asarray(batch_losses))

In [11]:
def predicting(model, device, loader, loss_fn):
    """Evaluate ``model`` on ``loader`` and return the example-weighted mean loss.

    Each batch loss is weighted by the number of targets in that batch before
    averaging, so a short final batch does not count as much as a full one.
    This assumes ``loss_fn`` uses a mean reduction (as ``nn.L1Loss()`` does by
    default); for such losses the result equals the loss over the whole loader.

    Args:
        model: network invoked as ``model(x, edge_index, batch)``; it is put in
            evaluation mode.
        device: device every batch (and its targets) is moved to.
        loader: iterable of batches exposing ``to``, ``x``, ``edge_index``,
            ``batch`` and ``y``.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar tensor.

    Returns:
        A 0-dim tensor holding the weighted mean loss (callers use ``.item()``).

    Raises:
        ValueError: if ``loader`` yields no examples.
    """
    model.eval()
    total_loss = 0.0
    total_example = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            target = data.y.view(-1, 1).float().to(device)
            output = model(data.x.float(), data.edge_index, data.batch)
            # One row per target after the view(-1, 1) above.
            n_examples = target.size(0)
            total_loss = total_loss + loss_fn(output, target) * n_examples
            total_example += n_examples
    if total_example == 0:
        raise ValueError("predicting() received a loader with no examples")
    return total_loss / total_example

In [12]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error criterion: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super(RMSELoss, self).__init__()
        # Default-configured MSE criterion; forward() takes the square root of its output.
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y))`` for predictions ``yhat`` and targets ``y``."""
        mse_value = self.mse(yhat, y)
        return mse_value.sqrt()

In [13]:
#torch.optim.RAdam(params,lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)

# Optuna

In [14]:
import optuna
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import math


import logging
import sys


In [15]:
def objective(trial):
    """Optuna objective: train one AttentiveFP model and return its validation MAE.

    A trial samples batch size, learning rate, hidden width, number of layers,
    dropout and weight decay, then trains with an L1 loss for up to 1000 epochs.
    After every epoch the validation loss is reported to Optuna (so the study's
    pruner can stop the trial) and early stopping is applied: training ends once
    ``patience`` consecutive epochs fail to improve on the best validation loss.

    Args:
        trial: Optuna trial used to sample hyperparameters and report
            intermediate values.

    Returns:
        Mean absolute error of the model, as it stands when training stops,
        on ``val_data``.

    Raises:
        optuna.TrialPruned: when ``trial.should_prune()`` asks to stop the trial.
    """
    # hyperparameter setting
    # suggest_float(name, low, high) samples uniformly, exactly like the
    # deprecated suggest_uniform it replaces.
    batch_size = trial.suggest_int('Batch Size', 16, 128)
    learning_rate = trial.suggest_float('Learning Rate', 1e-8, 1e-4)
    hidden_channel = trial.suggest_int('Hidden Channels', 112, 300)
    num_layer = trial.suggest_int('Number of layers', 2, 8)
    dropouts = trial.suggest_float('DropOut', 0.0, 0.5)
    weight_decay = trial.suggest_float('Weight Decay', 1e-8, 1e-4)

    # NOTE(review): the training loader is not shuffled; kept as in the original
    # setup -- confirm this is intentional.
    train_loder = DataLoader(train_data, batch_size=batch_size, shuffle=False)
    # Despite its name, this loader iterates the validation split.
    test_loder = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    loss_fn = nn.L1Loss()

    # num_timesteps is fixed at 2 rather than tuned.
    model1 = AttentiveFP(in_channels=114, hidden_channels=hidden_channel, out_channels=1,
                         num_layers=num_layer, num_timesteps=2,
                         dropout=dropouts).to(device)

    optimizer = torch.optim.Adam(model1.parameters(), lr=learning_rate, betas=(0.9, 0.999),
                                 weight_decay=weight_decay, amsgrad=False)

    # Best validation loss seen so far.  Starting from +inf (instead of a fixed
    # number) guarantees the first epoch always counts as an improvement.
    best_loss = float('inf')
    patience = 24
    trigger_times = 0

    for epoch in range(1000):
        train_loss = train(model1, device, train_loder, optimizer, epoch+1, loss_fn)
        # predicting() returns a 0-dim tensor; convert it once to a Python float
        # so the same plain number is printed, reported and compared.
        test_loss = float(predicting(model1, device, test_loder, loss_fn))

        print('Epoch% d: Train mae: %2.5f\t test mae: %2.5f\t'%(epoch, train_loss, test_loss))

        trial.report(test_loss, epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            print("Handle pruning based on the intermediate value.")
            raise optuna.TrialPruned()

        # Early stopping
        if test_loss > best_loss:
            trigger_times += 1
            print('trigger times:', trigger_times)

            if trigger_times >= patience:
                print('Early stopping!\nStart to test process.')
                break
        else:
            trigger_times = 0
            best_loss = test_loss

    # Final evaluation on the validation split.  Eval mode is set explicitly and
    # gradients are disabled so no autograd graph is kept for every batch.
    model1.eval()
    y_pred_arr = []
    target_array = []
    with torch.no_grad():
        for data in test_loder:
            data = data.to(device)
            y_pred = model1(data.x.float(), data.edge_index, data.batch)
            y_pred_arr.append(y_pred.float())
            target_array.append(data.y.float())

    y_ped = torch.cat(y_pred_arr).view(-1).cpu().numpy()
    y_true = torch.cat(target_array).cpu().numpy()

    # output: evaluation score
    return mean_absolute_error(y_true, y_ped)

In [None]:

# Attach a stdout handler to Optuna's logger so its messages show up in the cell output.
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

# Minimise `objective` over 2000 trials, sampling hyperparameters at random; the
# median pruner is what `trial.should_prune()` consults inside the objective.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.RandomSampler(),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=2000)

[32m[I 2022-08-02 11:40:44,901][0m A new study created in memory with name: no-name-64b8502c-9209-4fcb-a4f9-abe480f13608[0m


A new study created in memory with name: no-name-64b8502c-9209-4fcb-a4f9-abe480f13608
Epoch 0: Train mae: 1.20591	 test mae: 0.82855	
Epoch 1: Train mae: 0.80446	 test mae: 0.78791	
Epoch 2: Train mae: 0.77476	 test mae: 0.79976	
trigger times: 1
Epoch 3: Train mae: 0.72574	 test mae: 0.64302	
Epoch 4: Train mae: 0.65857	 test mae: 0.65415	
trigger times: 1
Epoch 5: Train mae: 0.64304	 test mae: 0.68924	
trigger times: 2
Epoch 6: Train mae: 0.63801	 test mae: 0.68583	
trigger times: 3
Epoch 7: Train mae: 0.62332	 test mae: 0.61223	
Epoch 8: Train mae: 0.60969	 test mae: 0.57416	
Epoch 9: Train mae: 0.61107	 test mae: 0.60503	
trigger times: 1
Epoch 10: Train mae: 0.59666	 test mae: 0.65094	
trigger times: 2
Epoch 11: Train mae: 0.59128	 test mae: 0.55266	
Epoch 12: Train mae: 0.58641	 test mae: 0.55992	
trigger times: 1
Epoch 13: Train mae: 0.57104	 test mae: 0.52679	
Epoch 14: Train mae: 0.57292	 test mae: 0.50037	
Epoch 15: Train mae: 0.56052	 test mae: 0.50848	
trigger times: 1
Epoc

Epoch 130: Train mae: 0.40015	 test mae: 0.34916	
trigger times: 13
Epoch 131: Train mae: 0.40462	 test mae: 0.36019	
trigger times: 14
Epoch 132: Train mae: 0.40066	 test mae: 0.35545	
trigger times: 15
Epoch 133: Train mae: 0.40296	 test mae: 0.36431	
trigger times: 16
Epoch 134: Train mae: 0.39704	 test mae: 0.37163	
trigger times: 17
Epoch 135: Train mae: 0.39976	 test mae: 0.36275	
trigger times: 18
Epoch 136: Train mae: 0.39992	 test mae: 0.35545	
trigger times: 19
Epoch 137: Train mae: 0.39804	 test mae: 0.35044	
trigger times: 20
Epoch 138: Train mae: 0.40055	 test mae: 0.34500	
Epoch 139: Train mae: 0.40466	 test mae: 0.35734	
trigger times: 1
Epoch 140: Train mae: 0.39631	 test mae: 0.34321	
Epoch 141: Train mae: 0.39619	 test mae: 0.36239	
trigger times: 1
Epoch 142: Train mae: 0.39985	 test mae: 0.35563	
trigger times: 2
Epoch 143: Train mae: 0.39701	 test mae: 0.36051	
trigger times: 3
Epoch 144: Train mae: 0.39283	 test mae: 0.35320	
trigger times: 4
Epoch 145: Train mae:

Epoch 257: Train mae: 0.35045	 test mae: 0.34559	
trigger times: 19
Epoch 258: Train mae: 0.35444	 test mae: 0.34282	
trigger times: 20
Epoch 259: Train mae: 0.36169	 test mae: 0.31782	
trigger times: 21
Epoch 260: Train mae: 0.35235	 test mae: 0.32148	
trigger times: 22
Epoch 261: Train mae: 0.35346	 test mae: 0.31535	
Epoch 262: Train mae: 0.35662	 test mae: 0.32377	
trigger times: 1
Epoch 263: Train mae: 0.35590	 test mae: 0.32018	
trigger times: 2
Epoch 264: Train mae: 0.35173	 test mae: 0.33396	
trigger times: 3
Epoch 265: Train mae: 0.35270	 test mae: 0.31827	
trigger times: 4
Epoch 266: Train mae: 0.35116	 test mae: 0.32018	
trigger times: 5
Epoch 267: Train mae: 0.34820	 test mae: 0.33809	
trigger times: 6
Epoch 268: Train mae: 0.35559	 test mae: 0.32402	
trigger times: 7
Epoch 269: Train mae: 0.35194	 test mae: 0.32838	
trigger times: 8
Epoch 270: Train mae: 0.34912	 test mae: 0.31995	
trigger times: 9
Epoch 271: Train mae: 0.35170	 test mae: 0.31470	
Epoch 272: Train mae: 0.3

Epoch 383: Train mae: 0.32201	 test mae: 0.29880	
Epoch 384: Train mae: 0.32699	 test mae: 0.29844	
Epoch 385: Train mae: 0.32467	 test mae: 0.30349	
trigger times: 1
Epoch 386: Train mae: 0.32101	 test mae: 0.32008	
trigger times: 2
Epoch 387: Train mae: 0.32190	 test mae: 0.30111	
trigger times: 3
Epoch 388: Train mae: 0.32398	 test mae: 0.30037	
trigger times: 4
Epoch 389: Train mae: 0.32255	 test mae: 0.30474	
trigger times: 5
Epoch 390: Train mae: 0.32342	 test mae: 0.29800	
Epoch 391: Train mae: 0.32477	 test mae: 0.30070	
trigger times: 1
Epoch 392: Train mae: 0.31875	 test mae: 0.30356	
trigger times: 2
Epoch 393: Train mae: 0.31823	 test mae: 0.29787	
Epoch 394: Train mae: 0.31875	 test mae: 0.30346	
trigger times: 1
Epoch 395: Train mae: 0.32053	 test mae: 0.31444	
trigger times: 2
Epoch 396: Train mae: 0.32122	 test mae: 0.30068	
trigger times: 3
Epoch 397: Train mae: 0.32288	 test mae: 0.29878	
trigger times: 4
Epoch 398: Train mae: 0.31780	 test mae: 0.30052	
trigger times

[32m[I 2022-08-02 13:33:30,068][0m Trial 0 finished with value: 0.29315128922462463 and parameters: {'Batch Size': 26, 'Learning Rate': 8.941908545618662e-05, 'Hidden Channels': 183, 'Number of layers': 7, 'DropOut': 0.3450172012987748, 'Weight Decay': 3.6452664447743336e-05}. Best is trial 0 with value: 0.29315128922462463.[0m


Trial 0 finished with value: 0.29315128922462463 and parameters: {'Batch Size': 26, 'Learning Rate': 8.941908545618662e-05, 'Hidden Channels': 183, 'Number of layers': 7, 'DropOut': 0.3450172012987748, 'Weight Decay': 3.6452664447743336e-05}. Best is trial 0 with value: 0.29315128922462463.
Epoch 0: Train mae: 2.43863	 test mae: 1.69632	
Epoch 1: Train mae: 1.39352	 test mae: 0.93538	
Epoch 2: Train mae: 1.02074	 test mae: 0.88171	
Epoch 3: Train mae: 0.92040	 test mae: 0.83871	
Epoch 4: Train mae: 0.86808	 test mae: 0.82292	
Epoch 5: Train mae: 0.84930	 test mae: 0.82145	
Epoch 6: Train mae: 0.84400	 test mae: 0.82852	
trigger times: 1
Epoch 7: Train mae: 0.80603	 test mae: 0.83183	
trigger times: 2
Epoch 8: Train mae: 0.80561	 test mae: 0.81067	
Epoch 9: Train mae: 0.80145	 test mae: 0.81483	
trigger times: 1
Epoch 10: Train mae: 0.79826	 test mae: 0.81030	
Epoch 11: Train mae: 0.80302	 test mae: 0.80217	
Epoch 12: Train mae: 0.79214	 test mae: 0.78959	
Epoch 13: Train mae: 0.79129	 

Epoch 129: Train mae: 0.53932	 test mae: 0.56382	
trigger times: 24
Early stopping!
Start to test process.


[32m[I 2022-08-02 13:44:17,558][0m Trial 1 finished with value: 0.5613318681716919 and parameters: {'Batch Size': 84, 'Learning Rate': 3.842936263440232e-05, 'Hidden Channels': 140, 'Number of layers': 7, 'DropOut': 0.43640565434393513, 'Weight Decay': 5.659969791451963e-05}. Best is trial 0 with value: 0.29315128922462463.[0m


Trial 1 finished with value: 0.5613318681716919 and parameters: {'Batch Size': 84, 'Learning Rate': 3.842936263440232e-05, 'Hidden Channels': 140, 'Number of layers': 7, 'DropOut': 0.43640565434393513, 'Weight Decay': 5.659969791451963e-05}. Best is trial 0 with value: 0.29315128922462463.
Epoch 0: Train mae: 2.12424	 test mae: 1.31979	
Epoch 1: Train mae: 1.04375	 test mae: 0.86698	
Epoch 2: Train mae: 0.91484	 test mae: 0.83098	
Epoch 3: Train mae: 0.87071	 test mae: 0.81251	
Epoch 4: Train mae: 0.84635	 test mae: 0.79219	
Epoch 5: Train mae: 0.82248	 test mae: 0.77971	
Epoch 6: Train mae: 0.81432	 test mae: 0.76723	
Epoch 7: Train mae: 0.79389	 test mae: 0.77502	
trigger times: 1
Epoch 8: Train mae: 0.77602	 test mae: 0.74901	
Epoch 9: Train mae: 0.75995	 test mae: 0.73385	
Epoch 10: Train mae: 0.74426	 test mae: 0.73284	
Epoch 11: Train mae: 0.72266	 test mae: 0.72047	
Epoch 12: Train mae: 0.70750	 test mae: 0.69085	
Epoch 13: Train mae: 0.69290	 test mae: 0.72019	
trigger times: 1

In [None]:
# The objective returns sklearn's mean_absolute_error, so the best value is an
# MAE (not a mean squared error) and is labelled accordingly.
print('Minimum mean absolute error: ' + str(study.best_value))
print('Best parameter: ' + str(study.best_params))

# Visualization

In [None]:
# Hyperparameter-importance plot for the finished study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()
# optuna.visualization returns a Plotly figure, which has no matplotlib-style
# savefig(); write_html stores an interactive copy without extra dependencies.
# (fig.write_image('optunaParms.png') also works when the 'kaleido' package is installed.)
fig.write_html('optunaParms.html')   # save the figure to file


In [None]:
# Slice plot restricted to the learning-rate and dropout hyperparameters.
fig = optuna.visualization.plot_slice(
    study,
    params=["Learning Rate", "DropOut"],
)
fig.show()

In [None]:
# Collect the study's trials into a table for further analysis; the next cell
# reads its 'params_Number of layers' and 'value' columns.
trials = study.trials_dataframe()

In [None]:
# x: the 'Number of layers' value of each trial; y: the trial's objective value.
x, y = trials['params_Number of layers'], trials['value']