In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1
%env CUDA_LAUNCH_BLOCKING=1

import pickle
import os
from bisect import bisect

from torch_geometric.data import DataLoader, InMemoryDataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# From .py's
from models.dogss import DOGSS
from train_positions import Trainer



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1
env: CUDA_LAUNCH_BLOCKING=1
cuda


In [4]:
class MyOwnDataset(InMemoryDataset):
    """In-memory dataset backed by the pre-collated file ``./data_surfaces.pt``.

    The download and processing hooks are overridden with no-ops, so building
    the dataset only loads the already collated ``(data, slices)`` pair.

    Args:
        root: Root directory handed to ``InMemoryDataset``.
        transform: Optional transform forwarded to the base class.
        pre_transfrom: Misspelled legacy spelling of ``pre_transform``. It is
            kept so existing positional/keyword callers keep working.
        pre_transform: Optional pre-transform forwarded to the base class.
            Takes precedence over ``pre_transfrom`` when both are given.
    """

    def __init__(self, root, transform=None, pre_transfrom=None, pre_transform=None):
        # Previously the argument was dropped and ``None`` was always passed on.
        # NOTE(review): because ``_process`` is a no-op here, the pre-transform is
        # presumably only stored by the base class and never applied — confirm.
        if pre_transform is None:
            pre_transform = pre_transfrom
        super(MyOwnDataset, self).__init__(root, transform, pre_transform=pre_transform)
        # The collated file is read from the working directory rather than from
        # ``self.processed_paths[0]`` (i.e. it is independent of ``root``).
        self.data, self.slices = torch.load('./data_surfaces.pt')

    @property
    def processed_file_names(self):
        """Name of the collated file this dataset is associated with."""
        return "data_surfaces.pt"

    def _download(self):
        """No-op: nothing is downloaded for this dataset."""
        pass

    def _process(self):
        """No-op: the data file is expected to exist already."""
        pass


In [5]:
%rm -r ./surface


# Output locations: checkpoints go to save_dir, TensorBoard logs to log_dir.
save_dir = './surface'
log_dir = './surface/log'
# exist_ok=True makes the cell idempotent and removes the check-then-create gap.
os.makedirs(save_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)

# Fix the RNG before shuffling so the train/val/test split is the same after a
# kernel restart; otherwise a reloaded checkpoint may be evaluated on samples
# it was trained on.
# NOTE(review): assumes Dataset.shuffle() draws from torch's global RNG
# (torch.randperm) — confirm for the installed torch_geometric version.
# Seeding also affects later random draws (weight init, loader shuffling).
SEED = 0
torch.manual_seed(SEED)

dataset = MyOwnDataset(root='./').shuffle()
# Work on a 500-sample subset of the shuffled data.
dataset = dataset[:500]

# 80% train, 10% validation, remainder test.
train_size = int(len(dataset)*0.8)
val_size = int(len(dataset)*0.1)
test_size = len(dataset) - train_size - val_size

# Training hyperparameters.
batch_size=17
n_epoch=200
lr_initial=0.0393415
# LR schedule: multiply by lr_gamma at each milestone, after a linear warmup
# that starts at warmup_factor * lr_initial.
lr_gamma=0.1
lr_milestones=[100, 150]
warmup_epochs=10
warmup_factor=0.2

In [6]:
# Slice boundaries for three contiguous, non-overlapping partitions.
val_start = train_size
test_start = train_size + val_size
test_stop = test_start + test_size

# The three partitions must fit inside the dataset.
assert train_size + val_size + test_size <= len(dataset)

train_dataset = dataset[:val_start]
val_dataset = dataset[val_start:test_start]
test_dataset = dataset[test_start:test_stop]


# torch.save(dataset, os.path.join(save_dir, 'dataset.pt'))
# torch.save(train_dataset, os.path.join(save_dir, 'train_dataset.pt'))
# torch.save(val_dataset, os.path.join(save_dir, 'val_dataset.pt'))
# torch.save(test_dataset, os.path.join(save_dir, 'test_dataset.pt'))

In [7]:
# Load datasets
# dataset = torch.load(os.path.join(save_dir, 'dataset.pt'))
# train_dataset = torch.load(os.path.join(save_dir, 'train_dataset.pt'))
# val_dataset = torch.load(os.path.join(save_dir, 'val_dataset.pt'))
# test_dataset = torch.load(os.path.join(save_dir, 'test_dataset.pt'))

# Batch each split; only the training loader is built with shuffle=True.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [8]:
class loss():
    """Holder for the distance-based criterion used in this notebook."""

    def distance_loss(self, pred, true):
        """Return the mean row-wise Euclidean distance between ``pred`` and ``true``.

        If ``pred`` is a tuple, only its first element is compared. Squared
        differences are summed over ``dim=1`` before the square root.
        """
        if isinstance(pred, tuple):
            pred = pred[0]
        squared_dist = (pred - true).pow(2).sum(dim=1)
        # No clamping is applied before the square root.
        return squared_dist.sqrt().mean()

In [9]:
from layers import DOGSSConv


# TensorBoard writer; event files are written under log_dir.
log_writer = SummaryWriter(log_dir)

conv_layer = DOGSSConv

# All DOGSS constructor arguments gathered in one mapping. The atom and bond
# feature widths are read from the last dimension of the dataset tensors.
model_kwargs = dict(
    num_atoms=dataset.data.x.shape[-1],
    bond_feat_dim=dataset.data.edge_attr.shape[-1],
    atom_embedding_size=236,
    num_graph_conv_layers=12,
    num_dist_layers=0,
    num_const_layers=0,
    fc_feat_size=6,
    dist_feat_dim=4,
    const_feat_dim=4,
    D_feat_dim=128,
    max_num_nbr=12,
    energy_mode="Harmonic",
    max_opt_steps=300,
    min_opt_steps=10,
    opt_step_size=0.3,
    momentum=0.8,
)
model = DOGSS(**model_kwargs).to(device)

# Criterion: mean Euclidean distance (alternative: nn.L1Loss()).
criterion = loss().distance_loss

optimizer = optim.AdamW(model.parameters(), lr=lr_initial)

def lr_lambda_fun(current_epoch):
    """Return the multiplier applied to ``lr_initial`` at ``current_epoch``.

    While ``current_epoch <= warmup_epochs`` the multiplier rises linearly
    from ``warmup_factor`` (epoch 0) to 1.0 (epoch ``warmup_epochs``).
    Afterwards it is ``lr_gamma ** k``, where ``k`` is the number of entries
    in ``lr_milestones`` that are ``<= current_epoch``.

    When ``warmup_epochs`` is 0 or negative the warmup phase is skipped;
    previously ``warmup_epochs == 0`` raised ``ZeroDivisionError`` at epoch 0.

    ``lr_milestones`` must be sorted in ascending order for ``bisect``.
    """
    if warmup_epochs > 0 and current_epoch <= warmup_epochs:
        alpha = current_epoch / float(warmup_epochs)
        # Linear interpolation between warmup_factor (alpha=0) and 1.0 (alpha=1).
        return warmup_factor * (1.0 - alpha) + alpha
    # bisect (= bisect_right) counts the milestones already reached.
    idx = bisect(lr_milestones, current_epoch)
    return pow(lr_gamma, idx)

# Scale the optimizer's learning rate by lr_lambda_fun on each scheduler step.
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda_fun)

In [10]:
# Bundle model, optimisation objects and data loaders into the training driver.
# No normalizer is used; checkpoints are written to save_dir.
trainer = Trainer(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    device=device,
    normalizer=None,
    log_writer=log_writer,
    checkpoint_dir=save_dir,
)

In [11]:
## Reference value: loss of the initial atom positions on the test set
# Accumulate a row-weighted sum so the result is the mean over all target rows
# rather than the mean of per-batch means (the last batch may be smaller).
mae = 0
num = 0
for batch in test_loader:
    fixed_base = batch.fixed_base
    # Rows whose fixed_base flag is 0 are the ones compared against batch.y.
    free_atom_idx = torch.LongTensor(np.where(fixed_base.cpu() == 0)[0])
    atom_pos = batch.atom_pos[free_atom_idx].cpu().detach()
    # To score model predictions instead, use: model(batch).cpu().detach()
    y = batch.y.cpu().detach()
    # Named batch_loss, not `loss`: rebinding `loss` would overwrite the `loss`
    # class defined earlier in this notebook and break re-running the cell
    # that builds `criterion`.
    batch_loss = criterion(atom_pos, y)
    n = y.shape[0]
    mae += batch_loss * n
    num += n

# Fail loudly instead of leaving `avg` undefined when the loader is empty.
assert num > 0, "test_loader produced no target rows"
avg = mae / num

avg


tensor(0.1332)

In [12]:
### Sanity check: recompute the mean distance sample by sample; it should match the batched criterion value above

def get_mae(dataset):
    """Return the mean Euclidean distance between selected positions and targets.

    For every sample, the rows of ``atom_pos`` whose ``fixed_base`` entry is 0
    are compared against ``y``. The per-row distances of all samples are
    pooled and averaged into a single scalar tensor.
    """
    per_sample = []
    for sample in dataset:
        movable = np.where(sample.fixed_base.cpu() == 0)[0]
        offset = sample.atom_pos[movable] - sample.y
        per_sample.append(torch.sqrt(torch.sum(offset ** 2, dim=1)))
    return torch.mean(torch.cat(per_sample))

print(get_mae(test_dataset))

tensor(0.1332, device='cuda:0')


In [13]:
# Train the model
trainer.step(n_epoch=n_epoch)

epoch: 0 	 MAE: 0.8139 	 time: 507.779336
epoch: 1 	 MAE: 0.1374 	 time: 92.901018


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/junwoony/miniconda3/envs/schnet2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-47797b177c13>", line 2, in <module>
    trainer.step(n_epoch=n_epoch)
  File "/home/junwoony/Desktop/baselines/preprocessing/train_positions.py", line 54, in step
    mae_error = self.validate(epoch).cpu().detach()
  File "/home/junwoony/Desktop/baselines/preprocessing/train_positions.py", line 187, in validate
    output = self.model(data).cpu().detach()
  File "/home/junwoony/miniconda3/envs/schnet2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/junwoony/Desktop/baselines/preprocessing/models/dogss.py", line 150, in forward
    grad = torch.autograd.grad(grad_E, atom_pos, retain_graph=True, create_graph=True)[0]
  File "/home/junwoony/miniconda3/envs/s

KeyboardInterrupt: 

In [None]:
torch.cuda.empty_cache()

In [12]:
# Restore the weights stored in model_best.pth.tar and run the test evaluation.
# map_location keeps the load working when the checkpoint was written from a
# device this kernel cannot see (e.g. a different CUDA_VISIBLE_DEVICES or CPU only).
best_model = torch.load(
    os.path.join(save_dir, 'model_best.pth.tar'), map_location=device
)
model.load_state_dict(best_model['state_dict'])
trainer.validate(0, test=True)

test


tensor(0.0700)