In [6]:
import datetime
import os
import sys
import yaml

import torch

In [None]:
# TODO list:
# 
# [x] load config, make it editable
# [ ] set up matplotlib-friendly logger
# [x] load task
# [x] load model
# [x] load criterion
# [x] load optimizer
# [x] load extras like scheduler, etc.
# [x] train model
# [ ] plot loss curves

In [2]:
# Path to the base of the repository, relative to the directory this notebook
# is run from.
ROOT_DIR = "../"

# Make the repository importable from this notebook. The membership check keeps
# the cell idempotent: re-running it does not stack duplicate entries on sys.path.
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)

## Load base config, set parameters, create directories

In [24]:
# Load the model-specific YAML config. A context manager is used so the file
# handle is closed deterministically instead of being left to the garbage collector.
base_config_yaml = os.path.join(ROOT_DIR, "configs/xie_grossman_mat_proj/cgcnn.yml")
with open(base_config_yaml, "r") as config_file:
    config = yaml.safe_load(config_file)

# Merge every file listed under the optional "includes" key (paths are relative
# to ROOT_DIR).
# NOTE(review): dict.update() lets top-level keys from the included file
# overwrite the ones loaded from cgcnn.yml — confirm that this precedence
# (include wins over the including file) is the intended one.
includes = config.get("includes", [])
for include in includes:
    with open(os.path.join(ROOT_DIR, include), "r") as include_file:
        include_config = yaml.safe_load(include_file)
    config.update(include_config)

# Drop the bookkeeping key so it does not end up in the final config. The
# default avoids a KeyError when the YAML file declares no includes; when the
# key is present the popped list is still displayed as the cell output.
config.pop("includes", None)

['configs/xie_grossman_mat_proj/base.yml']

In [25]:
# Second-resolution timestamp used to give this run its own output directories.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Per-run output directories. They are built once and reused both in the config
# and when creating the directories, so the two can never drift apart.
checkpoint_dir = os.path.join(ROOT_DIR, "checkpoints", timestamp)
results_dir = os.path.join(ROOT_DIR, "results", timestamp)
logs_dir = os.path.join(ROOT_DIR, "logs", config["logger"], timestamp)

# Specify which configuration parameters to override.
config_override = {
    "optim": {
        "batch_size": 64,
        "lr_initial": 0.001,
        "max_epochs": 50,
        # NOTE(review): both milestones are larger than max_epochs; if they are
        # epoch indices the learning-rate decay never triggers in this run — confirm.
        "lr_milestones": [100, 150],
        "lr_gamma": 0.1,
        "warmup_epochs": 10,
        "warmup_factor": 0.2
    },
    "dataset": {
        "src": "../data/data/xie_grossman_mat_proj",
        "train_size": 28000,
        "val_size": 2900,
        "test_size": 3000,
    },
    # the following parameters are necessary.
    "cmd": {
        "seed": 1,
        "checkpoint_dir": checkpoint_dir,
        "results_dir": results_dir,
        "logs_dir": logs_dir,
        "print_every": 100,
    }
}

# Update config.
# This is a shallow update: each top-level section listed above ("optim",
# "dataset", "cmd") replaces the section loaded from YAML wholesale, so keys
# that are not repeated here are dropped rather than merged.
config.update(config_override)

# Create directories. exist_ok=True keeps the cell safe to re-run within the
# same second (same timestamp) instead of raising FileExistsError.
for run_dir in (checkpoint_dir, results_dir, logs_dir):
    os.makedirs(run_dir, exist_ok=True)

# Print overall config.
print(yaml.dump(config, default_flow_style=False))

cmd:
  checkpoint_dir: ../checkpoints/2020-03-16-00-23-43
  logs_dir: ../logs/wandb/2020-03-16-00-23-43
  print_every: 100
  results_dir: ../results/2020-03-16-00-23-43
  seed: 1
dataset:
  src: ../data/data/xie_grossman_mat_proj
  test_size: 3000
  train_size: 28000
  val_size: 2900
logger: wandb
model: cgcnn
model_attributes:
  atom_embedding_size: 64
  fc_feat_size: 128
  num_fc_layers: 4
  num_graph_conv_layers: 6
optim:
  batch_size: 64
  lr_gamma: 0.1
  lr_initial: 0.001
  lr_milestones:
  - 100
  - 150
  max_epochs: 50
  warmup_epochs: 10
  warmup_factor: 0.2
task:
  dataset: xie_grossman_mat_proj
  description: Formation energy per atom regression on the Materials Project dataset
    from Xie and Grossman.
  labels:
  - formation energy per atom
  metric: mae
  type: regression



## Load task, dataset, model, criterion, optimizer

In [26]:
# Project-local import; presumably resolved through the ROOT_DIR entry that an
# earlier cell put on sys.path, which is why it is not in the top import cell.
from ocpmodels.trainers import BaseTrainer

# Construct the trainer without command-line args and assign by hand the
# attributes used by the load_*/train calls in later cells.
# NOTE(review): this presumably mirrors what BaseTrainer sets up itself when
# given real args — confirm against the BaseTrainer constructor.
trainer = BaseTrainer(args=None)

# The merged YAML config plus the notebook overrides.
trainer.config = config
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
trainer.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer.is_debug = False
trainer.is_vis = False
# No logger object is attached, even though config["logger"] names one.
# NOTE(review): confirm that running without a logger is intended here.
trainer.logger = None
# Called after trainer.config is set; presumably applies config["cmd"]["seed"]
# to the random number generators — TODO confirm.
trainer.load_seed_from_config()

In [27]:
# Build the training components on the trainer, one call per component.
# NOTE(review): the calls are presumably order-dependent (e.g. the optimizer
# needing the model to exist) — confirm before reordering them.
trainer.load_task()
trainer.load_model()
trainer.load_criterion()
trainer.load_optimizer()
trainer.load_extras()

### Loading dataset: xie_grossman_mat_proj
### Loading model: cgcnn
### Loaded CGCNN with 196801 parameters.


In [28]:
# Print the loaded model's module tree to sanity-check the architecture.
print(trainer.model)

CGCNN(
  (embedding): Linear(in_features=92, out_features=64, bias=True)
  (convs): ModuleList(
    (0): CGCNNConv(
      (fc_pre): Sequential(
        (0): Linear(in_features=169, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (fc_post): Sequential(
        (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (1): CGCNNConv(
      (fc_pre): Sequential(
        (0): Linear(in_features=169, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (fc_post): Sequential(
        (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): CGCNNConv(
      (fc_pre): Sequential(
        (0): Linear(in_features=169, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stat

## Train

In [29]:
# Start training. In the recorded output, running loss/MAE lines are printed
# within each epoch, val and test are evaluated around each epoch boundary,
# and the run ends with a KeyboardInterrupt (stopped manually before finishing).
trainer.train()

epoch: 0.0023, loss: 0.8411, formation energy per atom/mae: 0.9031
epoch: 0.2306, loss: 0.4904, formation energy per atom/mae: 0.5265
epoch: 0.4589, loss: 0.3692, formation energy per atom/mae: 0.3964
epoch: 0.6872, loss: 0.3193, formation energy per atom/mae: 0.3428
epoch: 0.9155, loss: 0.2897, formation energy per atom/mae: 0.3111
### Evaluating on val.
loss: 0.2184, formation energy per atom/mae: 0.2345
### Evaluating on test.
loss: 0.2166, formation energy per atom/mae: 0.2325
epoch: 1.0023, loss: 0.2816, formation energy per atom/mae: 0.3024
epoch: 1.2306, loss: 0.2661, formation energy per atom/mae: 0.2857
epoch: 1.4589, loss: 0.2553, formation energy per atom/mae: 0.2741
epoch: 1.6872, loss: 0.2469, formation energy per atom/mae: 0.2651
epoch: 1.9155, loss: 0.2391, formation energy per atom/mae: 0.2567
### Evaluating on val.
loss: 0.1230, formation energy per atom/mae: 0.1321
### Evaluating on test.
loss: 0.1204, formation energy per atom/mae: 0.1293
epoch: 2.0023, loss: 0.2365,

loss: 0.0885, formation energy per atom/mae: 0.0950
epoch: 17.0023, loss: 0.1398, formation energy per atom/mae: 0.1501
epoch: 17.2306, loss: 0.1392, formation energy per atom/mae: 0.1494
epoch: 17.4589, loss: 0.1386, formation energy per atom/mae: 0.1488
epoch: 17.6872, loss: 0.1381, formation energy per atom/mae: 0.1483
epoch: 17.9155, loss: 0.1375, formation energy per atom/mae: 0.1476
### Evaluating on val.
loss: 0.0958, formation energy per atom/mae: 0.1029
### Evaluating on test.
loss: 0.0946, formation energy per atom/mae: 0.1016
epoch: 18.0023, loss: 0.1373, formation energy per atom/mae: 0.1474
epoch: 18.2306, loss: 0.1367, formation energy per atom/mae: 0.1468
epoch: 18.4589, loss: 0.1361, formation energy per atom/mae: 0.1461
epoch: 18.6872, loss: 0.1355, formation energy per atom/mae: 0.1455
epoch: 18.9155, loss: 0.1350, formation energy per atom/mae: 0.1449
### Evaluating on val.
loss: 0.0852, formation energy per atom/mae: 0.0915
### Evaluating on test.
loss: 0.0810, form

KeyboardInterrupt: 