From 963de8cc6d0aee06268fe60ddf5262dee0bc3731 Mon Sep 17 00:00:00 2001 From: Sarah Eisenach Date: Mon, 14 Nov 2022 15:48:50 -0500 Subject: [PATCH 1/2] Adding ability to save model output predictions to csv --- .gitignore | 5 +- .pre-commit-config.yaml | 23 ++++++-- configs/config.yml | 2 +- configs/examples/DOS_STO.yml | 4 +- matdeeplearn/common/registry.py | 27 +++++++++ matdeeplearn/models/base_model.py | 10 +++- matdeeplearn/models/cgcnn.py | 9 ++- matdeeplearn/models/dos_predict.py | 9 ++- matdeeplearn/modules/loss.py | 23 ++++---- matdeeplearn/trainers/base_trainer.py | 52 +++++++++++++---- matdeeplearn/trainers/property_trainer.py | 69 ++++++++++++++++++++--- 11 files changed, 187 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 8c8cceab..8281ec3b 100644 --- a/.gitignore +++ b/.gitignore @@ -163,11 +163,14 @@ dmypy.json .DS_Store # data -./data/* +data/** # config ./config/* +# results +results/** + server/ main.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 414707a9..c444266b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,15 +6,26 @@ repos: - id: end-of-file-fixer - id: trailing-whitespace # isort -- repo: https://github.com/asottile/seed-isort-config - rev: v2.2.0 - hooks: - - id: seed-isort-config -- repo: https://github.com/pre-commit/mirrors-isort - rev: v5.10.1 +#- repo: https://github.com/asottile/seed-isort-config +# rev: v2.2.0 +# hooks: +# - id: seed-isort-config +- repo: https://github.com/pycqa/isort + rev: 5.10.1 hooks: - id: isort args: ["--profile", "black"] +#- repo: https://github.com/pycqa/isort +# rev: 5.8.0 +# hooks: +# - id: isort +# name: isort (python) +# - id: isort +# name: isort (cython) +# types: [cython] +# - id: isort +# name: isort (pyi) +# types: [pyi] # flake8 - repo: https://github.com/pycqa/flake8 rev: 5.0.4 diff --git a/configs/config.yml b/configs/config.yml index 5808db9f..6573ccc4 100644 --- a/configs/config.yml +++ b/configs/config.yml @@ -3,7 
+3,7 @@ trainer: property task: # run_mode: train - name: "my_train_job" + identifier: "my_train_job" reprocess: False diff --git a/configs/examples/DOS_STO.yml b/configs/examples/DOS_STO.yml index aec43abb..7972eaae 100644 --- a/configs/examples/DOS_STO.yml +++ b/configs/examples/DOS_STO.yml @@ -1,7 +1,7 @@ trainer: property task: - name: "my_train_job" + identifier: "my_train_job" reprocess: False parallel: True seed: 0 @@ -38,7 +38,7 @@ optim: scheduler_args: {"mode":"min", "factor":0.8, "patience":40, "min_lr":0.00001, "threshold":0.0002} dataset: - processed: False + processed: True src: "/global/cfs/projectdirs/m3641/Shared/Materials_datasets/STO_DOS_data/raw/" target_path: "/global/cfs/projectdirs/m3641/Shared/Materials_datasets/STO_DOS_data/targets.csv" pt_path: "/global/cfs/projectdirs/m3641/Sarah/datasets/processed/STO_DOS_data/" diff --git a/matdeeplearn/common/registry.py b/matdeeplearn/common/registry.py index c6aea7a9..c77f8b41 100644 --- a/matdeeplearn/common/registry.py +++ b/matdeeplearn/common/registry.py @@ -50,6 +50,7 @@ class Registry: "model_name_mapping": {}, "logger_name_mapping": {}, "trainer_name_mapping": {}, + "loss_name_mapping": {}, "state": {}, } @@ -165,6 +166,28 @@ def wrap(func): return wrap + @classmethod + def register_loss(cls, name): + r"""Register a loss class to registry with key 'name' + + Args: + name: Key with which the loss will be registered. + + Usage:: + + from matdeeplearn.common.registry import registry + + @registry.register_loss("dos_loss") + class DOSLoss(): + ...
+ """ + + def wrap(func): + cls.mapping["loss_name_mapping"][name] = func + return func + + return wrap + @classmethod def register(cls, name, obj): r"""Register an item to registry with key 'name' @@ -248,6 +271,10 @@ def get_logger_class(cls, name): def get_trainer_class(cls, name): return cls.get_class(name, "trainer_name_mapping") + @classmethod + def get_loss_class(cls, name): + return cls.get_class(name, "loss_name_mapping") + @classmethod def get(cls, name, default=None, no_warning=False): r"""Get an item from registry with key 'name' diff --git a/matdeeplearn/models/base_model.py b/matdeeplearn/models/base_model.py index 2d22e6f2..9fb1054d 100644 --- a/matdeeplearn/models/base_model.py +++ b/matdeeplearn/models/base_model.py @@ -1,9 +1,8 @@ import warnings -from abc import abstractmethod +from abc import ABCMeta, abstractmethod import torch import torch.nn as nn -from torch_geometric.nn import radius_graph from torch_geometric.utils import dense_to_sparse from matdeeplearn.preprocessor.helpers import ( @@ -14,12 +13,17 @@ ) -class BaseModel(nn.Module): +class BaseModel(nn.Module, metaclass=ABCMeta): def __init__(self, edge_steps: int = 50, self_loop: bool = True) -> None: super(BaseModel, self).__init__() self.edge_steps = edge_steps self.self_loop = self_loop + @property + @abstractmethod + def target_attr(self): + """Specifies the target attribute property for writing output to file""" + def __str__(self): # Prints model summary str_representation = "\n" diff --git a/matdeeplearn/models/cgcnn.py b/matdeeplearn/models/cgcnn.py index ccfc6246..b9fa8de9 100644 --- a/matdeeplearn/models/cgcnn.py +++ b/matdeeplearn/models/cgcnn.py @@ -60,7 +60,10 @@ def __init__( self.gc_dim, self.post_fc_dim = dim1, dim1 # Determine output dimension length - self.output_dim = 1 if data[0].y.ndim == 0 else len(data[0].y[0]) + if data[0][self.target_attr].ndim == 0: + self.output_dim = 1 + else: + self.output_dim = len(data[0][self.target_attr][0]) # setup layers 
self.pre_lin_list = self._setup_pre_gnn_layers() @@ -75,6 +78,10 @@ def __init__( # workaround for doubled dimension by set2set; if late pooling not recommended to use set2set self.lin_out_2 = torch.nn.Linear(self.output_dim * 2, self.output_dim) + @property + def target_attr(self): + return "y" + def _setup_pre_gnn_layers(self): """Sets up pre-GNN dense layers (NOTE: in v0.1 this is always set to 1 layer).""" pre_lin_list = torch.nn.ModuleList() diff --git a/matdeeplearn/models/dos_predict.py b/matdeeplearn/models/dos_predict.py index 9cb93b5e..306a079a 100644 --- a/matdeeplearn/models/dos_predict.py +++ b/matdeeplearn/models/dos_predict.py @@ -46,7 +46,10 @@ def __init__( self.gc_dim, self.post_fc_dim = dim1, dim1 # Determine output dimension length - self.output_dim = 1 if data[0].scaled.ndim == 0 else len(data[0].scaled[0]) + if data[0][self.target_attr].ndim == 0: + self.output_dim = 1 + else: + self.output_dim = len(data[0][self.target_attr][0]) # setup layers self.pre_lin_list = self._setup_pre_gnn_layers() @@ -65,6 +68,10 @@ def __init__( Linear(self.dim2, 1), ) + @property + def target_attr(self): + return "scaled" + def _setup_pre_gnn_layers(self): """Sets up pre-GNN dense layers (NOTE: in v0.1 this is always set to 1 layer).""" pre_lin_list = torch.nn.ModuleList() diff --git a/matdeeplearn/modules/loss.py b/matdeeplearn/modules/loss.py index c4835f43..27a564c5 100644 --- a/matdeeplearn/modules/loss.py +++ b/matdeeplearn/modules/loss.py @@ -4,7 +4,10 @@ from torch import nn from torch_geometric.data import Batch +from matdeeplearn.common.registry import registry + +@registry.register_loss("DOSLoss") class DOSLoss(nn.Module): def __init__( self, @@ -47,27 +50,20 @@ def forward(self, predictions: tuple[torch.Tensor, torch.Tensor], target: Batch) def get_dos_features(self, x, dos): """get dos features""" dos = torch.abs(dos) + dos_sum = torch.sum(dos, axis=1) - center = torch.sum(x * dos, axis=1) / torch.sum(dos, axis=1) + center = torch.sum(x * dos, axis=1) 
/ dos_sum x_offset = ( torch.repeat_interleave(x[np.newaxis, :], dos.shape[0], axis=0) - center[:, None] ) - width = torch.diagonal(torch.mm((x_offset**2), dos.T)) / torch.sum( - dos, axis=1 - ) - skew = ( - torch.diagonal(torch.mm((x_offset**3), dos.T)) - / torch.sum(dos, axis=1) - / width**1.5 - ) + width = torch.diagonal(torch.mm((x_offset**2), dos.T)) / dos_sum + skew = torch.diagonal(torch.mm((x_offset**3), dos.T)) / dos_sum / width**1.5 kurtosis = ( - torch.diagonal(torch.mm((x_offset**4), dos.T)) - / torch.sum(dos, axis=1) - / width**2 + torch.diagonal(torch.mm((x_offset**4), dos.T)) / dos_sum / width**2 ) - # find zero index (fermi leve) + # find zero index (fermi level) zero_index = torch.abs(x - 0).argmin().long() ef_states = torch.sum(dos[:, zero_index - 20 : zero_index + 20], axis=1) * abs( x[0] - x[1] @@ -75,6 +71,7 @@ def get_dos_features(self, x, dos): return torch.stack((center, width, skew, kurtosis, ef_states), axis=1) +@registry.register_loss("TorchLossWrapper") class TorchLossWrapper(nn.Module): def __init__(self, loss_fn="l1_loss"): super().__init__() diff --git a/matdeeplearn/trainers/base_trainer.py b/matdeeplearn/trainers/base_trainer.py index bba966bd..ddef7dfc 100644 --- a/matdeeplearn/trainers/base_trainer.py +++ b/matdeeplearn/trainers/base_trainer.py @@ -1,5 +1,8 @@ +import csv import logging +import os from abc import ABC, abstractmethod +from datetime import datetime import torch import torch.optim as optim @@ -17,7 +20,6 @@ from matdeeplearn.common.registry import registry from matdeeplearn.models.base_model import BaseModel from matdeeplearn.modules.evaluator import Evaluator -from matdeeplearn.modules.loss import * from matdeeplearn.modules.scheduler import LRScheduler @@ -35,6 +37,7 @@ def __init__( test_loader: DataLoader, loss: nn.Module, max_epochs: int, + identifier: str = None, verbosity: int = None, ): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -56,9 +59,19 @@ def __init__( self.step = 0 
self.metrics = {} self.epoch_time = None + self.best_val_metric = 1e10 self.evaluator = Evaluator() + self.run_dir = os.getcwd() + + timestamp = torch.tensor(datetime.now().timestamp()).to(self.device) + self.timestamp_id = datetime.fromtimestamp(timestamp.int()).strftime( + "%Y-%m-%d-%H-%M-%S" + ) + if identifier: + self.timestamp_id = f"{self.timestamp_id}-{identifier}" + if self.train_verbosity: logging.info( f"GPU is available: {torch.cuda.is_available()}, Quantity: {torch.cuda.device_count()}" @@ -94,6 +107,7 @@ def from_config(cls, config): loss = cls._load_loss(config["optim"]["loss"]) max_epochs = config["optim"]["max_epochs"] + identifier = config["task"].get("identifier", None) verbosity = config["task"].get("verbosity", None) return cls( @@ -107,6 +121,7 @@ def from_config(cls, config): test_loader=test_loader, loss=loss, max_epochs=max_epochs, + identifier=identifier, verbosity=verbosity, ) @@ -180,15 +195,12 @@ def _load_scheduler(scheduler_config, optimizer): @staticmethod def _load_loss(loss_config): """Loads the loss from either the TorchLossWrapper or custom loss functions in matdeeplearn""" - try: - loss_type = loss_config["loss_type"] - # if there are other params for loss type, include in call - if loss_config.get("loss_args"): - return eval(loss_type)(**loss_config["loss_args"]) - else: - return eval(loss_type)() - except (AttributeError, NameError): - raise NotImplementedError(f"Unknown loss class name: {loss_type}") + loss_cls = registry.get_loss_class(loss_config["loss_type"]) + # if there are other params for loss type, include in call + if loss_config.get("loss_args"): + return loss_cls(**loss_config["loss_args"]) + else: + return loss_cls() @abstractmethod def _load_task(self): @@ -205,3 +217,23 @@ def validate(self): @abstractmethod def predict(self): """Implemented by derived classes.""" + + def save_results(self, output, filename, node_level_predictions=False): + results_path = os.path.join(self.run_dir, "results", self.timestamp_id) + 
os.makedirs(results_path, exist_ok=True) + filename = os.path.join(results_path, filename) + shape = output.shape + + id_headers = ["structure_id"] + if node_level_predictions: + id_headers += ["node_id"] + num_cols = (shape[1] - len(id_headers)) // 2 + headers = id_headers + ["target"] * num_cols + ["prediction"] * num_cols + + with open(filename, "w") as f: + csvwriter = csv.writer(f) + for i in range(0, len(output)): + if i == 0: + csvwriter.writerow(headers) + elif i > 0: + csvwriter.writerow(output[i - 1, :]) diff --git a/matdeeplearn/trainers/property_trainer.py b/matdeeplearn/trainers/property_trainer.py index 9e85f08c..2ca71a8a 100644 --- a/matdeeplearn/trainers/property_trainer.py +++ b/matdeeplearn/trainers/property_trainer.py @@ -1,6 +1,7 @@ import logging import time +import numpy as np import torch from matdeeplearn.common.registry import registry @@ -22,6 +23,7 @@ def __init__( test_loader, loss, max_epochs, + identifier, verbosity, ): super().__init__( @@ -35,6 +37,7 @@ def __init__( test_loader, loss, max_epochs, + identifier, verbosity, ) @@ -83,17 +86,28 @@ def train(self): if self.val_loader: val_metrics = self.validate() - # save checkpoint if metric is best so far - # if self.val_metrics[self.evaluator.task_primary_metric[self.name]]["metric"] < self.best_val_metric: - # pass - # if it is best and test loader exists, then predict too - # Train loop timings self.epoch_time = time.time() - epoch_start_time # Log metrics if epoch % self.train_verbosity == 0: self._log_metrics(val_metrics) + # update best_val_metric and save predicted outputs for train, test, val + # TODO save checkpoint if metric is best so far + if ( + val_metrics[type(self.loss_fn).__name__]["metric"] + < self.best_val_metric + ): + self.best_val_metric = val_metrics[type(self.loss_fn).__name__][ + "metric" + ] + logging.debug( + f"Saving prediction results for epoch {epoch} to: /results/{self.timestamp_id}/" + ) + self.predict(self.train_loader, "train") + 
self.predict(self.val_loader, "val") + self.predict(self.test_loader, "test") + # step scheduler, using validation error self._scheduler_step() @@ -116,9 +130,48 @@ def validate(self, split="val"): return metrics - def predict(self): - # TODO: implement predict method - return {} + @torch.no_grad() + def predict(self, loader, split): + # TODO: make predict method work as standalone task + assert isinstance(loader, torch.utils.data.dataloader.DataLoader) + + predict, target = None, None + ids = [] + node_level_predictions = False + for i, batch in enumerate(loader): + out = self._forward(batch.to(self.device)) + + # if out is a tuple, then it's scaled data + if type(out) == tuple: + out = out[0] * out[1].view(-1, 1).expand_as(out[0]) + + batch_p = out.data.cpu().numpy() + batch_t = batch[self.model.target_attr].cpu().numpy() + + batch_ids = np.array( + [item for sublist in batch.structure_id for item in sublist] + ) + + # if shape is 2D, then it has node-level predictions + if batch_p.ndim == 2: + node_level_predictions = True + node_ids = batch.z.cpu().numpy() + structure_ids = np.repeat( + batch_ids, batch.n_atoms.cpu().numpy(), axis=0 + ) + batch_ids = np.column_stack((structure_ids, node_ids)) + + ids = batch_ids if i == 0 else np.row_stack((ids, batch_ids)) + predict = batch_p if i == 0 else np.concatenate((predict, batch_p), axis=0) + target = batch_t if i == 0 else np.concatenate((target, batch_t), axis=0) + + predictions = np.column_stack((ids, target, predict)) + + self.save_results( + predictions, f"{split}_predictions.csv", node_level_predictions + ) + + return predictions def _forward(self, batch_data): output = self.model(batch_data) From ca8bc2faa2a76984ef7f314a5e7216ebbae6b882 Mon Sep 17 00:00:00 2001 From: Sarah Eisenach Date: Sat, 3 Dec 2022 17:32:25 -0500 Subject: [PATCH 2/2] Adding ability to save model checkpoint and best model --- .pre-commit-config.yaml | 15 -------- matdeeplearn/trainers/base_trainer.py | 46 +++++++++++++++++++++++ 
matdeeplearn/trainers/property_trainer.py | 20 ++++------ 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c444266b..18eea1bf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,26 +6,11 @@ repos: - id: end-of-file-fixer - id: trailing-whitespace # isort -#- repo: https://github.com/asottile/seed-isort-config -# rev: v2.2.0 -# hooks: -# - id: seed-isort-config - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: - id: isort args: ["--profile", "black"] -#- repo: https://github.com/pycqa/isort -# rev: 5.8.0 -# hooks: -# - id: isort -# name: isort (python) -# - id: isort -# name: isort (cython) -# types: [cython] -# - id: isort -# name: isort (pyi) -# types: [pyi] # flake8 - repo: https://github.com/pycqa/flake8 rev: 5.0.4 diff --git a/matdeeplearn/trainers/base_trainer.py b/matdeeplearn/trainers/base_trainer.py index ddef7dfc..de61656b 100644 --- a/matdeeplearn/trainers/base_trainer.py +++ b/matdeeplearn/trainers/base_trainer.py @@ -1,3 +1,4 @@ +import copy import csv import logging import os @@ -60,6 +61,7 @@ def __init__( self.metrics = {} self.epoch_time = None self.best_val_metric = 1e10 + self.best_model_state = None self.evaluator = Evaluator() @@ -218,6 +220,44 @@ def validate(self): def predict(self): """Implemented by derived classes.""" + def update_best_model(self, val_metrics): + """Updates the best val metric and model, saves the best model, and saves the best model predictions""" + self.best_val_metric = val_metrics[type(self.loss_fn).__name__]["metric"] + self.best_model_state = copy.deepcopy(self.model.state_dict()) + + self.save_model("best_checkpoint.pt", val_metrics, False) + + logging.debug( + f"Saving prediction results for epoch {self.epoch} to: /results/{self.timestamp_id}/" + ) + self.predict(self.train_loader, "train") + self.predict(self.val_loader, "val") + self.predict(self.test_loader, "test") + + def save_model(self, checkpoint_file, 
val_metrics=None, training_state=True): + """Saves the model state dict""" + + if training_state: + state = { + "epoch": self.epoch, + "step": self.step, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "scheduler": self.scheduler.scheduler.state_dict(), + "best_val_metric": self.best_val_metric, + } + else: + state = {"state_dict": self.model.state_dict(), "val_metrics": val_metrics} + + checkpoint_dir = os.path.join( + self.run_dir, "results", self.timestamp_id, "checkpoint" + ) + os.makedirs(checkpoint_dir, exist_ok=True) + filename = os.path.join(checkpoint_dir, checkpoint_file) + + torch.save(state, filename) + return filename + def save_results(self, output, filename, node_level_predictions=False): results_path = os.path.join(self.run_dir, "results", self.timestamp_id) os.makedirs(results_path, exist_ok=True) @@ -237,3 +277,9 @@ def save_results(self, output, filename, node_level_predictions=False): csvwriter.writerow(headers) elif i > 0: csvwriter.writerow(output[i - 1, :]) + return filename + + def load_checkpoint(self): + """Loads the model from a checkpoint.pt file""" + # TODO: implement this method + pass diff --git a/matdeeplearn/trainers/property_trainer.py b/matdeeplearn/trainers/property_trainer.py index 2ca71a8a..be510ee6 100644 --- a/matdeeplearn/trainers/property_trainer.py +++ b/matdeeplearn/trainers/property_trainer.py @@ -81,8 +81,11 @@ def train(self): _metrics = self._compute_metrics(out, batch, _metrics) self.metrics = self.evaluator.update("loss", loss.item(), _metrics) + # TODO: could add param to eval and save on increments instead of every time + # Save current model + self.save_model(checkpoint_file="checkpoint.pt", training_state=True) + # Evaluate on validation set if it exists - # TODO: could add param to eval on increments instead of every time if self.val_loader: val_metrics = self.validate() @@ -92,25 +95,18 @@ def train(self): if epoch % self.train_verbosity == 0: 
self._log_metrics(val_metrics) - # update best_val_metric and save predicted outputs for train, test, val - # TODO save checkpoint if metric is best so far + # Update best val metric and model, and save best model and predicted outputs if ( val_metrics[type(self.loss_fn).__name__]["metric"] < self.best_val_metric ): - self.best_val_metric = val_metrics[type(self.loss_fn).__name__][ - "metric" - ] - logging.debug( - f"Saving prediction results for epoch {epoch} to: /results/{self.timestamp_id}/" - ) - self.predict(self.train_loader, "train") - self.predict(self.val_loader, "val") - self.predict(self.test_loader, "test") + self.update_best_model(val_metrics) # step scheduler, using validation error self._scheduler_step() + return self.best_model_state + def validate(self, split="val"): self.model.eval() evaluator, metrics = Evaluator(), {}