# Load libraries and set device up

In [1]:
import matplotlib.pyplot as plt
import numpy             as np
import seaborn           as sns
import os
import torch
import json

from libraries.dataset      import generate_dataset
from torch_geometric.loader import DataLoader

# Run on the GPU whenever PyTorch detects a CUDA device; otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Apply seaborn's default theme to the matplotlib figures drawn in this notebook
sns.set_theme()

# Define parameters

In [2]:
# --- Training hyper-parameters -------------------------------------------
n_epochs      = 200    # Maximum number of training epochs
batch_size    = 128    # Batch size handed to the data loaders
learning_rate = 0.001  # Learning rate handed to the Adam optimizer
dropout       = 0.1    # Dropout probability handed to the model (pdropout)
patience      = 10     # Handed to EarlyStopping
delta         = 2      # Handed to EarlyStopping; presumably in the units of the monitored loss -- TODO confirm
train_ratio   = 0.8
test_ratio    = 0.1  # val_ratio = 1 - train_ratio - test_ratio

# --- Reproducibility -----------------------------------------------------
# The notebook shuffles the dataset, shuffles training batches and uses dropout.
# Seeding NumPy and PyTorch here makes a "Restart & Run All" give the same
# split and the same initial weights every time.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# --- Figures -------------------------------------------------------------
dpi = 50  # Resolution used when saving figures

# --- Paths ---------------------------------------------------------------
target = 'GNN'

input_folder  = 'models'
target_folder = f'{input_folder}/{target}'
model_name    = f'{target_folder}/model.pt'

# Generate or load graph database for training

In [None]:
labels_name                 = f'{target_folder}/labels.pt'
dataset_name                = f'{target_folder}/dataset.pt'
dataset_name_std            = f'{target_folder}/standardized_dataset.pt'
labels_name_std             = f'{target_folder}/standardized_labels.pt'
dataset_parameters_name_std = f'{target_folder}/standardized_parameters.json'  # Parameters for rescaling the predictions

# Location of the raw data handed to generate_dataset.
# NOTE(review): absolute, machine-specific path; consider moving it to the parameters cell.
raw_data_folder = '/home/claudio/cibran/Work/UPC/CLUE/Loaded_PhaseTransition'


def _parameters_from_json(path):
    """Read the standardization parameters from a JSON file.

    Every entry that torch.tensor() accepts is converted to a tensor; any
    other entry (for example a string) is kept exactly as stored.

    Args:
        path: Path to the JSON file.

    Returns:
        dict mapping parameter names to tensors or to the raw JSON values.
    """
    with open(path, 'r') as json_file:
        stored = json.load(json_file)

    parameters = {}
    for key, value in stored.items():
        try:
            parameters[key] = torch.tensor(value)
        except (TypeError, ValueError, RuntimeError):
            # Not convertible to a tensor: keep the plain Python value
            parameters[key] = value
    return parameters


def _parameters_to_json(parameters, path):
    """Write the standardization parameters to a JSON file.

    Tensors are converted to (nested) Python lists or scalars so that they
    are JSON-serializable; every other value is written unchanged.

    Args:
        parameters: dict mapping parameter names to tensors or plain values.
        path:       Destination JSON file.
    """
    serializable = {}
    for key, value in parameters.items():
        if isinstance(value, torch.Tensor):
            serializable[key] = value.detach().cpu().numpy().tolist()
        else:
            serializable[key] = value

    with open(path, 'w') as json_file:
        json.dump(serializable, json_file)


standardized_files = (dataset_name_std, labels_name_std, dataset_parameters_name_std)
raw_files          = (dataset_name, labels_name)

# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which may
# reject pickled graph objects -- check against the installed version.
if all(os.path.exists(path) for path in standardized_files):
    # Load the standardized dataset, with corresponding labels and parameters
    dataset = torch.load(dataset_name_std)
    labels  = torch.load(labels_name_std)
    dataset_parameters = _parameters_from_json(dataset_parameters_name_std)
else:
    if not all(os.path.exists(path) for path in raw_files):
        # Generate data
        generate_dataset(raw_data_folder,
                         targets=['EPA', 'bandgap'],
                         data_folder=target_folder)

        # Assumes generate_dataset writes dataset.pt / labels.pt into data_folder -- TODO confirm.
        # Fail with an explicit message instead of a NameError further down if it does not.
        missing_files = [path for path in raw_files if not os.path.exists(path)]
        if missing_files:
            raise FileNotFoundError(f'generate_dataset did not create: {missing_files}')

    # Load the raw dataset, with corresponding labels, and standardize it
    dataset = torch.load(dataset_name)
    labels  = torch.load(labels_name)

    # Standardize dataset
    # NOTE(review): standardize_dataset is not imported in any visible cell -- add the import.
    dataset, labels, dataset_parameters = standardize_dataset(dataset, labels,
                                                              transformation='inverse-quadratic')

    # Save standardized dataset, labels and parameters so the next run can load them directly
    torch.save(dataset, dataset_name_std)
    torch.save(labels,  labels_name_std)
    _parameters_to_json(dataset_parameters, dataset_parameters_name_std)

# Defining target factor
target_factor = dataset_parameters['target_std'] / dataset_parameters['scale']

Gd2Ti2O7
	C2-m
LiAs3H2O9
	P2_1-c
VAg2HgO4
	I-42d
Y5Be6Fe3(SiO5)6
	P1
CdSO4
	Cmcm
	Pmn2_1
	P3m1
	C2-m
Cs2MnNiF6
	R3m
Cs3Zr7BCl20
	R-3c
Li2Mn2CoO6
	P-1
	Cmce
KCu2BiS3
	P-1
LiZnInF6
	P321
CaCoF4
	C2-c
YTaP2SO12
	Cc
MgAl3SiBO9
	Pnma
CaZn2(H5O4)2
	P2_1-c
H6PbC2S2O7
	P-1
BaFe4O7
	P6_3mc
W2Br5
	P2_1-c
BaClF
	P4-nmm
MgTiMn3O8
	Cm
	C2-m
Na6Y4Al4Si3(S3O16)3
	P1
LiNi7O7F
	Pm
Na4Ga4Si19
	P1
BaMgFeF7
	P2_1-c
XeO3
	P2_12_12_1
Sn4Te3Se
	R3m
LiSi3H27C9N2
	P-1
Na14Ca4Hf4Si7S5O48
	P1
CsAgCl2
	Cmcm
	P4-nmm
PrTaTiO6
	Pna2_1
CdHg4C6S6(Br2N3)2
	Fmm2
Hg(NCl)2
	Cmmm
Li3Mn(PO4)2
	P-1
	P2_1-c
	P2_1
	P1
	C2-c
Gd3As5O12
	I-43m
SrC3O7
	P2_1-c
Cs10P4PdSe16
	P4_2-mmc
BeAl6O10
	P2_1-c
CsErS2
	R-3m
ZnCrO4
	P2_1-c
	Cmcm
	C2-m
Ba3(AlAs2)2
	Pnma
Li14Nd5Si11N19O7F2
	Pmn2_1
Li3MnV4O8
	C2-m
CoH15N6(ClO)2
	P-1
	Cc
	Pna2_1
Sr2NiSe2(ClO3)2
	P2_1-c
TbHfF7
	P2_1
Na7Mg2Ga2P5SO24
	P1
Na2LaInSi(SO6)2
	P1
La3Se4Cl
	Pna2_1
Fe2CO5
	P2_1-c
Ca(WO2)2
	I4_1-amd
	Cmcm
	Pmmn
	R3m
	Pnma
	P1
Li3Fe8O3F13
	Cm
Ba4Ta2O9
	P2_1-c
	P6_3-m
	P-3c1
LiC

Split dataset

In [None]:
# Define the sizes of the train and test sets as fractions of len(dataset).
# NOTE(review): the original comment states that this counts unique materials; the code only
# uses len(dataset) -- confirm what one dataset entry represents.
n_graphs   = len(dataset)
train_size = int(train_ratio * n_graphs)
test_size  = int(test_ratio  * n_graphs)

# Explicit boundary of the test set. Negative-index slicing (dataset[-test_size:]) would
# return the WHOLE dataset when test_size == 0, and leave the validation set empty.
test_start = n_graphs - test_size
if train_size > test_start:
    raise ValueError('train_ratio + test_ratio must not exceed 1')

# In-place shuffle using NumPy's global random state (seed it beforehand for a reproducible split)
np.random.shuffle(dataset)

# Random, fast splitting
train_dataset = dataset[:train_size]
val_dataset   = dataset[train_size:test_start]
test_dataset  = dataset[test_start:]

# Drop the name of the full collection; the three splits above are what is used from here on
del dataset

print(f'Number of training   graphs: {len(train_dataset)}')
print(f'Number of validation graphs: {len(val_dataset)}')
print(f'Number of testing    graphs: {len(test_dataset)}')

Define data loaders.

In [None]:
# Page-locked host memory only helps when batches are copied to a CUDA device
use_pinned_memory = device.type == 'cuda'

# Only the training batches are reshuffled every epoch. Validation and test loaders keep a
# fixed order so that evaluation always sees the same batches.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  pin_memory=use_pinned_memory)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)
test_loader  = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)

# Number of node-level features, read from the first training graph
n_node_features = train_dataset[0].num_node_features

# Drop the notebook-level names; the loaders above were built from these datasets
del train_dataset, val_dataset, test_dataset

# Generate Graph Neural Network model

In [None]:
# Build the graph network.
# NOTE(review): MPL is not imported in any visible cell -- add the import.
model = MPL.GCNN(features_channels=n_node_features,
                 pdropout=dropout)

# Moving model to device
model = model.to(device)

# Resume from a previously saved checkpoint, if there is one.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine (and vice versa).
if os.path.exists(model_name):
    model.load_state_dict(torch.load(model_name, map_location=device))
    # NOTE(review): this leaves the model in evaluation mode; MPL.train is presumably
    # responsible for switching back to training mode -- confirm.
    model.eval()
model

# Train

Define the training optimizer and criterion.

In [None]:
# Mean-squared-error loss; 'mean' is the default reduction, spelled out here for clarity
criterion = torch.nn.MSELoss(reduction='mean')

# Adam over all model parameters
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Early-stopping monitor, configured from the parameters cell
early_stopping = EarlyStopping(model_name=model_name,
                               patience=patience,
                               delta=delta)

In [None]:
# Train the model

# Loop-invariant quantities: move the rescaling tensors to the device once, not every epoch
target_factor_device = target_factor.to(device)
target_mean_device   = dataset_parameters['target_mean'].to(device)
target_factor_value  = target_factor.item()

train_losses = []
val_losses   = []
for epoch in range(n_epochs):
    train_loss, train_predictions, train_ground_truths = MPL.train(model, criterion, train_loader,
                                                                   target_factor_device,
                                                                   target_mean_device,
                                                                   optimizer)
    val_loss,   val_predictions,   val_ground_truths   =  MPL.test(model, criterion, val_loader,
                                                                   target_factor_device,
                                                                   target_mean_device)

    # Square root of the (MSE) loss, rescaled by the target factor.
    # This is a root-mean-square error, not a mean absolute error.
    train_loss = np.sqrt(train_loss) * target_factor_value
    val_loss   = np.sqrt(val_loss)   * target_factor_value

    # Parity plot every five epochs
    if epoch % 5 == 0:
        stack = np.concatenate([train_predictions, train_ground_truths,
                                val_predictions,   val_ground_truths])

        _min_ = np.min(stack)
        _max_ = np.max(stack)

        fig, ax = plt.subplots(figsize=(3, 3))
        ax.plot(train_predictions, train_ground_truths, '.', label='Train')
        ax.plot(val_predictions,   val_ground_truths,   '.', label='Validation')
        ax.set_xlabel(r'Predicted ')
        ax.set_ylabel(r'Computed')
        ax.plot([_min_, _max_], [_min_, _max_], '-r')  # Perfect-prediction diagonal
        ax.legend(loc='best')
        plt.show()
        plt.close(fig)  # Do not let figures accumulate over a long training run

    # Append losses
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Report before the early-stopping check so that the final epoch is logged too
    print(f'Epoch: {epoch+1}, Train RMSE: {train_loss:.4f}, Val RMSE: {val_loss:.4f}')

    # Check early stopping criteria
    early_stopping(val_loss, model)

    if early_stopping.early_stop:
        print('Early stopping')
        break

In [None]:
# Loss history on a logarithmic y axis.
# Plotting the raw losses on a log scale keeps the 'Loss' axis label truthful
# (the tick labels are losses, not log10 of losses).
fig, ax = plt.subplots()
ax.semilogy(train_losses, label='Train loss')
ax.semilogy(val_losses,   label='Val  loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend(loc='best')
plt.show()

# Check test data

In [None]:
# Evaluate the model on the three splits.
# The training split goes through MPL.test as well: calling MPL.train with the optimizer here
# would run one more training epoch before the validation/test metrics are computed.
# NOTE(review): the weights evaluated are the ones currently in memory; if EarlyStopping saved
# a better checkpoint to model_name, reload it first -- confirm.
target_factor_device = target_factor.to(device)
target_mean_device   = dataset_parameters['target_mean'].to(device)
target_factor_value  = target_factor.item()

train_loss, train_predictions, train_ground_truths = MPL.test(model, criterion, train_loader,
                                                              target_factor_device,
                                                              target_mean_device)
val_loss,   val_predictions,   val_ground_truths   = MPL.test(model, criterion, val_loader,
                                                              target_factor_device,
                                                              target_mean_device)
test_loss,  test_predictions,  test_ground_truths  = MPL.test(model, criterion, test_loader,
                                                              target_factor_device,
                                                              target_mean_device)

# Square root of the (MSE) loss, rescaled by the target factor: a root-mean-square error
train_loss = np.sqrt(train_loss) * target_factor_value
val_loss   = np.sqrt(val_loss)   * target_factor_value
test_loss  = np.sqrt(test_loss)  * target_factor_value

# Common axis range for the perfect-prediction diagonal
stack = np.concatenate([train_predictions, train_ground_truths,
                        val_predictions,   val_ground_truths,
                        test_predictions,  test_ground_truths])

_min_ = np.min(stack)
_max_ = np.max(stack)

# Parity plot: predicted against computed values for every split
plt.figure(figsize=(5, 5))
plt.plot(train_predictions, train_ground_truths, '.', label='Train')
plt.plot(val_predictions,   val_ground_truths,   '.', label='Validation')
plt.plot(test_predictions,  test_ground_truths,  '.', label='Test')
plt.xlabel(r'Predicted $F_v$ (meV/atom)')
plt.ylabel(r'Computed $F_v$ (meV/atom)')
plt.plot([_min_, _max_], [_min_, _max_], '-r')
plt.legend(loc='best')
plt.savefig(f'{target_folder}/GCNN-training.pdf', dpi=dpi, bbox_inches='tight')
plt.show()

print(f'Train RMSE: {train_loss:.4f}, Val RMSE: {val_loss:.4f}, Test RMSE: {test_loss:.4f}')