In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import torch
import torch.nn as nn
from torch_geometric.nn import MessagePassing, global_mean_pool, radius_graph
from torch_geometric.nn.models.schnet import GaussianSmearing
from torch_geometric.nn import  CGConv
from matplotlib.offsetbox import AnchoredText
import torch.nn.functional as F


# In[2]:


import  sys
sys.path.append("./")
from AdsorptionSite import *
from help_function import *


# ## The model

# In[3]:



class CGCNN(torch.nn.Module):
    """Crystal-graph convolutional network producing one scalar per graph.

    Node features are linearly embedded, refined by a stack of ``CGCNNConv``
    layers, mean-pooled per graph and passed through a fully connected head
    that ends in a single output unit.

    Parameters
    ----------
    dim_node_attr : int
        Number of atom features in the input.
    dim_edge_attr : int
        Number of edge features in the input.
    atom_embedding_size : int
        Number of hidden atom features in the convolutional layers.
    edges_embedding_size : int
        Output width of ``embedding_edges``. That layer is registered but is
        not applied in ``_convolve``; the raw edge attributes are used.
    fc_feat_size : int
        Size of the fully connected layers.
    num_graph_conv_layers : int
        Number of convolutional layers.
    num_fc_layers : int
        Number of hidden layers after pooling. ``num_fc_layers - 1`` extra
        ``Linear`` + ``Softplus`` pairs follow ``conv_to_fc``.
    batch_norm, bias : bool
        Accepted for interface compatibility; not read by this class.
    """

    def __init__(
        self,
        dim_node_attr,
        dim_edge_attr,
        atom_embedding_size=64,
        edges_embedding_size=128,
        fc_feat_size=128,
        num_graph_conv_layers=6,
        num_fc_layers=4,
        batch_norm=False,
        bias=True,
    ):
        super().__init__()

        # Input embeddings. Only ``embedding_nodes`` is used in the forward
        # pass; the other two are kept so the parameter set (and therefore
        # saved state dicts) stays the same.
        self.embedding_nodes = torch.nn.Linear(dim_node_attr, atom_embedding_size)
        self.embedding_edges = torch.nn.Linear(dim_edge_attr, edges_embedding_size)
        self.embedding_fc = torch.nn.Linear(dim_node_attr, atom_embedding_size)

        # Message-passing stack; every layer sees the raw edge attributes.
        conv_stack = nn.ModuleList()
        for _ in range(num_graph_conv_layers):
            conv_stack.append(
                CGCNNConv(node_dim=atom_embedding_size, edge_dim=dim_edge_attr)
            )
        self.convs = conv_stack

        # Projection from pooled node features to the dense head width.
        self.conv_to_fc = nn.Sequential(
            nn.Linear(atom_embedding_size, fc_feat_size),
            nn.Softplus(),
        )

        # Optional hidden layers; ``fcs`` only exists when there is at least one.
        hidden_layers = []
        for _ in range(num_fc_layers - 1):
            hidden_layers += [nn.Linear(fc_feat_size, fc_feat_size), nn.Softplus()]
        if hidden_layers:
            self.fcs = nn.Sequential(*hidden_layers)

        self.fc_out = nn.Linear(fc_feat_size, 1)

    def _forward(self, data):
        """Run the full network and return a flat tensor of predictions."""
        pooled = self._convolve(data)
        hidden = self.conv_to_fc(pooled)

        dense_head = getattr(self, "fcs", None)
        if dense_head is not None:
            hidden = dense_head(hidden)

        return self.fc_out(hidden).view(-1)

    def forward(self, data):
        """Public entry point; delegates to ``_forward``."""
        return self._forward(data)

    def _convolve(self, data):
        """
        Returns the output of the convolution layers before they are passed
        into the dense layers, i.e. the per-graph mean of the node features.
        """
        atom_feats = self.embedding_nodes(data.x)

        # Edge attributes are fed as-is (``embedding_edges`` is not applied).
        bond_feats = data.edge_attr

        for conv in self.convs:
            atom_feats = conv(atom_feats, data.edge_index, bond_feats)

        return global_mean_pool(atom_feats, data.batch)



# In[4]:


class CGCNNConv(MessagePassing):
    """Message passing layer from
    `"Crystal Graph Convolutional Neural Networks for an
    Accurate and Interpretable Prediction of Material Properties"
    <https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.145301>`.

    Each edge produces a gated message (sigmoid gate times softplus core)
    from the two endpoint node vectors and the edge attributes. Messages are
    summed per node, layer-normalised, added to the input (residual) and
    passed through a softplus.
    """

    def __init__(self, node_dim, edge_dim, **kwargs):
        # Extra keyword arguments are accepted but not forwarded.
        super().__init__(aggr="add")
        self.node_feat_size = node_dim
        self.edge_feat_size = edge_dim

        message_in = 2 * self.node_feat_size + self.edge_feat_size
        message_out = 2 * self.node_feat_size  # gate half + core half

        self.lin1 = nn.Linear(message_in, message_out)
        self.bn1 = nn.BatchNorm1d(message_out)
        self.ln1 = nn.LayerNorm(self.node_feat_size)

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the linear weight, zero its bias, reset the norms."""
        nn.init.xavier_uniform_(self.lin1.weight)
        self.lin1.bias.data.zero_()

        for norm_layer in (self.bn1, self.ln1):
            norm_layer.reset_parameters()

    def forward(self, x, edge_index, edge_attr):
        """
        Arguments:
            x has shape [num_nodes, node_feat_size]
            edge_index has shape [2, num_edges]
            edge_attr is [num_edges, edge_feat_size]
        Returns:
            updated node features, same shape as ``x``
        """
        num_nodes = x.size(0)
        aggregated = self.propagate(
            edge_index, x=x, edge_attr=edge_attr, size=(num_nodes, num_nodes)
        )
        # Residual connection around the normalised aggregate.
        return F.softplus(self.ln1(aggregated) + x)

    def message(self, x_i, x_j, edge_attr):
        """
        Arguments:
            x_i has shape [num_edges, node_feat_size]
            x_j has shape [num_edges, node_feat_size]
            edge_attr has shape [num_edges, edge_feat_size]
        Returns:
            tensor of shape [num_edges, node_feat_size]
        """
        edge_input = torch.cat((x_i, x_j, edge_attr), dim=1)
        transformed = self.bn1(self.lin1(edge_input))
        gate, core = transformed.chunk(2, dim=1)
        return torch.sigmoid(gate) * F.softplus(core)


# ## Plot

# In[5]:


def plot_train(df, name='trainplot', title=None, x='Target Value', y='Predicted Value', color=None,  hue='adsorbate'):
    """Draw a parity plot (predicted vs. target) and save it as ``<name>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by ``x`` and ``y`` (and ``hue`` when
        it is not ``None``).
    name : str
        Output path without extension; ``'.png'`` is appended.
    title : str or None
        Figure title.
    x, y : str
        Column names of the target and predicted values.
    color : optional
        Forwarded to ``seaborn.scatterplot``.
    hue : str or None
        Column used to colour the points; also used as the legend title.

    The RMSD and MAE of ``y`` against ``x`` are shown in a text box in the
    upper-left corner. Nothing is returned.
    """
    sns.set_style("whitegrid", {'axes.grid' : False})

    actual = df[x].to_numpy()
    pred = df[y].to_numpy()
    residual = pred - actual
    rmsd = np.sqrt(np.mean(residual ** 2))
    mae_value = np.mean(np.abs(residual))

    fig, ax = plt.subplots()

    # Draw explicitly on ``ax`` rather than on whatever axes happen to be current.
    sns.scatterplot(data=df, x=x, y=y, color=color, hue=hue, s=40,
                    legend='full', linewidth=0, alpha=0.7, ax=ax)

    # Reference diagonal (perfect prediction) spanning the target range.
    diagonal = np.linspace(np.amin(actual), np.amax(actual), 100)
    sns.lineplot(x=diagonal, y=diagonal, color='red', ax=ax)

    at = AnchoredText(
        '{:<8s} = {:.3f} eV \n {:<8s} = {:.3f} eV'.format('RMSD', rmsd, 'MAE', mae_value),
        prop=dict(size=12), frameon=True, loc='upper left')
    at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
    ax.add_artist(at)

    # A legend only makes sense when the points are grouped by ``hue``.
    if hue is not None:
        ax.legend(title=hue, loc='lower right')

    # Raw strings: ``\D`` is not a valid escape sequence in a normal literal.
    ax.set_xlabel(r'$\Delta E_{DFT}$ [eV]', fontsize=15)
    ax.set_ylabel(r'$\Delta E_{Predicted}$ [eV]', fontsize=15)
    ax.set_title(title, fontsize=15)
    fig.savefig(name+'.png', dpi=300, transparent=True, bbox_inches='tight')


# ## Train  Function

# In[6]:


def _train(model, loader, epoch,normalizer, optimizer, device='cpu'):
    """Run one optimisation pass over ``loader`` and return the mean MSE loss.

    Targets are normalised with ``normalizer.norm`` before the loss is
    computed, so the returned value is in normalised units. ``epoch`` is
    accepted but not used here.
    """
    model.train()
    running = AverageMeter()

    for batch in loader:
        # Normalise the targets first, then move targets and batch to the device.
        normed_target = normalizer.norm(batch.y).to(device)
        batch = batch.to(device)

        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(batch), normed_target)
        batch_loss.backward()

        # Weight the running average by the number of targets in the batch.
        running.update(batch_loss.data.cpu(), batch.y.size(0))
        optimizer.step()

    return running.avg.item()


# ## Test  Function

# In[7]:


def _test(model, loader, normalizer, device='cpu', test=False):
    """Evaluate ``model`` on ``loader`` and report the mean absolute error.

    Predictions are de-normalised with ``normalizer.denorm`` and compared to
    the raw targets ``data.y`` using the project's ``mae`` helper.

    Parameters
    ----------
    model : torch.nn.Module
    loader : iterable of batches exposing ``y``, ``idx`` and ``adsorbate``
    normalizer : object with a ``denorm`` method
    device : str or torch.device
        Device the batches are moved to before the forward pass.
    test : bool
        When true, also return a per-sample results table.

    Returns
    -------
    ``mae_errors.avg.item()`` when ``test`` is false; otherwise the tuple
    ``(mae_errors.avg, df)`` where ``df`` has the columns ``idx``,
    ``adsorbate``, ``Target Value`` and ``Predicted Value``. Note that the
    first element is NOT passed through ``.item()`` in that case (kept for
    compatibility with existing callers).
    """
    model.eval()
    mae_errors = AverageMeter()
    test_preds = []
    test_targets = []
    test_idx = []
    test_adsorbate = []

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            pred = model(data)

            # Bring both sides to the CPU so the comparison also works when
            # ``device`` is not the CPU.
            pred_denormed = normalizer.denorm(pred.cpu())
            target = data.y.cpu()

            mae_error = mae(pred_denormed, target)
            mae_errors.update(mae_error, target.size(0))

            test_preds.extend(pred_denormed.tolist())
            test_targets.extend(target.tolist())

            # ``idx`` may be a tensor or an already plain sequence.
            batch_idx = data.idx
            if hasattr(batch_idx, "tolist"):
                test_idx.extend(batch_idx.tolist())
            else:
                test_idx.extend(batch_idx)

            test_adsorbate.extend(data.adsorbate)

    if test:
        my_dict = {"idx": test_idx,
                   "adsorbate": test_adsorbate,
                   'Target Value': test_targets,
                   'Predicted Value': test_preds}
        df = pd.DataFrame(my_dict)
        return mae_errors.avg, df

    return mae_errors.avg.item()


# In[8]:


def save_checkpoint(state, is_best, filename, checkpoint_dir='.'):
    """Serialise ``state`` with ``torch.save`` and optionally mark it as best.

    Parameters
    ----------
    state : object
        Anything ``torch.save`` can serialise (typically a checkpoint dict).
    is_best : bool
        When true, the freshly written checkpoint is copied to the
        "best model" file.
    filename : str
        Name of the checkpoint file inside ``checkpoint_dir``.
    checkpoint_dir : str or path-like or None
        Target directory; ``None`` is treated as the current directory.

    Returns
    -------
    str or None
        The best-model file name when ``is_best`` is true, otherwise ``None``.
        For a ``filename`` ending in ``"_checkpoint"`` the best-model name is
        that prefix plus ``"_best_model"`` (e.g. ``"cgcnn_checkpoint"`` ->
        ``"cgcnn_best_model"``); otherwise ``"_best_model"`` is appended.
    """
    # Local imports keep this function usable without relying on star-imports.
    import os
    import shutil

    if checkpoint_dir is None:
        checkpoint_dir = '.'

    # Use the name given by the caller instead of a module-level ``config``,
    # which only exists when the file is executed as a script.
    path = os.path.join(checkpoint_dir, filename)
    torch.save(state, path)

    if is_best:
        suffix = "_checkpoint"
        stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
        bestfile = f"{stem}_best_model"
        shutil.copyfile(path, os.path.join(checkpoint_dir, bestfile))
        return bestfile


# ## Training

# In[9]:


def train_function(config, train_dataset, val_dataset,normalizer, device = "cpu", checkpoint_dir=None):
    """Train a ``CGCNN`` and return a snapshot of the best checkpoint.

    Parameters
    ----------
    config : dict
        Reads the keys ``batch_size``, ``atom_embedding_size``,
        ``num_graph_conv_layers``, ``fc_feat_size``, ``num_fc_layers``,
        ``batch_norm``, ``bias``, ``lr``, ``Nepoch``, ``model``, ``emb`` and
        ``aggr`` (the last three only for file names).
    train_dataset, val_dataset : sequence of graph samples
        ``train_dataset[0]`` must expose ``x`` and ``edge_attr``.
    normalizer : object providing ``norm``, ``denorm`` and ``state_dict``
    device : str or torch.device
        Used for the model, training and validation.
    checkpoint_dir : str or path-like or None
        Directory for checkpoints and the training-history CSV; ``None``
        means the current directory.

    Returns
    -------
    dict or None
        The checkpoint dictionary of the epoch with the lowest validation
        MAE, or ``None`` if no epoch improved on the initial bound.
    """
    import copy  # local: needed to snapshot the best weights

    if checkpoint_dir is None:
        checkpoint_dir = "."

    best_model = None
    best_val_error = float("inf")

    epoch_history = []
    val_error_history = []
    loss_history = []

    loader_options = dict(batch_size=int(config["batch_size"]),
                          shuffle=True,
                          exclude_keys=['symbol', 'atomic_number', 'distance', 'EN'])
    val_loader = DataLoader(val_dataset, **loader_options)
    train_loader = DataLoader(train_dataset, **loader_options)

    dim_node_attr = train_dataset[0].x.shape[-1]
    dim_edge_attr = train_dataset[0].edge_attr.shape[-1]

    net = CGCNN(dim_node_attr=dim_node_attr,
                dim_edge_attr=dim_edge_attr,
                atom_embedding_size=config['atom_embedding_size'],
                num_graph_conv_layers=config['num_graph_conv_layers'],
                fc_feat_size=config["fc_feat_size"],
                num_fc_layers=config['num_fc_layers'],
                batch_norm=config['batch_norm'],
                bias=config['bias'])
    net.to(device)

    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr"])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                           factor=0.7, patience=5,
                                                           min_lr=0.00001)

    for epoch in range(config['Nepoch']):  # loop over the dataset multiple times
        lr = optimizer.param_groups[0]['lr']

        loss = _train(net, train_loader, epoch, normalizer, optimizer, device=device)
        # Validate on the same device the network lives on.
        val_error = _test(net, val_loader, normalizer, device=device)

        scheduler.step(val_error)

        is_best = val_error < best_val_error
        best_val_error = min(val_error, best_val_error)

        model_state = {
            "epoch": epoch + 1,
            "state_dict": net.state_dict(),
            "best_val_mae": best_val_error,
            "optimizer": optimizer.state_dict(),
            "normalizer": normalizer.state_dict(),
            "criterion": "MSELoss",
            "dim_node_attr": dim_node_attr,
            "dim_edge_attr": dim_edge_attr,
            "atom_embedding_size": config['atom_embedding_size'],
            "num_fc_layers": config['num_fc_layers'],
            "num_graph_conv_layers": config['num_graph_conv_layers'],
            "fc_feat_size": config["fc_feat_size"],
        }

        save_checkpoint(model_state, is_best, f"{config['model']}_checkpoint", checkpoint_dir)

        epoch_history.append(epoch)
        val_error_history.append(val_error)
        loss_history.append(loss)
        if epoch % 50 == 0:
            print(f'Epoch: {epoch:04d}, LR: {lr:7f}, Loss: {loss:.7f}, '
                  f'Val MAE: {val_error:.7f}')

        if is_best:
            # ``state_dict()`` hands out tensors that share storage with the
            # live parameters, which the optimizer keeps updating in place.
            # Deep-copy so the returned checkpoint really holds the weights
            # of the best epoch rather than those of the last one.
            best_model = copy.deepcopy(model_state)

    history = pd.DataFrame({"epoch": epoch_history,
                            "loss": loss_history,
                            "val_error": val_error_history})

    a = config["aggr"]
    s = config["num_graph_conv_layers"]
    l = config["atom_embedding_size"]
    h = config["num_fc_layers"]
    b = config["fc_feat_size"]

    history.to_csv(f"{checkpoint_dir}/{config['model']}_{config['emb']}_train_history_{a}_{s}_{l}_{h}_{b}.csv", index=False)

    return best_model


# In[10]:


def test_best_model(config, test_dataset=None, testfile=None, checkpoint_dir=None, best_model=None, batch_size=8, device="cpu", emb='cgcnn92'):
    """Evaluate a trained ``CGCNN`` checkpoint on a test set.

    Parameters
    ----------
    config : dict
        Reads ``atom_embedding_size``, ``num_graph_conv_layers``,
        ``fc_feat_size``, ``num_fc_layers``, ``batch_norm``, ``bias`` and,
        when loading from disk, ``model``.
    test_dataset : sequence of graph samples, optional
        Ignored when ``testfile`` is given.
    testfile : str, optional
        Pickle file holding the test dataset; takes precedence over
        ``test_dataset``.
    checkpoint_dir : str or path-like, optional
        Directory containing ``"<model>_best_model"``; only used when
        ``best_model`` is empty/``None``.
    best_model : dict, optional
        In-memory checkpoint with the keys ``normalizer`` and ``state_dict``.
    batch_size : int
    device : str or torch.device
    emb : str
        Atom-embedding name; anything other than ``'cgcnn92'`` causes the
        node features to be rebuilt from ``get_atom_embedding(emb)``.

    Returns
    -------
    tuple
        ``(test_error, test_df)`` as returned by ``_test(..., test=True)``.

    Raises
    ------
    ValueError
        If no dataset is available, or no checkpoint source is given.
    """
    if testfile is not None:
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # files from a trusted source.
        with open(testfile, 'rb') as handle:
            test_dataset = pickle.load(handle)

    if test_dataset is None:
        raise ValueError("Provide either test_dataset or testfile.")

    if emb != 'cgcnn92':
        atom_features = get_atom_embedding(emb)

        # Replace the node features with the requested embedding, looked up
        # per element symbol.
        for idata, data in enumerate(test_dataset):
            node_attr = np.vstack([np.array(atom_features[s]).astype(np.float32) for s in data.symbol])
            test_dataset[idata].x = torch.from_numpy(node_attr)

    test_dataset = update_edges(test_dataset)

    dim_node_attr = test_dataset[0].x.shape[-1]
    dim_edge_attr = test_dataset[0].edge_attr.shape[-1]

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                             exclude_keys=['symbol', 'atomic_number', 'distance', 'EN'])

    # The normalizer is constructed from the test targets only to obtain an
    # instance; its statistics are overwritten from the checkpoint below.
    sample_target = torch.vstack([data.y for data in test_dataset])
    normalizer = Normalizer(sample_target)

    if best_model:
        checkpoint = best_model
    else:
        if checkpoint_dir is None:
            raise ValueError("Provide either best_model or checkpoint_dir.")
        best_checkpoint_path = os.path.join(checkpoint_dir, f"{config['model']}_best_model")
        # map_location lets a checkpoint saved on another device be loaded here.
        checkpoint = torch.load(best_checkpoint_path, map_location=device)

    best_trained_model = CGCNN(dim_node_attr=dim_node_attr,
                               dim_edge_attr=dim_edge_attr,
                               atom_embedding_size=config['atom_embedding_size'],
                               num_graph_conv_layers=config['num_graph_conv_layers'],
                               fc_feat_size=config["fc_feat_size"],
                               num_fc_layers=config['num_fc_layers'],
                               batch_norm=config['batch_norm'],
                               bias=config['bias'])

    best_trained_model.to(device)
    normalizer.load_state_dict(checkpoint['normalizer'])
    best_trained_model.load_state_dict(checkpoint['state_dict'])

    test_error, test_df = _test(best_trained_model, test_loader, normalizer, device=device, test=True)

    return test_error, test_df


# In[11]:


def get_atom_embedding(emb="cgcnn92"):
    """Load ``element/<emb>.json`` and return its parsed content.

    The base directory is derived from the *string* ``"__file__"`` (not the
    module attribute), so the path is resolved relative to the current
    working directory.
    """
    base_dir = os.path.dirname(os.path.realpath("__file__"))
    embedding_path = join(base_dir, f"element/{emb}.json")
    with open(embedding_path) as handle:
        return json.load(handle)


def update_edges(dataset):
    """Set ``edge_attr`` of every sample to ``[1/distance[:, 0], 1/EN[:, 0]]``.

    The two reciprocal columns are stacked along ``dim=1`` and cast to
    ``torch.float``. Samples are modified in place and the same ``dataset``
    object is returned.
    """
    for position, sample in enumerate(dataset):
        inverse_distance = 1 / sample.distance[:, 0]
        inverse_en = 1 / sample.EN[:, 0]
        edge_features = torch.stack((inverse_distance, inverse_en), dim=1)
        dataset[position].edge_attr = edge_features.to(torch.float)
    return dataset

def main(config, datafile, data_size=None, campaign_name=None, train_ratio=None, val_ratio=0.1, test_ratio=0.1, emb="cgcnn92"):
    """Load a pickled dataset, split it, train a model and return the result.

    Parameters
    ----------
    config : dict
        Passed through to ``train_function``.
    datafile : str
        Path to the pickled dataset.
    data_size : float, optional
        When given, a random sample of ``int(len(dataset) * data_size)``
        items is used instead of the full dataset.
    campaign_name : str or path-like, optional
        Output directory (created if missing). Defaults to the stem of
        ``datafile``.
    train_ratio, val_ratio, test_ratio : float or None
        Forwarded to ``get_train_val_test_indices``.
    emb : str
        Atom-embedding name; anything other than ``'cgcnn92'`` causes the
        node features to be rebuilt from ``get_atom_embedding(emb)``.

    Returns
    -------
    tuple
        ``(best_model, test_dataset)`` where ``best_model`` is the value
        returned by ``train_function`` and ``test_dataset`` is the held-out
        split.
    """
    # ``Path(None)`` raises; fall back to a name derived from the data file.
    if campaign_name is None:
        campaign_name = Path(Path(datafile).stem).stem

    try:
        Path(campaign_name).mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        print(f"Folder [{campaign_name}] is already there")
    else:
        print(f"Folder [{campaign_name}] was created")

    # NOTE(security): pickle.load can execute arbitrary code; only use files
    # from a trusted source.
    with open(datafile, 'rb') as handle:
        dataset = pickle.load(handle)

    if data_size is not None:
        dataset = random.sample(dataset, int(len(dataset) * data_size))

    if emb != 'cgcnn92':
        atom_features = get_atom_embedding(emb)

        # Replace the node features with the requested embedding, looked up
        # per element symbol.
        for idata, data in enumerate(dataset):
            node_attr = np.vstack([np.array(atom_features[s]).astype(np.float32) for s in data.symbol])
            dataset[idata].x = torch.from_numpy(node_attr)

    dataset = update_edges(dataset)

    train_sampler, val_sampler, test_sampler = get_train_val_test_indices(len(dataset),
                                                                          train_ratio=train_ratio,
                                                                          val_ratio=val_ratio,
                                                                          test_ratio=test_ratio)

    # The normalizer is built from the targets of the whole dataset.
    sample_target = torch.vstack([data.y for data in dataset])
    normalizer = Normalizer(sample_target)

    train_dataset = [dataset[idx] for idx in train_sampler]
    val_dataset = [dataset[idx] for idx in val_sampler]
    test_dataset = [dataset[idx] for idx in test_sampler]

    # NOTE(review): the held-out split is merged into the training data, so
    # the returned ``test_dataset`` has been seen during training. This looks
    # intentional when a separate test file is used for evaluation - confirm.
    train_dataset += test_dataset
    print('train_sampler len:', len(train_dataset))
    print('val_sampler len:', len(val_dataset))

    best_model = train_function(config, train_dataset=train_dataset, val_dataset=val_dataset,
                                normalizer=normalizer, checkpoint_dir=campaign_name)

    return best_model, test_dataset


# ### OH

# In[12]:

if __name__ == '__main__':
    # --- Configuration -----------------------------------------------------
    datafile = 'CO2RR/CO2RR_Train_cgcnn92.pickle'
    campaign_name = Path(Path(datafile).stem).stem
    config = {'lr': 0.0001,
              'aggr': 'add',
              'batch_size': 16,
              'Nepoch': 500,
              "atom_embedding_size": 64,
              "num_graph_conv_layers": 6,
              "num_fc_layers": 4,
              "fc_feat_size": 128,
              "batch_norm": True,
              "bias": True,
              # Key previously had a leading space (" momentum").
              "momentum": 0.9,
              "weight_decay": 5e-4,
              #"emb":"megnet16",
              "emb": "cgcnn92",
              "model": 'cgcnn'}

    # Short tags used in every output file name.
    a = config["aggr"]
    s = config["num_graph_conv_layers"]
    l = config["atom_embedding_size"]
    h = config["num_fc_layers"]
    b = config["fc_feat_size"]

    # --- Training ----------------------------------------------------------
    best_model, test_dataset = main(config, datafile, data_size=None, campaign_name=campaign_name, emb=config['emb'])


    # In[13]:

    # --- Training history --------------------------------------------------
    train_history = pd.read_csv(f"{campaign_name}/{config['model']}_{config['emb']}_train_history_{a}_{s}_{l}_{h}_{b}.csv")
    dfm = train_history.melt('epoch', var_name='error_type', value_name='error')
    sns.lineplot(data=dfm, x='epoch', y='error', hue='error_type')


    # In[14]:

    # --- Evaluation on the separate test file ------------------------------
    testfile = 'CO2RR/CO2RR_Test_cgcnn92.pickle'

    # ``testfile`` takes precedence over ``test_dataset`` inside
    # test_best_model. ``checkpoint_dir`` gives it an on-disk fallback in
    # case ``best_model`` is None.
    err, df = test_best_model(config, test_dataset, testfile=testfile, checkpoint_dir=campaign_name,
                              best_model=best_model, batch_size=8, device="cpu", emb=config['emb'])
    df.to_csv(f"{campaign_name}/{config['model']}_{campaign_name}_{config['emb']}_{a}_{s}_{l}_{h}_{b}.csv", index=False)


    # In[15]:

    # --- Parity plot -------------------------------------------------------
    plot_train(df, name=f"{campaign_name}/{config['model']}_{campaign_name}_{config['emb']}_{a}_{s}_{l}_{h}_{b}",  title ='CO/CHO/COOH Parity Plot', x='Target Value', y='Predicted Value', color='b',  hue='adsorbate')


OSError: dlopen(/Users/zach/opt/anaconda3/envs/ml_course/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so, 6): Symbol not found: __ZN2at8internal13_parallel_runExxxRKNSt3__18functionIFvxxmEEE
  Referenced from: /Users/zach/opt/anaconda3/envs/ml_course/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so
  Expected in: /Users/zach/opt/anaconda3/envs/ml_course/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
 in /Users/zach/opt/anaconda3/envs/ml_course/lib/python3.9/site-packages/torch_sparse/_convert_cpu.so

In [None]:
#CGCNN92 = 0.123 eV MAE
#megnet = 0.116 eV MAE

In [2]:
# Load the pickled test set and split it into train/val/test index groups.
# NOTE(security): pickle.load can execute arbitrary code; only use files from
# a trusted source.
with open('CO2RR/CO2RR_Test_cgcnn92.pickle', 'rb') as handle:
    dataset = pickle.load(handle)
#with open('CO2RR/CO2RR_Train_cgcnn92.pickle', 'rb') as handle:
#    dataset = pickle.load(handle)

# The identifier ``train_sampler`` was previously broken across two lines,
# which left it undefined for the list comprehension below.
train_sampler, val_sampler, test_sampler = get_train_val_test_indices(len(dataset),
                                                                      train_ratio=None,
                                                                      val_ratio=0.1,
                                                                      test_ratio=0.1)
train_dataset = [dataset[idx] for idx in train_sampler]
val_dataset = [dataset[idx] for idx in val_sampler]
test_dataset = [dataset[idx] for idx in test_sampler]

In [4]:
# Build a loader over ``train_dataset`` that yields one sample per batch, in
# order, leaving out the listed per-sample keys.
loader = DataLoader(train_dataset, batch_size=1, shuffle=False,
                             exclude_keys=['symbol','atomic_number' ,'distance', 'EN' ])

# Walk through every batch; the visible body only assigns a placeholder
# (the shape print is disabled). NOTE(review): looks like leftover debugging.
for i,data in enumerate(loader):
    #print(data.x.shape)
    p=0