In [99]:
import numpy as np
import os
import json
from pathlib import Path
import re
from time import sleep
from tqdm import tqdm
import warnings

import torch
from torch_geometric.data import Data, DataLoader
from torch.nn.functional import relu

from pymatgen.io.cif import CifParser
from pymatgen.analysis.local_env import CrystalNN
from pymatgen.core import Structure, Lattice, Site


In [100]:
DATASETS = {
    "Mo": "./data/Mo"
}

In [101]:
def gvector (gvector):
    with open(gvector, "rb") as binary_file:
                bin_version = int.from_bytes(binary_file.read(4),
                                             byteorder='little',
                                             signed=False)
                if bin_version != 0:
                    print("Version not supported!")
                    exit(1)
                # converting to int to avoid handling little/big endian
                flags = int.from_bytes(binary_file.read(2),
                                       byteorder='little',
                                       signed=False)
                n_atoms = int.from_bytes(binary_file.read(4),
                                         byteorder='little',
                                         signed=False)
                g_size = int.from_bytes(binary_file.read(4),
                                        byteorder='little',
                                        signed=False)
                payload = binary_file.read()
                data = np.frombuffer(payload, dtype='<f4')
                en = data[0]
                gvect_size = n_atoms * g_size
                spec_tensor = np.reshape((data[1:1+n_atoms]).astype(np.int32),
                                     [1, n_atoms])
                gvect_tensor = np.reshape(data[1+n_atoms:1+n_atoms+gvect_size],
                                      [n_atoms, g_size])
    return (gvect_tensor)


def json_to_pmg_structure(db_name, json_file):
    """
    converts json files into cif format files
    """
    cif_path = os.path.join(DATASETS[db_name], 
                            "train_gv", "cifs")  
    
    json_path = os.path.join(DATASETS[db_name], 
                            "train_gv", "jsons", json_file) 
    
    Path(cif_path).mkdir(parents=True,
                          exist_ok=True)
    
    json_data = read_json(json_path)
    lattice_vectors = json_data["lattice_vectors"]
    lattice = Lattice(lattice_vectors)

    sites = [
        Site(species=atom[1], coords=atom[2], properties={"occupancy": 1.0})
        for atom in json_data["atoms"]
    ]

    cif_name = json_file.split(".")[0] + ".cif"
    structure = Structure(lattice=lattice, species=["Mo"] * len(sites), coords=[site.coords for site in sites])

    if os.path.isfile(cif_path + "/" + cif_name):
        pass
    else:
        structure.to(filename=cif_path + "/" + cif_name)
    return structure


def get_edge_indexes(structure):
    bonded_structure = CrystalNN(weighted_cn=True, distance_cutoffs=(10,  20.))
    bonded_structure = bonded_structure.get_bonded_structure(structure)
    bonded_structure = bonded_structure.as_dict()
    structure_graph = bonded_structure["graphs"]["adjacency"]

    # len(graph) = number of atoms
    edge_index_from = []
    edge_index_to = []
    edges = []
    for i in range (len(structure_graph)):
        #iterates over the connected atoms of each atom in the cell
        for j in range(len(structure_graph[i])):
            edge_index_from.append(i)
            edge_id = structure_graph[i][j]["id"]
            edge_index_to.append(edge_id)
            edge = torch.tensor(structure_graph[i][j]["to_jimage"])
            edges.append(edge)

    edge_index_from = torch.tensor(edge_index_from)
    edge_index_to = torch.tensor(edge_index_to)

    edge_indexes = np.array([edge_index_from, edge_index_to])
    edge_indexes = torch.from_numpy(edge_indexes)

    edges = np.array(edges)
    edges = torch.from_numpy(edges)
    return edge_indexes, edges


def read_json(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return data


def get_db_keys(db_name):
    db_path = os.path.join(DATASETS[db_name], "train_gv", "gvectors")
    keys = [f.split(".")[0] for f in os.listdir(db_path) if os.path.isfile(os.path.join(db_path, f))]

    gvector_keys = []
    json_keys = []
    for item in keys:
        gvector_keys.append(item+".bin")
        json_keys.append(item+".example")
                  
    return gvector_keys, json_keys

def get_labels():
     """gets labels (energy, force, ...)"""
     label_1 = np.random.uniform(1, 2, size=3965)
     label_2 = np.random.rand(3965, 3)
     label_1 = torch.tensor(label_1, dtype=torch.float)
     label_2 = torch.tensor(label_2, dtype=torch.float)
     return label_1, label_2



def dataset(db_name):
    # Parinello vectors
    db_path =  os.path.join(DATASETS[db_name], "train_gv", "gvectors")
    gvect_keys, json_keys = get_db_keys(db_name)
    set = []
    for item in gvect_keys[0:100]:
        a = gvector (db_path + "/" + item)
        a = torch.tensor(a)
        set.append(a)
    parinello = set

    # edge indexes
    edge_indexes = []
    edges = []

    for item in tqdm(json_keys[0:100]):
        structure = json_to_pmg_structure(db_name="Mo", json_file=item)
        ei, e = get_edge_indexes(structure)
        edge_indexes.append(ei)
        edges.append(e)
         
    return parinello, edge_indexes, edges

In [102]:
warnings.filterwarnings("ignore")
parinello, edge_indexes, edges = dataset(db_name="Mo")

  0%|          | 0/100 [00:00<?, ?it/s]

 12%|█▏        | 12/100 [00:18<02:15,  1.54s/it]


KeyboardInterrupt: 

In [None]:
label_1, label_2 = get_labels()
label_1.shape

torch.Size([3965])

In [None]:
def data(db_name, batch_size):
    """Create a PyTorch Geometric Data object"""
    warnings.filterwarnings("ignore")
    parinello, edge_indexes, edges = dataset(db_name=db_name)

    label_1, label_2 = get_labels()

    dataset = []
    for i in range (len(parinello)):
        data = Data(x=parinello[i], edge_index=edge_indexes[i], to_j=edges[i], y=label_1[i])
        dataset.append(data)

    # Create a PyTorch Geometric DataLoader
    batch_size = batch_size
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return data_loader

In [None]:
dataset = []
for i in range (len(parinello)):
    data = Data(x=parinello[i], edge_index=edge_indexes[i], to_j=edges[i], y=label_1[i])
    #data = Data(x=edges[i],edge_index=edge_indexes[i], y=label_1[i])
    dataset.append(data)

# Create a PyTorch Geometric DataLoader
batch_size = 20
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [92]:
batch = next(iter(data_loader))

batch


DataBatch(x=[137, 160], edge_index=[2, 965], y=[20], to_j=[965, 3], batch=[137], ptr=[21])

In [13]:
import torch_geometric
from torch.nn import Module, MultiheadAttention, Linear, SiLU
from torch_geometric.nn import global_mean_pool, GATConv
from torch import sigmoid
import torch.nn.init as init

In [93]:
class NiceMultiheadAttention(Module):
 
    def __init__(self, in_channels, out_channels, heads):
        super(NiceMultiheadAttention, self).__init__()
        self.lin_k = Linear(in_channels, out_channels)
        self.lin_q = Linear(in_channels, out_channels)
        self.lin_v = Linear(in_channels, out_channels)
        self.att = MultiheadAttention(out_channels, heads, batch_first=True)
    
    def forward(self, h):
        K = self.lin_k(h)
        Q = self.lin_q(h)
        V = self.lin_v(h)
        out, weights = self.att(K[:, None, :], Q[:, None, :], V[:, None, :])
        return out, weights

In [131]:
torch.manual_seed(1)
graph1 = GATConv(in_channels=160, out_channels=16, heads=2)
graph2 = GATConv(in_channels=32, out_channels=8, heads=8)
att1 = NiceMultiheadAttention(64, 8, 2)

readout = Linear(8, 1)

graph_h1 = graph1(batch.x, batch.edge_index)
print("graph_h1", graph_h1.shape)

graph_h1 = relu(graph_h1)
print("graph_h1 after relu", graph_h1.shape)

graph_h2 = graph2(graph_h1, batch.edge_index)
print("graph_h2", graph_h2.shape)

graph_h2 = relu(graph_h2)
print("graph_h2 after relu", graph_h2.shape)

graph_h = global_mean_pool(graph_h2, batch.batch)
print("graph_h", graph_h.shape)

graph_h = relu(graph_h)
print("graph_h after relu", graph_h.shape)

h1 = att1(graph_h)
print("h1", h1[0].shape)
h1 = relu(h1[0])
print("h1", h1.shape)

o = readout(h1[0:])

print("output shape" , o)



graph_h1 torch.Size([137, 32])
graph_h1 after relu torch.Size([137, 32])
graph_h2 torch.Size([137, 64])
graph_h2 after relu torch.Size([137, 64])
graph_h torch.Size([20, 64])
graph_h after relu torch.Size([20, 64])
h1 torch.Size([20, 1, 8])
h1 torch.Size([20, 1, 8])
output shape tensor([[[-0.2471]],

        [[-0.3141]],

        [[-0.3380]],

        [[-0.2980]],

        [[-0.1914]],

        [[-0.3074]],

        [[-0.1482]],

        [[-0.1892]],

        [[-0.3200]],

        [[-0.0338]],

        [[-0.2799]],

        [[-0.2958]],

        [[ 0.2085]],

        [[-0.2499]],

        [[-0.1234]],

        [[-0.3226]],

        [[-0.2652]],

        [[-0.2545]],

        [[-0.2876]],

        [[-0.3337]]], grad_fn=<ViewBackward0>)


In [22]:
graph_h.shape

torch.Size([10, 64])

In [11]:
structure = json_to_pmg_structure(db_name="Mo", json_file="1111.example")


bonded_structure = CrystalNN(weighted_cn=True, distance_cutoffs=(10,  20.))
bonded_structure = bonded_structure.get_bonded_structure(structure)
bonded_structure = bonded_structure.as_dict()
structure_graph = bonded_structure["graphs"]["adjacency"]

#bonded_structure
#len(structure_graph)
structure_graph

[[{'to_jimage': (0, 0, 1), 'weight': 1.0, 'id': 0, 'key': 0},
  {'to_jimage': (0, 1, -1), 'weight': 0.9803019331352788, 'id': 0, 'key': 1},
  {'to_jimage': (1, 0, -1), 'weight': 0.8333345006247604, 'id': 0, 'key': 2},
  {'to_jimage': (1, 1, -1), 'weight': 0.654636307722224, 'id': 0, 'key': 3},
  {'to_jimage': (0, 1, 0), 'weight': 0.4280000036374567, 'id': 0, 'key': 4},
  {'to_jimage': (1, 0, 0), 'weight': 0.3979587118849426, 'id': 0, 'key': 5},
  {'to_jimage': (1, 1, -2), 'weight': 0.19152603496472706, 'id': 0, 'key': 6}]]

In [12]:
import torch_geometric
from torch.nn import Module, MultiheadAttention, Linear
from torch_geometric.nn import global_mean_pool, GATConv
from torch import sigmoid

import torch
from torch_geometric.data import DataLoader
from torch_geometric.nn import global_mean_pool
from torch.optim import Adam
from torch.utils.data import random_split
from torchmetrics import Accuracy
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


In [57]:
train_dataset = []
for i in range (len(parinello[0:90])):
    data = Data(x=parinello[i], edge_index=edge_indexes[i], to_j=edges[i], y=label_1[i])
    train_dataset.append(data)

# Create a PyTorch Geometric DataLoader
batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = []
for i in range (len(parinello[90:])):
    data = Data(x=parinello[i], edge_index=edge_indexes[i], to_j=edges[i], y=label_1[i])
    test_dataset.append(data)

# Create a PyTorch Geometric DataLoader
batch_size = 1
val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [107]:
class InContextGNN(pl.LightningModule):
    def __init__(self):
        super(InContextGNN, self).__init__()

        self.graph1 = GATConv(in_channels=160, out_channels=64, heads=8)
        self.graph2 = GATConv(in_channels=512, out_channels=64, heads=8)
        self.att1 = NiceMultiheadAttention(512, 512, 8)
        self.att2 = NiceMultiheadAttention(512, 512, 8)
        self.act = torch.sigmoid
        self.readout = Linear(512, 1)

    def forward(self, batch):
        graph_h1 = self.graph1(batch.x, batch.edge_index)
        graph_h1 = self.act(graph_h1)
        graph_h2 = self.graph2(graph_h1, batch.edge_index)
        graph_h2 = self.act(graph_h2)
        graph_h = global_mean_pool(graph_h2, batch.batch)
        graph_h = self.act(graph_h)
        h1, h1_weights = self.att1(graph_h)
        h1 = self.act(h1)
        h2 = self.att2(h1)
        h2 = self.act(h2[0])
        out = self.readout(h2)
        return out

    def training_step(self, batch, batch_idx):
        output = self(batch)
        loss = torch.nn.functional.mse_loss(output, batch.y.view(-1, 1))
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=0.001)

    def validation_step(self, batch):
        output = self(batch)
        loss = torch.nn.functional.mse_loss(output, batch.y.view(-1, 1))
        self.log('val_loss', loss)
        return {'val_loss': loss}

model = InContextGNN()
tensorboard_logger = TensorBoardLogger("logs", name="your_experiment_name")

trainer = pl.Trainer(
    max_epochs=5,
    logger=tensorboard_logger,
    log_every_n_steps=1,
)

trainer.fit(model, train_loader, val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type                   | Params
---------------------------------------------------
0 | graph1  | GATConv                | 83.5 K
1 | graph2  | GATConv                | 263 K 
2 | att1    | NiceMultiheadAttention | 1.8 M 
3 | att2    | NiceMultiheadAttention | 1.8 M 
4 | readout | Linear                 | 513   
---------------------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
16.099    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [105]:
!tensorboard --logdir "logs/your_experiment_name/version_2/"

2349.97s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


^C
2023-11-14 16:44:18.677096: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-11-14 16:44:18.677120: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Traceback (most recent call last):
  File "/home/amirhossein/miniconda3/lib/python3.10/site-packages/tensorboard/compat/__init__.py", line 42, in tf
    from tensorboard.compat import notf  # noqa: F401
ImportError: cannot import name 'notf' from 'tensorboard.compat' (/home/amirhossein/miniconda3/lib/python3.10/site-packages/tensorboard/compat/__init__.py)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/amirhossein/miniconda3/bin/tensorboard", line 8, in <module>
    sys.exit(run_main())
  File "/home/amirhossein/miniconda3/lib/pyth

In [109]:
def get_labels(db_name):
     """gets labels (energy, force, ...)"""
     
     label = []
     db_path =  os.path.join(DATASETS[db_name], "train_gv", "jsons")
     gvect_keys, json_keys = get_db_keys(db_name)
     
     for item in json_keys:
          example = os.path.join(db_path, item)
          
          with open (example, "r") as file:
               data = json.load(file)
          
          label.append(data["energy"][0])
     
     label = torch.tensor(label, dtype=torch.float)

     return label

In [121]:
def data(db_name, batch_size):
    """Create a PyTorch Geometric Data object"""
    warnings.filterwarnings("ignore")
    parinello, edge_indexes, edges = dataset(db_name=db_name)

    labels = get_labels(db_name)

    db = []
    for i in range (len(parinello)):
        data = Data(x=parinello[i], edge_index=edge_indexes[i], to_j=edges[i], y=labels[i])
        db.append(data)

    # Create a PyTorch Geometric DataLoader
    batch_size = batch_size
    dataset_size = len(db)
    train_size = int(0.8 * dataset_size)
    val_size = dataset_size - train_size
    train_dataset, val_dataset = random_split(db, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader

In [119]:
from torch.utils.data import random_split


In [126]:
train_loader, val_loader = data("Mo", 10)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:59<00:00,  1.68it/s]


In [113]:
dataset.shape

AttributeError: 'function' object has no attribute 'shape'

In [128]:
len(train_loader)  

8