Notebook for generating the samples for the GNN model.
For each dataset, the final output is the Dataset.dat file needed by the GNN model.

In [1]:
from pathlib import Path
from collections import namedtuple
import networkx as nx
from pyRDTP.geomio import file_to_mol
from pyRDTP.molecule import Molecule
from functions import *
from paths import *

# Create raw dataset

In [15]:
voronoi_tolerance = 0.25

## Santiago's datasets

### Sensitivity analysis on Voronoi algorithm tolerance

In [2]:
tol_list = [0.25, 0.5, 0.75, 1.0]

for i in tol_list:
    voronoi_tolerance = i
    bad_samples = 0
    tot_samples = 0
    for dataset in santiago_datasets:
        my_tuple = get_tuples(dataset, voronoi_tolerance)
        export_tuples(paths[dataset]['dataset'], my_tuple)
        x = geometry_to_graph_analysis(dataset)
        if dataset[:3] != "gas":
            bad_samples += x[0]
            tot_samples += x[2]
    print("------------------------------------")
    print("Voronoi tolerance: {}".format(i))
    print("Bad samples: {}".format(bad_samples))
    print("Total samples: {}".format(tot_samples))
    print("Bad conversion percentage: {}%".format(bad_samples*100/tot_samples))
    print("------------------------------------")

KeyboardInterrupt: 

### Generate Dataset.dat

In [16]:
bad_samples = 0
tot_samples = 0
for dataset in santiago_datasets:
    my_tuple = get_tuples(dataset, voronoi_tolerance)
    export_tuples(paths[dataset]['dataset'], my_tuple)
    x = geometry_to_graph_analysis(dataset)
    if dataset[:3] != "gas":
        bad_samples += x[0]
        tot_samples += x[2]
print("Voronoi tolerance: {}".format(voronoi_tolerance))
print("Bad samples: {}".format(bad_samples))
print("Total samples: {}".format(tot_samples))
print("Percentage of bad samples: {}%".format(bad_samples * 100/tot_samples))

Dataset Name: amides
Dataset Size: 180
Number of badly represented data: 3
Percentage of bad representations: 1.67%
-------------------------------------------
Dataset Name: amidines
Dataset Size: 468
Number of badly represented data: 0
Percentage of bad representations: 0.00%
-------------------------------------------
Dataset Name: group2
Dataset Size: 326
Number of badly represented data: 83
Percentage of bad representations: 25.46%
-------------------------------------------
Dataset Name: group2b
Dataset Size: 60
Number of badly represented data: 16
Percentage of bad representations: 26.67%
-------------------------------------------
Dataset Name: group3S
Dataset Size: 228
Number of badly represented data: 0
Percentage of bad representations: 0.00%
-------------------------------------------
Dataset Name: group3N
Dataset Size: 384
Number of badly represented data: 0
Percentage of bad representations: 0.00%
-------------------------------------------
Dataset Name: group4
Dataset Siz

## Intermediates

In [17]:
with open(paths['Intermediates']['ener'], 'r') as infile:
    lines = infile.readlines()

ener_dict = {}
for line in lines:
    split = line.split()
    if len(split) != 2 or len(split[0]) != 7:
        continue
    ener_dict[split[0]] = float(split[1])
    
mol_dict = {}
for contcar in paths['Intermediates']['geom'].glob('./*.contcar'):
    mol_dict[contcar.stem] = file_to_mol(contcar, 'contcar', bulk=False)

In [18]:
Intermediates = namedtuple('Intermediates', ['code', 'mol', 'graph', 'energy'])
surf_ener = ener_dict['i000000']  # Cu slab energy

Intermediates_dict = {}
for key, mol in mol_dict.items():
    if key == "i000000":
        continue
    code = key[1:]
    try:
        energy = ener_dict[key] - surf_ener
    except KeyError:
        print(f'{key} not found')
        continue
    try:
        mol = mol_to_ensemble(mol, voronoi_tolerance) 
        graph = ensemble_to_graph(mol)  
    except ValueError:
        print(f'{key} not converting to graph')
        continue

    Intermediates_dict[code] = Intermediates(code=code, mol=mol, graph=graph, energy=energy)

i112101 not found


In [19]:
export_tuples(paths['Intermediates']['dataset'], Intermediates_dict)
geometry_to_graph_analysis("Intermediates")

Dataset Name: Intermediates
Dataset Size: 880
Number of badly represented data: 31
Percentage of bad representations: 3.52%
-------------------------------------------


(31,
 ['34110f',
  '4a1101',
  '123102',
  '362106',
  '141101',
  '362109',
  '361108',
  '481209',
  '361202',
  '371203',
  '373101',
  '461101',
  '48110a',
  '140101',
  '44210f',
  '010101',
  '382101',
  '482101',
  '381201',
  '380101',
  '260101',
  '113101',
  '021101',
  '461123',
  '4a1201',
  '242105',
  '34220f',
  '442109',
  '302201',
  '481106',
  '372104'],
 880)

## RPCA

In [20]:
with open(paths['RPCA']['ener'], 'r') as infile:
    lines = infile.readlines()
ener_dict = {}
for line in lines:
    split = line.split()
    if len(split) != 2:
        continue
    ener_dict[split[0]] = float(split[1])
    
mol_dict = {}
for contcar in paths['RPCA']['geom'].glob('./*.contcar'):
    mol_dict[contcar.stem] = file_to_mol(contcar, 'contcar', bulk=False)

In [21]:
RPCA = namedtuple('RPCA', ['code', 'mol', 'graph', 'energy'])
surf_ener = {}
for key, value in ener_dict.items():
    if 'Cu' in key:
        continue    # Already included in Intermediates
    if '0000' in key:
        elem, _ = key.split('_')
        surf_ener[elem] = float(value)

RPCA_dict = {}
for key, mol in mol_dict.items():
    if 'Cu' in key:
        continue
    if '0000' in key:
        continue
    elem, _ = key.split('_')
    try:
        energy = ener_dict[key] - surf_ener[elem]
    except KeyError:
        print(f'{key} not found')
        continue
    try:
        mol = mol_to_ensemble(mol, voronoi_tolerance)
        graph = ensemble_to_graph(mol)
    except ValueError:
        print(f'{key} not converting to graph')
        continue

    RPCA_dict[key] = RPCA(code=key, mol=mol, graph=graph, energy=energy)

In [22]:
export_tuples(paths['RPCA']['dataset'], RPCA_dict)
geometry_to_graph_analysis("RPCA")

Dataset Name: RPCA
Dataset Size: 853
Number of badly represented data: 21
Percentage of bad representations: 2.46%
-------------------------------------------


(21,
 ['Rh_2601',
  'Pt_2601',
  'Pt_2621',
  'Ru_1401',
  'Pt_1401',
  'Ru_2601',
  'Pt_2611',
  'Pd_1401',
  'Pd_0211',
  'Ir_2601',
  'Rh_1401',
  'Ni_1401',
  'Os_2601',
  'Pd_2601',
  'Pt_1411',
  'Zn_1401',
  'Os_1401',
  'Pt_0211',
  'Ir_1401',
  'Ni_2601',
  'Pd_2611'],
 853)

## Benson

In [23]:
with open(paths['Benson']['ener'], 'r') as infile:
    lines = infile.readlines()

ener_dict = {}
for line in lines:
    split = line.split()
    if len(split) != 2:
        continue
    ener_dict[split[0]] = float(split[1])
    
mol_dict = {}
for contcar in paths['Benson']['geom'].glob('./*.contcar'):
    mol_dict[contcar.stem] = file_to_mol(contcar, 'contcar', bulk=False)

In [24]:
Benson = namedtuple('Benson', ['code', 'mol', 'graph', 'energy'])
surf_ener = {}
for key, value in ener_dict.items():
    if 'surf' in key:
        elem, _ = key.split('_')
        surf_ener[elem] = float(value)

Benson_dict = {}
for key, mol in mol_dict.items():
    elem, _ = key.split('_')
    try:
        energy = ener_dict[key] - surf_ener[elem]
    except KeyError:
        print(f'{key} not found')
        continue
    try:
        mol = mol_to_ensemble(mol, voronoi_tolerance)
        graph = ensemble_to_graph(mol)
    except ValueError:
        print(f'{key} not converting to graph')
        continue

    Benson_dict[key] = Benson(code=key, mol=mol, graph=graph, energy=energy)

In [25]:
export_tuples(paths['Benson']['dataset'], Benson_dict)
geometry_to_graph_analysis("Benson")

Dataset Name: Benson
Dataset Size: 216
Number of badly represented data: 65
Percentage of bad representations: 30.09%
-------------------------------------------


(65,
 ['Cu_mol25',
  'Cu_mol31',
  'Cu_mol21',
  'Cu_mol13',
  'Cu_mol27',
  'Ni_mol41',
  'Cu_mol53',
  'Cu_mol12',
  'Cu_mol47',
  'Au_mol3',
  'Cu_mol34',
  'Cu_mol2',
  'Ni_mol13',
  'Cu_mol18',
  'Cu_mol1',
  'Au_mol2',
  'Cu_mol52',
  'Cu_mol32',
  'Cu_mol33',
  'Cu_mol24',
  'Ni_mol3',
  'Ni_mol2',
  'Cu_mol26',
  'Cu_mol6',
  'Cu_mol37',
  'Ni_mol35',
  'Cu_mol40',
  'Ni_mol1',
  'Cu_mol19',
  'Cu_mol50',
  'Cu_mol39',
  'Ni_mol47',
  'Cu_mol11',
  'Cu_mol44',
  'Cu_mol49',
  'Cu_mol46',
  'Cu_mol20',
  'Ni_mol38',
  'Cu_mol35',
  'Cu_mol3',
  'Ag_mol3',
  'Ni_mol40',
  'Cu_mol17',
  'Cu_mol45',
  'Ni_mol50',
  'Cu_mol15',
  'Cu_mol4',
  'Ni_mol37',
  'Cu_mol29',
  'Cu_mol9',
  'Cu_mol23',
  'Ni_mol10',
  'Cu_mol10',
  'Cu_mol28',
  'Cu_mol54',
  'Cu_mol5',
  'Cu_mol22',
  'Ni_mol12',
  'Cu_mol41',
  'Cu_mol36',
  'Cu_mol38',
  'Ni_mol9',
  'Cu_mol30',
  'Ag_mol2',
  'Cu_mol7'],
 216)

## Alloys

In [26]:
with open(paths['Alloys']['ener'], 'r') as infile:
    lines = infile.readlines()
ener_dict = {}
for line in lines:
    split = line.split()
    if len(split) != 2:
        continue
    ener_dict[split[0]] = float(split[1])
    
mol_dict = {}
for contcar in paths['Alloys']['geom'].glob('./*.contcar'):
    mol_dict[contcar.stem] = file_to_mol(contcar, 'contcar', bulk=False)

In [27]:
Alloys = namedtuple('Alloys', ['code', 'mol', 'graph', 'energy'])
surf_ener = {}
for key, value in ener_dict.items():
    if '0000' in key:
        setn, elem, _ = key.split('-')
        surf_ener[f'{setn}-{elem}'] = float(value)

Alloys_dict = {}
for key, mol in mol_dict.items():
    setn, elem, _ = key.split('-')
    try:
        energy = ener_dict[key] - surf_ener[f'{setn}-{elem}']
    except KeyError:
        print(f'{key} not found')
        continue
    try:
        mol = mol_to_ensemble(mol, voronoi_tolerance)
        graph = ensemble_to_graph(mol)
    except ValueError:
        print(f'{key} not converting to graph')
        continue

    Alloys_dict[key] = Alloys(code=key, mol=mol, graph=graph, energy=energy)

sa-agir-0011 not found
ol-oszn-0000 not converting to graph
sa-nizn-0000 not converting to graph
sa-auos-0011 not found
ss-agau-0000 not converting to graph
ss-znpt-0000 not converting to graph
sa-curh-0000 not converting to graph
ss-irre-0000 not converting to graph
sa-znru-0000 not converting to graph
sa-ospd-0000 not converting to graph
sa-ptzn-0000 not converting to graph
ol-irag-0000 not converting to graph
sa-ptcd-0000 not converting to graph
sa-agir-0000 not converting to graph
ol-rure-0000 not converting to graph
sa-znni-0000 not converting to graph
ol-znir-0000 not converting to graph
sa-rure-0000 not converting to graph
ol-pdfe-0000 not converting to graph
sa-cdag-0000 not converting to graph
ss-cdag-0000 not converting to graph
sa-nipt-0000 not converting to graph
sa-rhcu-0000 not converting to graph
ss-ruco-0000 not converting to graph
ol-rurh-0000 not converting to graph
sa-rhco-0000 not converting to graph
ss-nipd-0000 not converting to graph
sa-rhni-0000 not converting t

ValueError: not enough values to unpack (expected 3, got 1)

In [28]:
export_tuples(paths['Alloys']['dataset'], Alloys_dict)
geometry_to_graph_analysis("Alloys")

Dataset Name: Alloys
Dataset Size: 617
Number of badly represented data: 11
Percentage of bad representations: 1.78%
-------------------------------------------


(11,
 ['ss-ruco-1401',
  'ol-rupt-0111',
  'ss-cdni-2621',
  'ss-pdcd-2611',
  'ss-ptfe-0111',
  'ss-cdrh-2402',
  'sa-ptpd-1411',
  'ss-ruzn-1401',
  'ss-cdcu-2423',
  'sa-nios-1401',
  'ol-osfe-1401'],
 617)