# Notebook for processing the Open Quantum Materials Database (OQMD) dataset

## Check the dataset

In [12]:
# List the contents of the dataset directory with the shell's `ls`
# (the leading `!` is IPython's shell escape, so this line is not plain Python).
!ls oqmd_dataset/

License               oqmd_dataset_utils.py targets.csv
config.json           split.json
graph_data.npz        summary.txt


In [6]:
# Load the dataset license and print it between two banner lines.
# The directory listing shows the file is named "License"; the previous
# spelling "LICENSE" only resolves on case-insensitive filesystems and raises
# FileNotFoundError elsewhere, so the path uses the on-disk spelling.
# NOTE(review): `license` shadows the interactive builtin of the same name;
# the variable name is kept in case other cells refer to it.
with open('oqmd_dataset/License', 'r') as f:
    license = f.read()

print("-"*20, "LICENSE", "-"*20)
print(license)
print("-"*20, "LICENSE", "-"*20)

-------------------- LICENSE --------------------
This work, "OQMD v1.2 for CGNN", is a derivative of "OQMD" (https://oqmd.org) by Chris Wolverton's group at Northwestern University, used under CC BY 4.0. "OQMD v1.2 for CGNN" © 2019 by Takenori Yamamoto is licensed under CC BY 4.0. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/

-------------------- LICENSE --------------------


In [8]:
# Read the preprocessing summary shipped with the dataset and print it
# framed by a banner line above and below.
with open('oqmd_dataset/summary.txt', 'r') as summary_file:
    summary = summary_file.read()

summary_rule = "-" * 20
summary_banner = " ".join([summary_rule, "SUMMARY", summary_rule])

print(summary_banner)
print(summary)
print(summary_banner)

-------------------- SUMMARY --------------------

oqmd_data.py
562/562 [3:16:30<00:00, 19.71s/it]

mp_graph.py
[Parallel(n_jobs=-1)]: Done 562 out of 562 | elapsed: 186.1min finished

oqmd.py
Total Data: 561888
unique_z: 89
min z: 1
max z: 94
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 89 90 91 92 93 94]
Unary formulas: 89
Multi formulas: 338047
Train formulas: 270528
Val formulas: 33804
Test formulas: 33804
Train: 449867
Val: 56289
Test: 55732


-------------------- SUMMARY --------------------


## Load, process, and save the dataset

In [13]:
import pandas as pd

# Read the targets table. pandas' default missing-value markers are disabled
# and only the literal "_" is treated as missing.
# NOTE(review): presumably this keeps formula strings such as "NaN" from being
# parsed as missing values -- confirm against the dataset.
dataset_path = "oqmd_dataset/targets.csv"
dataset = pd.read_csv(
    dataset_path,
    na_values=['_'],
    keep_default_na=False,
)

In [14]:
# Show the first rows of the loaded table, framed by a banner line.
preview_rule = "-" * 20
preview_banner = " ".join([preview_rule, "data structure", preview_rule])

print(preview_banner)
print(dataset.head())
print(preview_banner)

-------------------- data structure --------------------
          name   formula  spacegroup  nelements  nsites  energy_per_atom  \
0  oqmd-823191  ZrZnNiMo         216          4       4        -6.399036   
1  oqmd-362450   DySc2Ir         225          3       4        -6.795189   
2  oqmd-758369       YZr          59          2      16        -7.445319   
3  oqmd-516941   CrMoAu2         225          3       4        -6.218335   
4  oqmd-344596      Ge3O         221          2       4        -4.382844   

   formation_energy_per_atom  band_gap  volume_per_atom  \
0                   0.157939       0.0        16.014199   
1                  -0.266899       0.0        19.931045   
2                   0.060478       0.0        27.308156   
3                   0.504200       0.0        15.436073   
4                   0.215750       0.0        15.868661   

   magnetization_per_atom  atomic_volume_per_atom  volume_deviation  
0                0.452473               22.799641         -0.

In [15]:
# Prepare the data for training: keep the formula column and the four
# per-structure properties listed below, dropping every other column.
kept_columns = [
    'formula',
    'energy_per_atom',
    'formation_energy_per_atom',
    'band_gap',
    'magnetization_per_atom',
]
dataset = dataset[kept_columns]

# Preview the reduced table between two banner lines.
selection_rule = "-" * 20
selection_banner = " ".join([selection_rule, "data structure", selection_rule])

print(selection_banner)
print(dataset.head())
print(selection_banner)

-------------------- data structure --------------------
    formula  energy_per_atom  formation_energy_per_atom  band_gap  \
0  ZrZnNiMo        -6.399036                   0.157939       0.0   
1   DySc2Ir        -6.795189                  -0.266899       0.0   
2       YZr        -7.445319                   0.060478       0.0   
3   CrMoAu2        -6.218335                   0.504200       0.0   
4      Ge3O        -4.382844                   0.215750       0.0   

   magnetization_per_atom  
0                0.452473  
1                0.212367  
2                0.002801  
3                0.715628  
4                0.000000  
-------------------- data structure --------------------


In [17]:
# Save the reduced table as CSV under ../data/datasets/, resolved relative to
# the current working directory.
import os

cur_dir = os.getcwd()
# NOTE: this rebinds `dataset_path`, which earlier held the input CSV path.
dataset_path = os.path.join(cur_dir, "../data/datasets/oqmd.csv")

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target directory exists before writing.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

# index=False keeps the row index out of the written file.
dataset.to_csv(dataset_path, index=False)