# Prepare Open Catalyst project data into graph format

---

### Overview
1. Inspect raw data
2. Process raw data
---

In [1]:
import hyper
import prep_open_catalyst

# Shared hyperparameter object; it is passed to the prep_open_catalyst
# import and processing calls in the cells below.
# NOTE(review): presumably holds data paths and split settings -- confirm in hyper.py.
HYPER = hyper.HyperParameter()

### 1. Inspect raw data

We print all the information that the raw data files contain, as well as what we can deduce from their atomic models.

In [2]:
# Import a sample of the raw data: the sampled dataset plus a single data point.
# NOTE(review): sample_datapoint is presumably one element of sample_dataset -- confirm
# in prep_open_catalyst.import_raw_data_samples.
sample_dataset, sample_datapoint = prep_open_catalyst.import_raw_data_samples(HYPER)
# Print the properties available on that single data point.
prep_open_catalyst.print_features_of_datapoint(sample_datapoint)

Importing sample file 105.extxyz.xz 

The sampled dataset contains 95 data points, each consisting of a constellation of atoms.

 A single datapoint contains the following properties.

 Global number of atoms:
 100

 Chemical formula:
 C2HAl48ORe48

 Symbols concise:
 Al48Re48C2HO

 Energy:
 -729.60778439

 Free energy:
 -729.61016215

 Volume:
 5585.237204315779

 Center of mass:
 [ 8.76943345  5.9739095  17.60965115]

 Periodic boundary condition (pbc):
 [ True  True  True]

 Cell:
 Cell([[13.19698417, 0.0, 4.39899472], [4.39899472, 12.02606366, -4.39899472], [0.0, 0.0, 35.19195779]])

 Symbols extensive:
 ['Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re',

### 2. Process raw data

We save the following information, from which we derive the essential features and labels for our supervised learning task:
* number of atoms (n_atoms)
* structure atom symbols (symbols_atoms)
* energy (energy)
* positions (pos_x0, pos_y0, pos_z0, ..., pos_x(n_atoms-1), pos_y(n_atoms-1), pos_z(n_atoms-1))
* forces (force_x0, force_y0, force_z0, ..., force_x(n_atoms-1), force_y(n_atoms-1), force_z(n_atoms-1))


The following will be used as features:
* positions (pos_x0, pos_y0, pos_z0, ..., pos_x(n_atoms-1), pos_y(n_atoms-1), pos_z(n_atoms-1))


The following will be used as labels:
* energy (energy)
* forces (force_x0, force_y0, force_z0, ..., force_x(n_atoms-1), force_y(n_atoms-1), force_z(n_atoms-1))


The following will be used to simplify data processing and to dynamically augment the features with constant per-element properties such as atomic mass and electronegativity:
* number of atoms (n_atoms)
* structure atom symbols (symbols_atoms)



In [3]:
# Turn the raw data into one table per split: training, validation and testing.
df_training, df_validation, df_testing = prep_open_catalyst.process_raw_data(HYPER)

# Render the three tables one after another, in split order.
display(df_training, df_validation, df_testing)

Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z217,force_x218,force_y218,force_z218,force_x219,force_y219,force_z219,force_x220,force_y220,force_z220
0,100,Al48Re48C2HO,-729.607784,5.498743,2.236723,16.129647,1.099749,2.236723,16.129647,3.299246,...,,,,,,,,,,
0,59,Zr36Co6Te12NH3O,-396.740199,6.778714,4.399073,19.658950,13.540526,0.448924,23.093010,5.161429,...,,,,,,,,,,
0,52,Au12Nb36NH3,-376.166771,5.375560,4.150542,18.611123,3.033585,3.597201,20.164822,5.615150,...,,,,,,,,,,
0,161,Tl80Sn20S60N,-506.784264,5.868771,1.146471,29.786739,14.882965,12.980509,41.343782,17.200255,...,,,,,,,,,,
0,68,Sb16Pt48N2H2,-330.498068,2.010649,6.305745,13.069215,0.000000,2.347826,11.058567,0.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,45,Ga18P18C2H5O2,-184.109465,0.160982,3.376109,16.264987,1.684315,0.848926,10.578389,-1.292035,...,,,,,,,,,,
0,126,Ta72Ga48C2H2O2,-937.256679,14.035666,2.094278,13.679034,7.833401,5.195411,17.209107,16.512517,...,,,,,,,,,,
0,28,K8Na8Se8C2HO,-87.313297,4.261174,6.126177,9.480781,0.113061,7.293140,9.480781,4.035052,...,,,,,,,,,,
0,94,Nb36Zn6S48N2HO,-605.946977,2.558971,12.139895,15.057044,2.558971,3.705012,19.926926,2.558971,...,,,,,,,,,,


Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z196,force_x197,force_y197,force_z197,force_x198,force_y198,force_z198,force_x199,force_y199,force_z199
0,77,Na36Cl36CH4,-238.426959,2.304497,5.410228,11.725048,2.304497,12.623865,7.733542,1.152248,...,,,,,,,,,,
0,113,Na18H40Pd54C,-419.643047,0.000000,9.053672,18.158431,0.000000,13.864958,14.991774,0.000000,...,,,,,,,,,,
0,40,K12Ag12Se12N2H2,-113.054840,2.329952,1.802302,20.382066,2.330030,10.486423,25.406623,2.330095,...,,,,,,,,,,
0,114,Bi32Pb16S64CN,-449.719246,10.750582,1.437743,11.145553,10.750582,16.223989,11.145553,1.225603,...,,,,,,,,,,
0,73,W24N48O,-656.765612,5.609639,9.217846,17.729293,5.609639,14.070460,14.145487,5.609639,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,65,Sr20Al40C2H2O,-201.365876,7.135295,1.717761,22.196052,10.416882,10.043110,25.720931,6.568591,...,,,,,,,,,,
0,72,Hf40Sn24C2H4O2,-495.871798,1.622173,0.155004,24.010325,1.781187,4.686857,21.104466,9.137697,...,,,,,,,,,,
0,35,K8Pb24NO2,-99.735432,-0.156848,2.302150,15.557514,0.000000,5.971335,11.459574,0.124318,...,,,,,,,,,,
0,79,Y12Si24Rh36C2H4O,-519.397246,2.788454,2.767648,21.588815,0.002904,7.539013,21.991052,2.773502,...,,,,,,,,,,


Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z217,force_x218,force_y218,force_z218,pos_x219,pos_y219,pos_z219,force_x219,force_y219,force_z219
0,50,K8Al8Sb32CH,-164.479406,3.244017,1.212120,21.432079,1.081339,17.275058,7.006287,3.244017,...,,,,,,,,,,
0,111,Fe72N36CHO,-820.086319,1.083596,5.448638,14.181446,1.083596,1.352707,16.546233,1.083596,...,,,,,,,,,,
0,147,Nb36Se108NH2,-766.259492,0.885124,18.661423,31.260204,0.882175,8.694135,34.724023,2.655372,...,,,,,,,,,,
0,51,Pd24Zr24NH2,-330.689572,3.446176,2.350182,19.557227,1.115743,-1.465480,20.017677,3.345106,...,,,,,,,,,,
0,55,K12Au12Se24C2H3O2,-179.391489,4.122388,1.691332,23.006517,0.237789,5.354549,22.875313,3.997387,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,16,Hg12NO2H,-19.997564,2.797863,4.229938,17.753260,5.845461,7.454179,11.116291,1.208759,...,,,,,,,,,,
0,93,Ta90CHO,-984.773643,13.195617,10.423384,14.847004,10.814500,7.320306,11.198773,15.576734,...,,,,,,,,,,
0,68,Ti48In16NO2H,-399.334529,2.088981,2.618984,23.188913,2.097521,5.548427,20.407075,2.117128,...,,,,,,,,,,
0,68,Na64NO2H,-92.481522,0.088511,9.078440,17.454201,-0.008798,21.560545,13.791487,3.384673,...,,,,,,,,,,
