# Prepare Open Catalyst project data into graph format

---

### Overview
1. Inspect raw data
2. Process raw data
---

In [1]:
import hyper
import prep_open_catalyst

# Single hyperparameter/configuration object for this notebook; it is passed
# to the prep_open_catalyst import and processing calls in the cells below.
HYPER = hyper.HyperParameter()

### 1. Inspect raw data

We print all the information that the raw data files contain, together with everything we can deduce from their atomic models.

In [2]:
# Load a sample of the raw data: the call returns the sampled dataset and one
# individual data point taken for inspection.
sample_dataset, sample_datapoint = prep_open_catalyst.import_raw_data_samples(HYPER)
# Print the properties of that single data point (see the cell output).
prep_open_catalyst.print_features_of_datapoint(sample_datapoint)

Importing sample file 105.extxyz.xz 

The sampled dataset contains 95 data points, each consisting of a constellation of atoms.

 A single datapoint contains the following properties.

 Global number of atoms:
 100

 Chemical formula:
 C2HAl48ORe48

 Symbols concise:
 Al48Re48C2HO

 Energy:
 -729.60778439

 Free energy:
 -729.61016215

 Volume:
 5585.237204315779

 Center of mass:
 [ 8.76943345  5.9739095  17.60965115]

 Periodic boundary condition (pbc):
 [ True  True  True]

 Cell:
 Cell([[13.19698417, 0.0, 4.39899472], [4.39899472, 12.02606366, -4.39899472], [0.0, 0.0, 35.19195779]])

 Symbols extensive:
 ['Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Al', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re', 'Re',

### 2. Process raw data

We save the following information, from which we derive the essential features and labels for our supervised learning task:
* number of atoms (n_atoms)
* structure atom symbols (symbols_atoms)
* energy (energy)
* positions (pos_x0, pos_y0, pos_z0, ..., pos_x(n_atoms-1), pos_y(n_atoms-1), pos_z(n_atoms-1))
* forces (force_x0, force_y0, force_z0, ..., force_x(n_atoms-1), force_y(n_atoms-1), force_z(n_atoms-1))


The following will be used as features:
* positions (pos_x0, pos_y0, pos_z0, ..., pos_x(n_atoms-1), pos_y(n_atoms-1), pos_z(n_atoms-1))


The following will be used as labels:
* energy (energy)
* forces (force_x0, force_y0, force_z0, ..., force_x(n_atoms-1), force_y(n_atoms-1), force_z(n_atoms-1))


The following will be used to simplify data processing and to dynamically augment the features with constant per-atom properties such as mass and electronegativity:
* number of atoms (n_atoms)
* structure atom symbols (symbols_atoms)



In [2]:
# Turn the raw data into the three dataset splits, driven by HYPER.
# NOTE(review): in the rendered output every row carries index 0 — presumably
# per-sample frames are concatenated without resetting the index; confirm in
# prep_open_catalyst.process_raw_data.
df_training, df_validation, df_testing = prep_open_catalyst.process_raw_data(HYPER)

# Render the splits one after another (training, validation, testing).
display(df_training, df_validation, df_testing)

Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z217,force_x218,force_y218,force_z218,force_x219,force_y219,force_z219,force_x220,force_y220,force_z220
0,51,Pt32Si16NO2,-295.518543,2.059192,1.413449,17.409729,0.000000,4.240346,15.472959,0.155177,...,,,,,,,,,,
0,80,Zn24Cl48N2OH4C,-217.602489,2.870703,11.327147,19.662934,2.870703,1.258572,13.241944,2.766959,...,,,,,,,,,,
0,57,Y54C2O,-331.155190,0.760047,0.000000,13.954261,0.760047,8.779594,11.847957,-1.064066,...,,,,,,,,,,
0,101,Cs16Ga32Au48N2H2O,-272.592324,4.542001,12.707925,13.783304,4.542001,5.871794,11.546369,1.514001,...,,,,,,,,,,
0,72,In16Rh32Zr16N2OH4C,-450.640726,0.000000,0.000000,17.390942,-0.036017,-0.009277,22.296030,2.318792,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,87,Ga32S48C2H3O2,-347.802563,0.024104,8.805467,24.856086,3.139496,8.805467,20.266154,0.024104,...,,,,,,,,,,
0,70,Sr16As48C2H2O2,-291.717012,15.452737,3.253222,14.826098,8.679610,7.176434,14.826098,13.956446,...,,,,,,,,,,
0,54,Y32In16CH4O,-256.917260,0.000000,0.000000,18.458667,0.000000,6.850464,12.989432,0.000000,...,,,,,,,,,,
0,77,V36S36C2H2O,-518.230942,0.000000,0.000000,14.347542,0.000000,3.081482,14.347542,0.000000,...,,,,,,,,,,


Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z216,force_x217,force_y217,force_z217,force_x218,force_y218,force_z218,force_x219,force_y219,force_z219
0,56,Bi32Ir16C2H5O,-265.813086,9.129397,0.131344,17.598811,12.019668,6.106863,20.370808,15.269957,...,,,,,,,,,,
0,27,Hg6K18C2H,-32.698617,7.603095,4.856809,18.789292,2.472916,2.914887,13.431425,5.092625,...,,,,,,,,,,
0,70,Sc48Cd16C2H2O2,-322.576698,4.166095,6.598977,25.639184,3.982720,0.947346,22.307611,3.982720,...,,,,,,,,,,
0,78,Nb18Rh54O4N2,-563.919910,11.712829,1.997234,16.670584,0.000000,2.007308,12.681791,-0.024537,...,,,,,,,,,,
0,100,Bi32Rh32Se32CH3,-467.607730,4.048568,5.588783,18.504596,5.588783,0.836118,15.292145,0.836118,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,57,Nb16Rh16Sb16C2H6O,-371.320339,0.000000,2.211061,17.690106,2.211061,0.000000,14.563193,0.007384,...,,,,,,,,,,
0,54,Sr16Ga16Ge16O4N2,-184.892217,0.000000,0.000000,24.987810,0.000492,0.005566,34.690785,0.000000,...,,,,,,,,,,
0,33,Cs6Pb18C2H6O,-106.079745,-2.099722,11.997883,16.884932,-0.426327,7.246034,13.372144,1.082901,...,,,,,,,,,,
0,117,Na32Al32Sb48CH3O,-375.042785,8.043329,0.499379,15.357031,8.043329,3.181778,23.197323,6.300664,...,,,,,,,,,,


Unnamed: 0,n_atoms,symbols_atoms,energy,pos_x0,pos_y0,pos_z0,pos_x1,pos_y1,pos_z1,pos_x2,...,force_z219,force_x220,force_y220,force_z220,force_x221,force_y221,force_z221,force_x222,force_y222,force_z222
0,51,Zr36Al12CHO,-337.319898,2.082514,0.291644,12.913435,3.439563e+00,4.005897,16.145344,4.799417,...,,,,,,,,,,
0,82,Sr16Tc16N48CH,-580.090151,-0.427930,1.488007,17.673698,-9.909345e-01,4.694008,20.308623,-1.283790,...,,,,,,,,,,
0,79,Fe48Nb24C2H3O2,-611.445442,2.792569,1.581581,11.873821,5.290659e+00,5.244666,13.637481,-1.450700,...,,,,,,,,,,
0,63,Al12Au48NH2,-184.189977,1.487445,4.739856,20.297399,1.973757e-01,5.584111,17.213221,2.167380,...,,,,,,,,,,
0,51,Ag12Ca24Ge12CHO,-151.544090,2.628918,1.517807,20.928400,5.000000e-08,3.035613,16.635395,-0.034536,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,43,K20As16C2H3O2,-133.649321,0.786734,1.325483,18.650389,5.509343e+00,6.322729,12.887647,2.660186,...,,,,,,,,,,
0,172,Na40Sb40S80N2C2H8,-659.593899,8.456584,2.898919,24.566519,1.550341e+01,1.529061,20.062331,3.854873,...,,,,,,,,,,
0,99,V48Ir48CHO,-808.338240,8.695449,7.150742,15.166351,8.695449e+00,0.000000,12.377137,5.621147,...,,,,,,,,,,
0,72,Ca4Fe16Sb48NO2H,-330.294158,5.775886,12.513334,16.358461,6.012250e+00,5.809998,27.578243,0.432209,...,,,,,,,,,,
