In [1]:
import functools
import random

import pandas as pd
import numpy as np

import sklearn.preprocessing
import sklearn.feature_selection
import sklearn.pipeline

import ase

from dscribe.descriptors import SineMatrix

In [2]:
# Read the data
datafile = "raw_data/2d_mat_dataset_raw.pkl"
data = pd.read_pickle(datafile)
initial_size = len(data)
data.head()

data.describe()

Unnamed: 0,energy_per_atom (eV),decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),energy_vdw_per_atom (eV/atom),total_magnetization (Bohr Magneton),bandgap (eV),ave:atomic_number,ave:atomic_radius,ave:atomic_radius_rahm,ave:atomic_volume,...,min:num_s_valence,min:period,min:specific_heat,min:thermal_conductivity,min:vdw_radius,min:vdw_radius_alvarez,min:vdw_radius_mm3,min:vdw_radius_uff,min:sound_velocity,min:Polarizability
count,6156.0,6155.0,4527.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,...,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0
mean,-4.881024,0.267288,0.150947,-3.906346,1.141026,1.048038,30.551707,145.693877,218.450531,16.383647,...,1.694379,2.95733,0.321291,7.61077,172.140293,179.218863,207.894032,331.31636,1768.938491,3.019394
std,1.594898,0.354539,0.204484,1.749797,3.204535,1.44389,16.259585,16.673223,22.887152,4.397442,...,0.494663,1.144204,0.221893,23.357869,24.09242,31.254298,26.685245,49.932611,1462.546851,2.935489
min,-10.724971,0.0,-1.218706,-9.769199,-10.014716,0.0,2.0,85.0,163.0,4.6,...,0.0,1.0,0.113,0.00565,110.0,120.0,162.0,245.1,206.0,0.557
25%,-5.960577,0.039261,0.058429,-5.128683,-0.0,0.0,17.5,137.258855,197.569231,13.65,...,1.0,2.0,0.159,0.02658,152.0,150.0,182.0,292.9,317.5,0.802
50%,-4.794826,0.146934,0.08923,-3.912666,0.0,0.2215,28.0,145.375888,222.272727,15.643333,...,2.0,3.0,0.234,0.12,175.0,182.0,215.0,317.0,1790.0,2.9
75%,-3.748102,0.359714,0.166131,-2.637456,0.245163,1.82205,41.732143,153.826394,236.0,18.3,...,2.0,4.0,0.443,0.52,190.0,191.0,229.0,352.2,2826.915883,4.31
max,-0.216389,3.469618,3.056498,1.782194,44.005684,9.4494,85.666667,235.0,293.0,47.45,...,2.0,6.0,1.824,235.0,268.0,303.0,307.0,450.0,18350.0,39.7


In [3]:
target_cols = ["decomposition_energy (eV/atom)", "exfoliation_energy_per_atom (eV/atom)", "bandgap (eV)"]
def should_keep_col(col):
    if "ave" not in col:
        return False
    
    if "num" in col:
        return False
    
    radius_to_keep= "atomic_radius"
    if "radius" in col and col != radius_to_keep:
        return False
    
    return True
average_cols = data.columns[[True if should_keep_col(i) else False for i in data.columns]]
separated_atoms_col = "sum:gs_energy"
atoms_obj_col = "atoms_object (unitless)"

new_data = data[target_cols + [atoms_obj_col] + list(average_cols) + [separated_atoms_col]].dropna().reset_index().drop(columns=["index"])
new_data.head()

Unnamed: 0,decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),bandgap (eV),atoms_object (unitless),ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:density,...,ave:heat_of_formation,ave:lattice_constant,ave:melting_point,ave:molar_volume,ave:period,ave:specific_heat,ave:thermal_conductivity,ave:sound_velocity,ave:Polarizability,sum:gs_energy
0,0.7998,0.234617,0.0,"(Atom('Ir', [0.0, 0.0, 0.0], index=0), Atom('F...",14.246667,76.737935,1524.34,137.86596,180.8,8.212,...,275.89,5.034,930.02,10.306667,3.333333,0.873564,50.018467,3492.943922,2.904667,-12.244633
1,0.300023,0.210645,0.0,"(Atom('Ba', [2.476683476681, 1.429910903420999...",32.133333,132.138,1909.333333,20.4,3861.333333,4.563667,...,207.533333,4.85,969.3,31.503333,5.666667,0.196333,20.0,2220.0,28.666667,-7.964407
2,0.177412,0.095794,0.9814,"(Atom('Tl', [2.63896615613751, 10.292177253854...",16.35,118.22,1223.912,25.35,324.5,6.96,...,229.685,6.965,481.3,16.375,4.5,0.43,23.1025,2322.236799,5.25,-38.219365
3,0.485815,-0.055818,0.0,"(Atom('Mo', [1.5833675, 2.687975714894, 2.6388...",15.6,55.616667,1787.4,77.4,408.066667,4.446667,...,300.528,5.21,1078.133333,14.72,3.666667,0.577305,46.339267,2200.666667,5.72,-28.829951
4,0.011699,0.084831,1.1619,"(Atom('Ru', [0.0, 0.0, 0.0], index=0), Atom('I...",19.9,118.29298,1696.0,78.466667,529.0,7.423333,...,288.038,6.046667,1118.8,19.87,5.0,0.733879,40.299333,4182.809958,6.766667,-12.205438


In [4]:
largest_system =  new_data["atoms_object (unitless)"].apply(len).max()
sm = SineMatrix(
    n_atoms_max = largest_system,
    permutation = "eigenspectrum",
    sparse = False,
    flatten = True
)

def get_sm(atoms):
    new_cols = sm.create(atoms).reshape(1,-1).flatten()
    return new_cols

raw_sines = new_data["atoms_object (unitless)"].apply(get_sm)
refined_sines = np.vstack(raw_sines)

# This results on some very small (e.g. 10^-14) imaginary components. We'll remove those.
refined_sines = np.real(refined_sines)

sine_df = pd.DataFrame(refined_sines, columns=[f"sine_eigenspectrum_{i}" for i in range(sm.n_atoms_max)])
sine_df.head()

Unnamed: 0,sine_eigenspectrum_0,sine_eigenspectrum_1,sine_eigenspectrum_2,sine_eigenspectrum_3,sine_eigenspectrum_4,sine_eigenspectrum_5,sine_eigenspectrum_6,sine_eigenspectrum_7,sine_eigenspectrum_8,sine_eigenspectrum_9,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,16856.141256,118.039693,68.861622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8731.348669,7374.156902,5853.653872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23781.616801,19590.908986,19194.851587,17739.064135,17049.645133,16885.048067,447.308965,374.486486,366.879469,360.779234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4280.269153,3656.874767,527.097019,411.929779,393.559148,392.251081,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7822.386788,6252.02037,4072.319314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
featurized_data = pd.concat([new_data, sine_df], axis=1).drop(columns=["atoms_object (unitless)"])
featurized_data.head()

Unnamed: 0,decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),bandgap (eV),ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:density,ave:dipole_polarizability,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,0.7998,0.234617,0.0,14.246667,76.737935,1524.34,137.86596,180.8,8.212,19.466667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.300023,0.210645,0.0,32.133333,132.138,1909.333333,20.4,3861.333333,4.563667,192.733333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.177412,0.095794,0.9814,16.35,118.22,1223.912,25.35,324.5,6.96,38.315,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.485815,-0.055818,0.0,15.6,55.616667,1787.4,77.4,408.066667,4.446667,40.786667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.011699,0.084831,1.1619,19.9,118.29298,1696.0,78.466667,529.0,7.423333,44.733333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Get datasets
decomp_target = "decomposition_energy (eV/atom)"
exfol_target = "exfoliation_energy_per_atom (eV/atom)"
bg_target = "bandgap (eV)"

descriptor_cols = list(featurized_data.columns[3:])
def get_dataset_subset(target_column, feature_columns, full_dataset):
    # Generate Dataset
    initial_size = len(full_dataset)
    result_data = full_dataset[[target_column] + feature_columns].dropna()
    print(f"Dropped {initial_size - len(result_data)} missing rows for target {target_column}")
    return result_data

data_extractor = functools.partial(get_dataset_subset, feature_columns=descriptor_cols, full_dataset=featurized_data)

decomp_data = data_extractor(decomp_target)
exfol_data = data_extractor(exfol_target)
bg_data = data_extractor(bg_target)

Dropped 0 missing rows for target decomposition_energy (eV/atom)
Dropped 0 missing rows for target exfoliation_energy_per_atom (eV/atom)
Dropped 0 missing rows for target bandgap (eV)


In [7]:
# Train/Test Split
np.random.seed(1234)
random.seed(1234)

decomp_train, decomp_test = sklearn.model_selection.train_test_split(decomp_data, test_size=0.2)
exfol_train, exfol_test = sklearn.model_selection.train_test_split(exfol_data, test_size=0.2)
bg_train, bg_test = sklearn.model_selection.train_test_split(bg_data, test_size=0.2)

In [8]:
dataset_mean_path = "dataset_means_stds"
# Scale the dataset
def z_score_scale(dataset, mean=None, std=None):
    if mean is None:
        mean = dataset.mean()
    if std is None:
        std = dataset.std()
    
    result = (dataset - mean) / std
    
    return result.copy(), mean, std

decomp_scaled, decomp_mean, decomp_std = z_score_scale(decomp_train)
decomp_mean.to_pickle(f"{dataset_mean_path}/decomp_mean.pkl")
decomp_mean.to_pickle(f"{dataset_mean_path}/decomp_std.pkl")

exfol_scaled, exfol_mean, exfol_std = z_score_scale(exfol_train)
exfol_mean.to_pickle(f"{dataset_mean_path}/exfol_mean.pkl")
exfol_std.to_pickle(f"{dataset_mean_path}/exfol_std.pkl")

bg_scaled, bg_mean, bg_std = z_score_scale(bg_train)
bg_mean.to_pickle(f"{dataset_mean_path}/bg_mean.pkl")
bg_std.to_pickle(f"{dataset_mean_path}/bg_std.pkl")

In [9]:
# Write to CSV
decomp_scaled.to_csv("scaled_featurized_train/scaled_decomp_train.csv")
exfol_scaled.to_csv("scaled_featurized_train/scaled_exfol_train.csv")
bg_scaled.to_csv("scaled_featurized_train/scaled_bg_train.csv")

In [10]:
# Scale the test set

decomp_test_scaled, _, _ = z_score_scale(decomp_test, decomp_mean, decomp_std)
decomp_test_scaled.to_csv("scaled_featurized_test/scaled_decomp_test.csv")

exfol_test_scaled, _, _ = z_score_scale(exfol_test, exfol_mean, exfol_std)
exfol_test_scaled.to_csv("scaled_featurized_test/scaled_exfol_test.csv")

bg_test_scaled, _, _ = z_score_scale(bg_test, bg_mean, bg_std)
bg_test_scaled.to_csv("scaled_featurized_test/scaled_bg_test.csv")