In [11]:
import functools
import random

import pandas as pd
import numpy as np

import sklearn.preprocessing
import sklearn.feature_selection
import sklearn.pipeline

import ase

from dscribe.descriptors import SineMatrix

In [12]:
# Read the data
datafile = "../raw_data/2d_mat_dataset_raw.pkl"
data = pd.read_pickle(datafile)
data = data[data["discovery_process (unitless)"] == "top-down"]
initial_size = len(data)

data.head()

Unnamed: 0,2dm_id (unitless),formula,discovery_process (unitless),atoms_object (unitless),potcars (unitless),is_hubbard (unitless),energy_per_atom (eV),decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),energy_vdw_per_atom (eV/atom),...,min:num_s_valence,min:period,min:specific_heat,min:thermal_conductivity,min:vdw_radius,min:vdw_radius_alvarez,min:vdw_radius_mm3,min:vdw_radius_uff,min:sound_velocity,min:Polarizability
2979,2dm-2990,O,top-down,"(Atom('O', [0.0, 0.0, 10.0], index=0))",[PAW_PBE O 08Apr2002],False,-3.4164,1.519128,0.072404,-2.816888,...,2.0,2.0,1.079187,0.02658,152.0,150.0,182.0,350.0,317.5,0.802
2980,2dm-2991,C3N,top-down,"(Atom('C', [0.0, 2.368312135, 10.0001167924674...","[PAW_PBE C 08Apr2002, PAW_PBE N 08Apr2002]",False,-7.118129,1.788156,0.087979,-7.049869,...,2.0,2.0,0.711,0.02583,155.0,166.0,193.0,366.0,333.6,1.1
2981,2dm-2992,GaN,top-down,"(Atom('Ga', [-3.493867195558664e-07, 1.8541533...","[PAW_PBE Ga_d 06Sep2000, PAW_PBE N 08Apr2002]",False,-5.803293,0.354161,0.066609,-4.443864,...,2.0,2.0,0.372,0.02583,155.0,166.0,193.0,366.0,333.6,1.1
2982,2dm-2993,LiB,top-down,"(Atom('Li', [0.771551245, 1.5362468836519032, ...","[PAW_PBE Li_sv 23Jan2001, PAW_PBE B 06Sep2000]",False,-3.820308,0.6507,0.687758,-3.505639,...,1.0,2.0,1.025,27.0,182.0,191.0,215.0,245.1,6000.0,3.03
2983,2dm-2994,Ga,top-down,"(Atom('Ga', [0.0, 1.9967273025, 12.48843928789...",[PAW_PBE Ga_d 06Sep2000],False,-2.800399,0.236834,,-0.306596,...,2.0,4.0,0.372,29.0,187.0,232.0,246.0,438.3,2740.0,8.12


In [3]:
target_cols = ["decomposition_energy (eV/atom)", "exfoliation_energy_per_atom (eV/atom)", "bandgap (eV)"]
def should_keep_col(col):
    if "ave" not in col:
        return False
    
    if "num" in col:
        return False
    
    radius_to_keep= "atomic_radius"
    if "radius" in col and col != radius_to_keep:
        return False
    
    return True
average_cols = data.columns[[True if should_keep_col(i) else False for i in data.columns]]
separated_atoms_col = "sum:gs_energy"
atoms_obj_col = "atoms_object (unitless)"

new_data = data[target_cols + [atoms_obj_col] + list(average_cols) + [separated_atoms_col]].dropna().reset_index().drop(columns=["index"])
new_data.head()

Unnamed: 0,decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),bandgap (eV),atoms_object (unitless),ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:density,...,ave:heat_of_formation,ave:lattice_constant,ave:melting_point,ave:molar_volume,ave:period,ave:specific_heat,ave:thermal_conductivity,ave:sound_velocity,ave:Polarizability,sum:gs_energy
0,1.519128,0.072404,0.0,"(Atom('O', [0.0, 0.0, 10.0], index=0))",14.0,15.999,90.19,76.106041,16.7,1.149,...,249.229,6.83,54.8,17.36,2.0,1.079187,0.02658,317.5,0.802,-4.756792
1,1.788156,0.087979,0.0,"(Atom('C', [0.0, 2.368312135, 10.0001167924674...",8.3,12.51,3844.35,39.091221,42.35,1.8895,...,655.7625,3.68725,2880.8225,7.3525,2.0,0.726487,105.006457,13845.9,1.5275,-35.840582
2,0.354161,0.066609,2.1592,"(Atom('Ga', [-3.493867195558664e-07, 1.8541533...",14.55,41.865,1376.7,69.89102,240.85,3.359,...,372.2,4.2745,183.11,12.67,3.0,0.572474,14.512915,1536.8,4.61,-11.223697
3,0.6507,0.687758,0.0,"(Atom('Li', [0.771551245, 1.5362468836519032, ...",8.85,8.875,2524.575,165.5,754.6,1.437,...,362.15,6.11,1563.345,8.705,2.0,2.257,56.0,11100.0,13.68,-17.09517
4,1.02989,0.081753,0.6616,"(Atom('Sb', [2.1622899077079296, 1.24839878493...",19.66,125.264,1521.0,55.8,484.2,6.4204,...,223.72,4.474,795.18,19.552,5.0,0.2026,11.4,2934.0,5.94,-17.658632


In [4]:
largest_system =  new_data["atoms_object (unitless)"].apply(len).max()
sm = SineMatrix(
    n_atoms_max = largest_system,
    permutation = "eigenspectrum",
    sparse = False,
    flatten = True
)

def get_sm(atoms):
    new_cols = sm.create(atoms).reshape(1,-1).flatten()
    return new_cols

raw_sines = new_data["atoms_object (unitless)"].apply(get_sm)
refined_sines = np.vstack(raw_sines)

# This results on some very small (e.g. 10^-14) imaginary components. We'll remove those.
refined_sines = np.real(refined_sines)

sine_df = pd.DataFrame(refined_sines, columns=[f"sine_eigenspectrum_{i}" for i in range(sm.n_atoms_max)])
sine_df.head()

Unnamed: 0,sine_eigenspectrum_0,sine_eigenspectrum_1,sine_eigenspectrum_2,sine_eigenspectrum_3,sine_eigenspectrum_4,sine_eigenspectrum_5,sine_eigenspectrum_6,sine_eigenspectrum_7,sine_eigenspectrum_8,sine_eigenspectrum_9,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,73.516695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,77.022144,37.566835,29.052903,20.29114,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1902.135897,48.968744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33.473615,15.695111,6.604666,5.784566,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7904.129492,6479.136296,6120.606666,5876.215194,5857.6844,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
featurized_data = pd.concat([new_data, sine_df], axis=1).drop(columns=["atoms_object (unitless)"])
featurized_data

Unnamed: 0,decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),bandgap (eV),ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:density,ave:dipole_polarizability,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,1.519128,0.072404,0.0000,14.000000,15.999000,90.190000,76.106041,16.700000,1.149000,5.240000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
1,1.788156,0.087979,0.0000,8.300000,12.510000,3844.350000,39.091221,42.350000,1.889500,17.297500,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
2,0.354161,0.066609,2.1592,14.550000,41.865000,1376.700000,69.891020,240.850000,3.359000,28.750000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
3,0.650700,0.687758,0.0000,8.850000,8.875000,2524.575000,165.500000,754.600000,1.437000,92.265000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
4,1.029890,0.081753,0.6616,19.660000,125.264000,1521.000000,55.800000,484.200000,6.420400,39.080000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2329,0.000000,0.123656,1.0021,17.962500,77.330871,799.325000,32.200000,190.325000,4.800875,29.460000,...,314.901853,306.883103,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
2330,0.232824,0.081378,0.1976,11.743333,27.998033,1041.793333,104.070694,247.466667,3.159333,29.626667,...,46.074641,39.697567,39.406868,37.025055,35.707178,5.584997,0.0,0.0,0.0,0.0
2331,0.221992,0.053196,0.9018,11.390000,53.561143,1710.108571,127.774880,193.114286,5.973714,26.994286,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
2332,0.347647,0.198842,0.0000,12.510000,37.553784,1267.214000,91.763625,349.220000,3.824400,32.394000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [6]:
# Get datasets
decomp_target = "decomposition_energy (eV/atom)"
exfol_target = "exfoliation_energy_per_atom (eV/atom)"
bg_target = "bandgap (eV)"

descriptor_cols = list(featurized_data.columns[3:])
def get_dataset_subset(target_column, feature_columns, full_dataset):
    # Generate Dataset
    initial_size = len(full_dataset)
    result_data = full_dataset[[target_column] + feature_columns].dropna()
    print(f"Dropped {initial_size - len(result_data)} missing rows for target {target_column}")
    return result_data

data_extractor = functools.partial(get_dataset_subset, feature_columns=descriptor_cols, full_dataset=featurized_data)

decomp_data = data_extractor(decomp_target)
exfol_data = data_extractor(exfol_target)
bg_data = data_extractor(bg_target)

Dropped 0 missing rows for target decomposition_energy (eV/atom)
Dropped 0 missing rows for target exfoliation_energy_per_atom (eV/atom)
Dropped 0 missing rows for target bandgap (eV)


In [7]:
# Train/Test Split
np.random.seed(1234)
random.seed(1234)

decomp_train, decomp_test = sklearn.model_selection.train_test_split(decomp_data, test_size=0.2)
exfol_train, exfol_test = sklearn.model_selection.train_test_split(exfol_data, test_size=0.2)
bg_train, bg_test = sklearn.model_selection.train_test_split(bg_data, test_size=0.2)

In [8]:
dataset_mean_path = "dataset_means_stds"
# Scale the dataset
def z_score_scale(dataset, mean=None, std=None):
    if mean is None:
        mean = dataset.mean()
    if std is None:
        std = dataset.std()
    
    result = (dataset - mean) / std
    
    return result.copy(), mean, std

decomp_scaled, decomp_mean, decomp_std = z_score_scale(decomp_train)
decomp_mean.to_pickle(f"{dataset_mean_path}/topdown_decomp_mean.pkl")
decomp_mean.to_pickle(f"{dataset_mean_path}/topdown_decomp_std.pkl")

exfol_scaled, exfol_mean, exfol_std = z_score_scale(exfol_train)
exfol_mean.to_pickle(f"{dataset_mean_path}/topdown_exfol_mean.pkl")
exfol_std.to_pickle(f"{dataset_mean_path}/topdown_exfol_std.pkl")

bg_scaled, bg_mean, bg_std = z_score_scale(bg_train)
bg_mean.to_pickle(f"{dataset_mean_path}/topdown_bg_mean.pkl")
bg_std.to_pickle(f"{dataset_mean_path}/topdown_bg_std.pkl")

In [9]:
# Write to CSV
decomp_scaled.to_csv("scaled_featurized_train/scaled_topdown_decomp_train.csv")
exfol_scaled.to_csv("scaled_featurized_train/scaled_topdown_exfol_train.csv")
bg_scaled.to_csv("scaled_featurized_train/scaled_topdown_bg_train.csv")

In [10]:
# Scale the test set

decomp_test_scaled, _, _ = z_score_scale(decomp_test, decomp_mean, decomp_std)
decomp_test_scaled.to_csv("scaled_featurized_test/scaled_topdown_decomp_test.csv")

exfol_test_scaled, _, _ = z_score_scale(exfol_test, exfol_mean, exfol_std)
exfol_test_scaled.to_csv("scaled_featurized_test/scaled_topdown_exfol_test.csv")

bg_test_scaled, _, _ = z_score_scale(bg_test, bg_mean, bg_std)
bg_test_scaled.to_csv("scaled_featurized_test/scaled_topdown_bg_test.csv")