In [184]:
import pandas as pd
import numpy as np

import sklearn.preprocessing
import sklearn.feature_selection
import sklearn.pipeline
import matplotlib.pyplot as plt
import ase
import xgboost

from dscribe.descriptors import SineMatrix


In [2]:
# Read the data
datafile = "/Users/mat3ra/sisso_collab/DigitalEcosystem/data/2d_mat_dataset_raw.pkl"
data = pd.read_pickle(datafile)
initial_size = len(data)
data.head()

data.describe()

Unnamed: 0,energy_per_atom (eV),decomposition_energy (eV/atom),exfoliation_energy_per_atom (eV/atom),energy_vdw_per_atom (eV/atom),total_magnetization (Bohr Magneton),bandgap (eV),ave:atomic_number,ave:atomic_radius,ave:atomic_radius_rahm,ave:atomic_volume,...,min:num_s_valence,min:period,min:specific_heat,min:thermal_conductivity,min:vdw_radius,min:vdw_radius_alvarez,min:vdw_radius_mm3,min:vdw_radius_uff,min:sound_velocity,min:Polarizability
count,6156.0,6155.0,4527.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,...,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0,6351.0
mean,-4.881024,0.267288,0.150947,-3.906346,1.141026,1.048038,30.551707,145.693877,218.450531,16.383647,...,1.694379,2.95733,0.321291,7.61077,172.140293,179.218863,207.894032,331.31636,1768.938491,3.019394
std,1.594898,0.354539,0.204484,1.749797,3.204535,1.44389,16.259585,16.673223,22.887152,4.397442,...,0.494663,1.144204,0.221893,23.357869,24.09242,31.254298,26.685245,49.932611,1462.546851,2.935489
min,-10.724971,0.0,-1.218706,-9.769199,-10.014716,0.0,2.0,85.0,163.0,4.6,...,0.0,1.0,0.113,0.00565,110.0,120.0,162.0,245.1,206.0,0.557
25%,-5.960577,0.039261,0.058429,-5.128683,-0.0,0.0,17.5,137.258855,197.569231,13.65,...,1.0,2.0,0.159,0.02658,152.0,150.0,182.0,292.9,317.5,0.802
50%,-4.794826,0.146934,0.08923,-3.912666,0.0,0.2215,28.0,145.375888,222.272727,15.643333,...,2.0,3.0,0.234,0.12,175.0,182.0,215.0,317.0,1790.0,2.9
75%,-3.748102,0.359714,0.166131,-2.637456,0.245163,1.82205,41.732143,153.826394,236.0,18.3,...,2.0,4.0,0.443,0.52,190.0,191.0,229.0,352.2,2826.915883,4.31
max,-0.216389,3.469618,3.056498,1.782194,44.005684,9.4494,85.666667,235.0,293.0,47.45,...,2.0,6.0,1.824,235.0,268.0,303.0,307.0,450.0,18350.0,39.7


# Ideas for Descriptors
- Similar to the BCM, some measure of how under-coordinated the atoms are relative to their bulk versions
    - Might be harder for things like Oxygen, for-which we could just use number of covalent bonds or something
- Keep the weighted averages, they might be useful
- If we're looking at decomposition energies, what about the energy of the constituent elements?

In [138]:
target_col = "decomposition_energy (eV/atom)"
def should_keep_col(col):
    if "ave" not in col:
        return False
    
    if "num" in col:
        return False
    
    radius_to_keep= "atomic_radius"
    if "radius" in col and col != radius_to_keep:
        return False
    
    return True
average_cols = data.columns[[True if should_keep_col(i) else False for i in data.columns]]
separated_atoms_col = "sum:gs_energy"
atoms_obj_col = "atoms_object (unitless)"

new_data = data[[target_col] + [atoms_obj_col] + list(average_cols) + [separated_atoms_col]].dropna().reset_index()

In [107]:
largest_system =  new_data["atoms_object (unitless)"].apply(len).max()
sm = SineMatrix(
    n_atoms_max = largest_system,
    permutation = "eigenspectrum",
    sparse = False,
    flatten = True
)

In [108]:
def get_sm(atoms):
    new_cols = sm.create(atoms).reshape(1,-1).flatten()
    return new_cols

raw_soap = new_data["atoms_object (unitless)"].apply(get_sm)
refined_soap = np.vstack(raw_soap)
# This results on some very small (e.g. 10^-14) imaginary components. We'll remove those.
refined_soap = np.real(refined_soap)
soap_df = pd.DataFrame(refined_soap, columns=[f"sine_eigenspectrum_{i}" for i in range(sm.n_atoms_max)])
soap_df

Unnamed: 0,sine_eigenspectrum_0,sine_eigenspectrum_1,sine_eigenspectrum_2,sine_eigenspectrum_3,sine_eigenspectrum_4,sine_eigenspectrum_5,sine_eigenspectrum_6,sine_eigenspectrum_7,sine_eigenspectrum_8,sine_eigenspectrum_9,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,16856.141256,118.039693,68.861622,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8731.348669,7374.156902,5853.653872,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23781.616801,19590.908986,19194.851587,17739.064135,17049.645133,16885.048067,447.308965,374.486486,366.879469,360.779234,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4280.269153,3656.874767,527.097019,411.929779,393.559148,392.251081,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16855.872988,88.739153,50.397620,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6150,18475.328014,379.254223,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6151,19377.892842,15411.764801,390.291055,351.878739,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6152,19497.938525,15419.683739,2364.116363,2211.543868,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6153,19433.298519,15353.846145,403.942694,340.740078,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [194]:
data_scaled = pd.concat([new_data, soap_df], axis=1).drop(columns=["atoms_object (unitless)", "index"])
data_means = data_scaled.mean()
data_std = data_scaled.std()
data_scaled = ((data_scaled - data_means) / data_std)
data_scaled

Unnamed: 0,decomposition_energy (eV/atom),ave:atomic_volume,ave:atomic_weight,ave:boiling_point,ave:bulk_modulus,ave:c6_gb,ave:density,ave:dipole_polarizability,ave:electron_negativity,ave:electron_affinity,...,sine_eigenspectrum_30,sine_eigenspectrum_31,sine_eigenspectrum_32,sine_eigenspectrum_33,sine_eigenspectrum_34,sine_eigenspectrum_35,sine_eigenspectrum_36,sine_eigenspectrum_37,sine_eigenspectrum_38,sine_eigenspectrum_39
0,1.501986,-0.480711,0.127937,0.052961,1.898666,-0.613208,1.362822,-0.825865,1.892067,1.477113,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
1,0.092331,3.607515,1.469130,0.513799,-1.110846,6.097466,-0.041786,4.689290,-2.306640,-1.352810,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
2,-0.253503,0.000032,1.132186,-0.306653,-0.984025,-0.351202,0.880802,-0.225914,-0.668282,-0.408589,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
3,0.616371,-0.171390,-0.383393,0.367845,0.349511,-0.198836,-0.086831,-0.147240,0.777719,1.319041,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
4,-0.085177,-0.953074,0.079528,0.057094,2.399237,-0.605308,1.373345,-0.793185,1.175700,-0.084632,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6150,-0.331450,-0.274243,1.086334,-0.965167,-1.214608,-0.570908,1.207282,-0.509046,-0.290199,0.707538,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
6151,0.344825,-0.925647,1.019661,1.111797,1.411472,-0.386756,2.728802,-0.348461,-0.011612,0.648566,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
6152,-0.010709,-0.811365,1.587501,1.255603,1.419158,-0.301973,3.252402,-0.327612,-0.041460,0.614494,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162
6153,-0.236884,-0.925647,1.019661,1.111797,1.411472,-0.386756,2.728802,-0.348461,-0.011612,0.648566,...,-0.062972,-0.06265,-0.054582,-0.054348,-0.051867,-0.051437,-0.036926,-0.03673,-0.036333,-0.025162


In [195]:
data_scaled.to_csv("new_test_dataset.csv")