In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

# read in features

In [4]:
# Load the per-sample feature table that the cells below index by column name.
features = pd.read_csv(filepath_or_buffer='final_feature_array.csv')

# group feature names by units

In [8]:
def _with_e2_counterparts(e1_names):
    """Return ``e1_names`` followed by copies with every 'e1' replaced by 'e2'.

    The input list is not modified; a new list is returned.
    """
    return e1_names + [name.replace('e1', 'e2') for name in e1_names]


# Each list holds the column names of one unit group: the e1_* names first,
# then their e2_* counterparts in the same order.
ptable_number_feats = _with_e2_counterparts(
    ['e1_Number', 'e1_MendeleevNumber', 'e1_Column', 'e1_Row']
)
mass_feats = _with_e2_counterparts(['e1_AtomicWeight'])
temperature_feats = _with_e2_counterparts(['e1_MeltingT'])
radii_feats = _with_e2_counterparts(['e1_CovalentRadius'])
electron_number_feats = _with_e2_counterparts([
    'e1_NsValence', 'e1_NpValence', 'e1_NdValence', 'e1_NfValence',
    'e1_NValence',
    'e1_NsUnfilled', 'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled',
    'e1_NUnfilled',
])

# Remaining feature names, kept together in a single list.
# NOTE(review): presumably these do not share a common unit with each other,
# which is why they are not split into groups like the lists above - confirm.
others = _with_e2_counterparts([
    'e1_Electronegativity', 'e1_GSbandgap', 'e1_GSmagmom',
    'e1_SpaceGroupNumber', 'e1_GSvolume_pa',
])

# get pairwise combinations of feature names per group, then compute and save the ratio of each pair

In [13]:
# Only features inside the same group are divided by each other.
all_groups = [
    ptable_number_feats,
    mass_feats,
    electron_number_feats,
    temperature_feats,
    radii_feats,
]

all_dimensionless_features = []
all_dimensionless_feature_names = []

for unit_group in all_groups:
    # The group containing 'e1_NpValence' has 1 added to both numerator and
    # denominator before dividing; every other group is divided as-is.
    # NOTE(review): presumably this avoids dividing by a zero count - confirm.
    shift = 1 if 'e1_NpValence' in unit_group else 0

    # Every unordered pair of names in the group, in the order
    # itertools.combinations yields them.
    for numerator, denominator in combinations(unit_group, 2):
        ratio = (features[numerator] + shift) / (features[denominator] + shift)
        all_dimensionless_features.append(ratio.to_numpy())
        all_dimensionless_feature_names.append(f"{numerator} / {denominator}")

# vstack puts one ratio per row; the transpose turns each ratio into a column
# so the array lines up with the list of column names.
dimensionless_feature_array = np.vstack(all_dimensionless_features).T
dimensionless_feature_df = pd.DataFrame(
    data=dimensionless_feature_array,
    columns=all_dimensionless_feature_names,
)

# Persist the ratio table (written with pandas' default to_csv settings).
dimensionless_feature_df.to_csv('final_dimensionless_feature_array.csv')

# clean out features with NaNs (first, count the NaNs in each feature)

In [14]:
# Count, column by column, how many entries of the ratio table are NaN.
# NOTE(review): isna() does not flag +/-inf by default, so a ratio with a
# zero denominator and non-zero numerator would not be counted here.
nan_mask = dimensionless_feature_df.isna().to_numpy()
number_of_nans_per_feature = nan_mask.sum(axis=0)

print(number_of_nans_per_feature)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
