In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

# read in features

In [2]:
# Name of the input CSV; the same string is reused later to derive the output file name.
filename = 'cropped_final_feature_array.csv'
# Load the feature table into a DataFrame (one column per feature).
features = pd.read_csv(filepath_or_buffer=filename)

In [3]:
# List every column of the loaded table so the feature names used in the cells below can be checked.
print(features.columns)

Index(['Collection_Code', 'Formula', 'max_ionic_char', 'avg_ionic_char',
       'chg_dispro', 'e1_Number', 'e1_MendeleevNumber', 'e1_AtomicWeight',
       'e1_MeltingT', 'e1_Column', 'e1_Row', 'e1_CovalentRadius',
       'e1_Electronegativity', 'e1_NsValence', 'e1_NpValence', 'e1_NdValence',
       'e1_NfValence', 'e1_NValence', 'e1_NsUnfilled', 'e1_NpUnfilled',
       'e1_NdUnfilled', 'e1_NfUnfilled', 'e1_NUnfilled', 'e1_GSvolume_pa',
       'e1_GSbandgap', 'e1_GSmagmom', 'e2_Number', 'e2_MendeleevNumber',
       'e2_AtomicWeight', 'e2_MeltingT', 'e2_Column', 'e2_Row',
       'e2_CovalentRadius', 'e2_Electronegativity', 'e2_NsValence',
       'e2_NpValence', 'e2_NdValence', 'e2_NfValence', 'e2_NValence',
       'e2_NsUnfilled', 'e2_NpUnfilled', 'e2_NdUnfilled', 'e2_NfUnfilled',
       'e2_NUnfilled', 'e2_GSvolume_pa', 'e2_GSbandgap', 'e2_GSmagmom',
       'e1_avg_oxi_pos', 'e1_avg_oxi_neg', 'e2_avg_oxi_pos', 'e2_avg_oxi_neg'],
      dtype='object')


# group feature names by units

In [4]:
# Element-1 feature names, one inner list per group of features that share units.
# 'e1_Number' is left out on purpose -- I don't think we want to encode atomic
# identity; we want to be more general.
# 'e1_GSbandgap' and 'e1_GSmagmom' are currently disabled as well.
e1_feat_groups = [
    ['e1_avg_oxi_pos', 'e1_avg_oxi_neg'],
    ['e1_MendeleevNumber'],
    ['e1_Column', 'e1_Row'],
    ['e1_AtomicWeight'],
    ['e1_MeltingT'],
    ['e1_CovalentRadius'],
    [
        'e1_NsValence', 'e1_NpValence', 'e1_NdValence', 'e1_NfValence', 'e1_NValence',
        'e1_NsUnfilled', 'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled', 'e1_NUnfilled',
    ],
    ['e1_Electronegativity'],
    ['e1_GSvolume_pa'],
]

# Extend every group with the matching element-2 names: same order, 'e1' swapped for 'e2'.
feat_groups = [
    e1_names + [e1_name.replace('e1', 'e2') for e1_name in e1_names]
    for e1_names in e1_feat_groups
]

print(feat_groups)

[['e1_avg_oxi_pos', 'e1_avg_oxi_neg', 'e2_avg_oxi_pos', 'e2_avg_oxi_neg'], ['e1_MendeleevNumber', 'e2_MendeleevNumber'], ['e1_Column', 'e1_Row', 'e2_Column', 'e2_Row'], ['e1_AtomicWeight', 'e2_AtomicWeight'], ['e1_MeltingT', 'e2_MeltingT'], ['e1_CovalentRadius', 'e2_CovalentRadius'], ['e1_NsValence', 'e1_NpValence', 'e1_NdValence', 'e1_NfValence', 'e1_NValence', 'e1_NsUnfilled', 'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled', 'e1_NUnfilled', 'e2_NsValence', 'e2_NpValence', 'e2_NdValence', 'e2_NfValence', 'e2_NValence', 'e2_NsUnfilled', 'e2_NpUnfilled', 'e2_NdUnfilled', 'e2_NfUnfilled', 'e2_NUnfilled'], ['e1_Electronegativity', 'e2_Electronegativity'], ['e1_GSvolume_pa', 'e2_GSvolume_pa']]


In [5]:


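# NOTE: everything in this cell is commented out and never executed. It looks like an earlier,
# one-variable-per-group draft of the `feat_groups` list built in the previous cell.
# ptable_number_feats = ['e1_Number','e1_MendeleevNumber','e1_Column', 'e1_Row'] 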
# ptable_number_feats += [name.replace('e1','e2') for name in ptable_number_feats]
# mass_feats = ['e1_AtomicWeight']
# mass_feats += [name.replace('e1','e2') for name in mass_feats]
# temperature_feats = ['e1_MeltingT']
# temperature_feats += [name.replace('e1','e2') for name in temperature_feats]
# radii_feats = ['e1_CovalentRadius']
# radii_feats += [name.replace('e1','e2') for name in radii_feats]
# electron_number_feats = ['e1_NsValence',
#        'e1_NpValence', 'e1_NdValence', 'e1_NfValence', 'e1_NValence',
#        'e1_NsUnfilled', 'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled',
#        'e1_NUnfilled']
# electron_number_feats += [name.replace('e1','e2') for name in electron_number_feats]

# others = ['e1_Electronegativity','e1_GSbandgap', 'e1_GSmagmom',
#        'e1_SpaceGroupNumber','e1_GSvolume_pa'] 
# others += [name.replace('e1','e2') for name in others]

# build dimensionless features from pairwise combinations of feature names within each group

In [6]:
# Build dimensionless features: for every pair of features inside a unit group, compute the
# five combinations f1/f2, (f1+f2)/f1, (f1-f2)/f1, (f1+f2)/f2 and (f1-f2)/f2, collect them in a
# DataFrame together with the already-dimensionless columns, and write the result to CSV.


def _pair_labels(f1, f2):
    """Return ``(label_1, label_2, suffix)`` used to name the columns derived from a feature pair.

    When the pair is the same property on the two elements (``e1_X`` with ``e2_X``) the short
    labels ``e1`` / ``e2`` and the suffix ``_X`` are returned, giving names such as ``e1/e2_X``.
    For every other pair the full feature names are returned with an empty suffix, so the
    column name states exactly which two features were combined and stays unique.
    """
    if f1.startswith('e1_') and f2.startswith('e2_') and f1[3:] == f2[3:]:
        return 'e1', 'e2', f"_{f1[3:]}"
    return f1, f2, ''


all_dimensionless_features = []
all_dimensionless_feature_names = []

for group in feat_groups:
    # Offset of 1 for the electron-count and average-oxidation-state groups to avoid div by zero.
    # NOTE(review): the other groups get no offset, and an offset value equal to -1 would still
    # give a zero denominator; nonzero/0 yields +/-inf rather than NaN -- confirm the data range.
    offset = 1 if ('e1_NpValence' in group) or ('e1_avg_oxi_pos' in group) else 0

    # Calculate all the dimensionless versions of every feature pair in the group.
    for f1, f2 in combinations(group, 2):
        v1 = features[f1] + offset
        v2 = features[f2] + offset
        n1, n2, suffix = _pair_labels(f1, f2)

        derived_columns = [
            (f"{n1}/{n2}{suffix}", v1 / v2),
            (f"({n1}+{n2})/{n1}{suffix}", (v1 + v2) / v1),
            (f"({n1}-{n2})/{n1}{suffix}", (v1 - v2) / v1),
            (f"({n1}+{n2})/{n2}{suffix}", (v1 + v2) / v2),
            (f"({n1}-{n2})/{n2}{suffix}", (v1 - v2) / v2),
        ]
        for column_name, values in derived_columns:
            all_dimensionless_feature_names.append(column_name)
            all_dimensionless_features.append(values.to_numpy())

# Every derived column must be identifiable by name; fail loudly instead of writing
# a CSV with repeated headers.
if len(set(all_dimensionless_feature_names)) != len(all_dimensionless_feature_names):
    raise ValueError("duplicate dimensionless feature names were generated")

dimensionless_feature_array = np.vstack(all_dimensionless_features).T
dimensionless_feature_df = pd.DataFrame(data = dimensionless_feature_array,columns=all_dimensionless_feature_names)


### Merge new dimensionless features with existing dimensionless features
existing_features = ['max_ionic_char', 'avg_ionic_char', 'chg_dispro']
dimensionless_feature_df = dimensionless_feature_df.merge(features[existing_features], left_index=True, right_index=True)
# The merged frame is built from a float array, so chg_dispro is cast back to int explicitly.
dimensionless_feature_df['chg_dispro'] = dimensionless_feature_df.chg_dispro.astype(int)
dimensionless_feature_df.to_csv(f'dimensionless_{filename}')

# check for no NaNs

In [7]:
# Boolean mask of missing entries: one row per sample, one column per feature.
nan_mask = dimensionless_feature_df.isna().to_numpy()
# Column-wise NaN count.
number_of_nans_per_feature = nan_mask.sum(axis=0)

# NOTE(review): isna() does not count +/-inf (pandas default), so infinities produced by a
# division by zero would pass this check -- consider testing np.isinf as well.
print(sum(number_of_nans_per_feature))  # total number of NaNs over all features
print(number_of_nans_per_feature)       # NaNs per feature


0
[0 0 0 ... 0 0 0]
