In [None]:
import pandas as pd
import numpy as np
from itertools import combinations

# read in features

In [None]:
# Name of the input CSV; the same name is reused further down to build the
# output file name (f'dimensionless_{filename}').
filename = 'cropped_final_feature_array.csv'
# Load the full feature table into a DataFrame.
features = pd.read_csv(filename)
# Print the column labels so the available feature names can be checked.
print(features.columns)

# group feature names by units

In [None]:
# Element-1 feature names, one inner list per group of quantities that share
# units. Only features inside the same group are later combined into ratios.
# 'e1_Number' is intentionally left out: encoding atomic identity would make
# the features less general.
e1_feature_groups = [
    ['e1_avg_oxi_pos', 'e1_avg_oxi_neg'],
    ['e1_MendeleevNumber'],
    ['e1_Column', 'e1_Row'],
    ['e1_AtomicWeight'],
    ['e1_MeltingT'],
    ['e1_CovalentRadius'],
    [
        'e1_NsValence', 'e1_NpValence', 'e1_NdValence', 'e1_NfValence', 'e1_NValence',
        'e1_NsUnfilled', 'e1_NpUnfilled', 'e1_NdUnfilled', 'e1_NfUnfilled', 'e1_NUnfilled',
    ],
    ['e1_Electronegativity'],
    # ['e1_GSbandgap'],  # currently disabled
    # ['e1_GSmagmom'],   # currently disabled
    ['e1_GSvolume_pa'],
]

# Each final group holds the e1 names followed by the matching e2 names, which
# are derived by swapping the 'e1' token for 'e2'.
feat_groups = [
    e1_names + [e1_name.replace('e1','e2') for e1_name in e1_names]
    for e1_names in e1_feature_groups
]

print(feat_groups)

# get pairwise combinations of feature names per group

In [None]:
def _ratio_features(f1, f2, lhs, rhs):
    """Return the five (column name, values) pairs derived from one feature pair.

    f1, f2   -- names of the two source features (used only to label columns).
    lhs, rhs -- the (already offset) values of f1 and f2 respectively.

    The order of the returned pairs fixes the column order of the final table.
    """
    return [
        (f"{f1}/{f2}", lhs / rhs),
        (f"({f1}+{f2})/{f1}", (lhs + rhs) / lhs),
        (f"({f1}-{f2})/{f1}", (lhs - rhs) / lhs),
        (f"({f1}+{f2})/{f2}", (lhs + rhs) / rhs),
        (f"({f1}-{f2})/{f2}", (lhs - rhs) / rhs),
    ]


all_dimensionless_features = []
all_dimensionless_feature_names = []

for group in feat_groups:
    # Every unordered pair of same-unit features in this group.
    pairs_of_features = list(combinations(group,2))
    print(pairs_of_features)

    # Electron-count features can be zero, so both operands are shifted by 1 to
    # avoid dividing by zero; the oxidation-state group gets the same shift.
    # NOTE(review): if any oxidation-state value equals -1 the shifted
    # denominator becomes 0 -- confirm against the data.
    shift = 1 if any(key in group for key in ('e1_NpValence', 'e1_avg_oxi_pos')) else 0

    for f1, f2 in pairs_of_features:
        lhs = features[f1] + shift
        rhs = features[f2] + shift
        for column_name, values in _ratio_features(f1, f2, lhs, rhs):
            all_dimensionless_features.append(values.to_numpy())
            all_dimensionless_feature_names.append(column_name)

# Stack the per-feature 1-D arrays as rows, then transpose so that each derived
# feature becomes one column of the table.
dimensionless_feature_array = np.vstack(all_dimensionless_features).T
dimensionless_feature_df = pd.DataFrame(
    data=dimensionless_feature_array,
    columns=all_dimensionless_feature_names,
)

In [None]:
# Show the assembled dimensionless-feature table as this cell's output.
dimensionless_feature_df

In [None]:
# List the generated column names to check the naming of the derived features.
dimensionless_feature_df.columns

In [None]:
### Merge new dimensionless features with existing dimensionless features
# Columns of the input table that are carried over unchanged.
existing_features = ['max_ionic_char', 'avg_ionic_char', 'chg_dispro']

# Drop any copies of these columns left by an earlier run of this cell before
# merging. Without this, re-running the cell merges them a second time, pandas
# renames the duplicates with _x/_y suffixes, and the 'chg_dispro' lookup below
# fails. On a first run nothing is dropped and the result is unchanged.
dimensionless_feature_df = (
    dimensionless_feature_df
    .drop(columns=existing_features, errors='ignore')
    .merge(features[existing_features], left_index=True, right_index=True)
)

# Store the chg_dispro column as integers in the output table.
dimensionless_feature_df['chg_dispro'] = dimensionless_feature_df['chg_dispro'].astype(int)

# Write the combined table next to the input, prefixed with 'dimensionless_'.
dimensionless_feature_df.to_csv(f'dimensionless_{filename}')

# check for no NaNs

In [None]:
# Count missing values column by column. isna() flags NaN only, so +/-inf
# values (e.g. from a non-zero value divided by zero) are not counted here.
nan_mask = dimensionless_feature_df.isna().to_numpy()
number_of_nans_per_feature = nan_mask.sum(axis=0)

# Total first (expected to be 0), then the per-column breakdown.
print(number_of_nans_per_feature.sum())
print(number_of_nans_per_feature)
