# Feature Engineering
对影响mp_plus最重要的**十个特征X**进行如下操作：
- $X^2$
- $\exp(X)$
- $\log(|X|)$
- $1/X$
- $\sqrt{|X|}$

其中操作三（$\log(|X|)$）和操作四（$1/X$）需要注意的是，$X$不能为0，因此在操作之前需要用1e-6对$X=0$的情况进行保护（例如将$X=0$的值替换为1e-6），避免结果出现无穷大；操作五（$\sqrt{|X|}$）在$X=0$处有定义，无需处理。

对影响mp_plus最重要的**五个特征Y**进行如下操作：
- $Y_i + Y_j$ ($1 \le i < j \le 5$)
- $|Y_i - Y_j|$ ($1 \le i < j \le 5$)
- $Y_i \times Y_j$ ($1 \le i < j \le 5$)
- $Y_i / Y_j$ ($1 \le i < j \le 5$)

总共增加了$10 \times 5 + 10 \times 4 = 90$个特征：10个特征$X$各做5种一元操作（50个），5个特征$Y$共有$\binom{5}{2} = 10$对组合、每对做4种二元操作（40个）。

In [19]:
# import libraries
import os
import sys
import numpy as np
import pandas as pd
# Seed NumPy's global random generator with a fixed value so reruns are reproducible
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [20]:
# Base directory for every path built below: the kernel's current working directory.
# The Data/ and metrics/ folders are resolved relative to it, so the notebook
# has to be started from the project folder that contains them.
current_path = os.getcwd()
current_path

'c:\\Users\\Fortyfour\\Desktop\\graduation_design\\New_Scripts\\Bg_pre2'

In [28]:
# Load data
# Every input lives under <working directory>/Data (same segment the save step uses).
file_dir = os.path.join(current_path, 'Data')
# DFT train / test splits for mp_plus
# NOTE(review): the loaded frames show an 'Unnamed: 0' column, which suggests the
# CSVs were written together with their index; consider index_col=0 - confirm first.
dft_train = pd.read_csv(os.path.join(file_dir, 'train_data', 'mp_plus_train_set.csv'))
dft_test = pd.read_csv(os.path.join(file_dir, 'test_data', 'mp_plus_test_set.csv'))

# Feature-importance table for mp_plus; its 'Feature' column holds the names of the
# columns that get transformed below.
# NOTE(review): presumably sorted by descending importance (the first five rows are
# later taken as the top five) - verify against the script that wrote the file.
feat_importance_dir = os.path.join(current_path, 'metrics', 'feature_importance', 'mp_plus')
feat_importance = pd.read_csv(os.path.join(feat_importance_dir, 'feature_importance.csv'))

In [29]:
# Per selected feature: does the training set contain at least one exact zero?
dft_train[feat_importance['Feature']].eq(0).any(axis=0)

MagpieData mode CovalentRadius          False
MagpieData maximum Electronegativity    False
MagpieData minimum NpUnfilled            True
MagpieData maximum NfUnfilled            True
MagpieData range Column                  True
MagpieData mode Number                  False
efermi                                  False
MagpieData maximum NUnfilled            False
MagpieData maximum MendeleevNumber      False
MagpieData avg_dev NfUnfilled            True
dtype: bool

In [31]:
# Feature engineering 1: unary transforms of every feature listed in feat_importance
def add_unary_features(df, columns, eps=1e-6):
    """Return a copy of ``df`` extended with five unary transforms per column.

    For each name ``c`` in ``columns`` the columns ``c_log``, ``c_exp``,
    ``c_reciprocal``, ``c_square`` and ``c_sqrt`` are added in that order.
    Columns that already exist under those names are overwritten, so running
    the cell again yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``columns``.
    columns : iterable of str
        Names of the numeric source columns.
    eps : float, default 1e-6
        Small constant that keeps ``log`` and ``1/x`` finite at ``x == 0``.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived columns; ``df`` itself is not modified.
    """
    derived = {}
    for col in columns:
        values = df[col]
        magnitude = np.abs(values)
        # log(|x| + eps): the offset avoids -inf when x == 0
        derived[col + '_log'] = np.log(magnitude + eps)
        derived[col + '_exp'] = np.exp(values)
        # Substitute eps only for exact zeros. Shifting every value (x + eps)
        # would divide by zero at x == -eps and distort negative inputs.
        safe_denominator = values.astype(float).mask(values == 0, eps)
        derived[col + '_reciprocal'] = 1 / safe_denominator
        derived[col + '_square'] = values ** 2
        # sqrt(|x|) is defined at x == 0, so no offset is needed
        derived[col + '_sqrt'] = np.sqrt(magnitude)
    return df.assign(**derived)


# Apply the identical transforms to both splits so their columns stay aligned
dft_train = add_unary_features(dft_train, feat_importance['Feature'])
dft_test = add_unary_features(dft_test, feat_importance['Feature'])

In [32]:
# Preview the first five training rows, including the derived unary columns
dft_train.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData maximum MendeleevNumber_log,MagpieData maximum MendeleevNumber_exp,MagpieData maximum MendeleevNumber_reciprocal,MagpieData maximum MendeleevNumber_square,MagpieData maximum MendeleevNumber_sqrt,MagpieData avg_dev NfUnfilled_log,MagpieData avg_dev NfUnfilled_exp,MagpieData avg_dev NfUnfilled_reciprocal,MagpieData avg_dev NfUnfilled_square,MagpieData avg_dev NfUnfilled_sqrt
0,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,...,4.477337,1.6516360000000001e+38,0.011364,7744.0,9.380832,-13.815511,1.0,1000000.0,0.0,0.0
1,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,...,4.49981,1.220403e+39,0.011111,8100.0,9.486833,-13.815511,1.0,1000000.0,0.0,0.0
2,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,...,4.465908,6.07603e+37,0.011494,7569.0,9.327379,-13.815511,1.0,1000000.0,0.0,0.0
3,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,...,4.465908,6.07603e+37,0.011494,7569.0,9.327379,-13.815511,1.0,1000000.0,0.0,0.0
4,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,...,4.465908,6.07603e+37,0.011494,7569.0,9.327379,-13.815511,1.0,1000000.0,0.0,0.0


In [33]:
# First five entries of the importance table; these feed the pairwise features
important_features_five = feat_importance['Feature'].iloc[:5].tolist()
important_features_five

['MagpieData mode CovalentRadius',
 'MagpieData maximum Electronegativity',
 'MagpieData minimum NpUnfilled',
 'MagpieData maximum NfUnfilled',
 'MagpieData range Column']

In [35]:
# Feature engineering 2: pairwise arithmetic between the selected features
def add_pairwise_features(df, columns, eps=1e-6):
    """Return a copy of ``df`` extended with pairwise combinations of ``columns``.

    Every unordered pair ``(a, b)`` with ``a`` listed before ``b`` contributes
    four columns, named ``a_b_add``, ``a_b_sub``, ``a_b_mul`` and ``a_b_div``:
    ``a + b``, ``|a - b|``, ``a * b`` and ``a / (b + eps)``. Any number of
    columns is accepted; fewer than two simply adds nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``columns``.
    columns : sequence of str
        Names of the numeric source columns, in pairing order.
    eps : float, default 1e-6
        Offset added to the denominator of the ratio feature.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived columns; ``df`` itself is not modified.
    """
    names = list(columns)
    derived = {}
    for pos, left in enumerate(names):
        # Pair each column only with those after it: no self-pairs, no mirrored pairs
        for right in names[pos + 1:]:
            prefix = left + '_' + right
            derived[prefix + '_add'] = df[left] + df[right]
            derived[prefix + '_sub'] = np.abs(df[left] - df[right])
            derived[prefix + '_mul'] = df[left] * df[right]
            # NOTE(review): the offset only protects against zero when the
            # denominator is non-negative; a value of exactly -eps would still
            # divide by zero - confirm the selected columns are never negative.
            derived[prefix + '_div'] = df[left] / (df[right] + eps)
    return df.assign(**derived)


# Apply the identical combinations to both splits so their columns stay aligned
dft_train = add_pairwise_features(dft_train, important_features_five)
dft_test = add_pairwise_features(dft_test, important_features_five)

In [36]:
# Preview the first five training rows, including the pairwise feature columns
dft_train.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData minimum NpUnfilled_MagpieData maximum NfUnfilled_mul,MagpieData minimum NpUnfilled_MagpieData maximum NfUnfilled_div,MagpieData minimum NpUnfilled_MagpieData range Column_add,MagpieData minimum NpUnfilled_MagpieData range Column_sub,MagpieData minimum NpUnfilled_MagpieData range Column_mul,MagpieData minimum NpUnfilled_MagpieData range Column_div,MagpieData maximum NfUnfilled_MagpieData range Column_add,MagpieData maximum NfUnfilled_MagpieData range Column_sub,MagpieData maximum NfUnfilled_MagpieData range Column_mul,MagpieData maximum NfUnfilled_MagpieData range Column_div
0,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,...,0.0,0.0,13.0,13.0,0.0,0.0,13.0,13.0,0.0,0.0
1,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,...,0.0,0.0,13.0,13.0,0.0,0.0,13.0,13.0,0.0,0.0
2,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,...,0.0,0.0,13.0,13.0,0.0,0.0,13.0,13.0,0.0,0.0
3,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,...,0.0,0.0,13.0,13.0,0.0,0.0,13.0,13.0,0.0,0.0
4,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,...,0.0,0.0,13.0,13.0,0.0,0.0,13.0,13.0,0.0,0.0


In [37]:
# Preview the first five test rows to confirm they received the same new columns
dft_test.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,MagpieData minimum NpUnfilled_MagpieData maximum NfUnfilled_mul,MagpieData minimum NpUnfilled_MagpieData maximum NfUnfilled_div,MagpieData minimum NpUnfilled_MagpieData range Column_add,MagpieData minimum NpUnfilled_MagpieData range Column_sub,MagpieData minimum NpUnfilled_MagpieData range Column_mul,MagpieData minimum NpUnfilled_MagpieData range Column_div,MagpieData maximum NfUnfilled_MagpieData range Column_add,MagpieData maximum NfUnfilled_MagpieData range Column_sub,MagpieData maximum NfUnfilled_MagpieData range Column_mul,MagpieData maximum NfUnfilled_MagpieData range Column_div
0,Ac1H3,4,0,225,49.065716,7.784734,12.266429,6.594725,-20.422506,-0.671802,...,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,0.0,0.0
1,Ac2N2,4,2,186,99.880235,8.013618,24.970059,5.276132,-40.205678,-1.322563,...,0.0,0.0,12.0,12.0,0.0,0.0,12.0,12.0,0.0,0.0
2,Ag4Ge2Pb2S8,16,3,40,364.725495,5.680462,22.795343,3.729416,-4.371751,-0.549878,...,0.0,0.0,5.0,5.0,0.0,0.0,5.0,5.0,0.0,0.0
3,Ag4Hg2S2I4,12,3,36,397.001425,5.874191,33.083452,1.311221,-2.638433,-0.441444,...,0.0,0.0,6.0,6.0,0.0,0.0,6.0,6.0,0.0,0.0
4,Ag4P4Pd2O14,24,4,15,346.044808,4.761193,14.418534,1.535796,-6.480136,-1.785767,...,0.0,0.0,6.0,6.0,0.0,0.0,6.0,6.0,0.0,0.0


In [39]:
# Save the engineered frames into the existing train/test data folders
train_save_dir = os.path.join(current_path, 'Data', 'train_data')
test_save_dir = os.path.join(current_path, 'Data', 'test_data')

# Build the target paths first, then write both CSVs without the index column
train_csv_path = os.path.join(train_save_dir, 'mp_plus_engineered_train_set.csv')
test_csv_path = os.path.join(test_save_dir, 'mp_plus_engineered_test_set.csv')
dft_train.to_csv(train_csv_path, index=False)
dft_test.to_csv(test_csv_path, index=False)