# Feature Engineering
对影响mp_plus最重要的**十个特征X**进行如下操作：
- $X^2$
- $\exp(X / \max|X|)$（先除以$|X|$的最大值进行归一化，避免指数值过大）
- $log(|X|)$
- $1/X$
- $\sqrt{|X|}$

其中操作三（$\log(|X|)$）和操作四（$1/X$）需要注意的是，$X$不能为0，因此实际计算时在操作数上加上1e-6，即计算$\log(|X| + 10^{-6})$和$1/(X + 10^{-6})$，以避免出现无穷大。

对影响mp_plus最重要的**五个特征Y**进行如下操作：
- $Y_i + Y_j$ ($1 \le i < j \le 5$)
- $|Y_i - Y_j|$ ($1 \le i < j \le 5$)
- $Y_i * Y_j$ ($1 \le i < j \le 5$)
- $Y_i / (Y_j + 10^{-6})$ ($1 \le i < j \le 5$，分母加上1e-6以避免除以0)

总共增加了$10 \times 5 + 10 \times 4 = 90$个特征：10个特征各做5种变换（50个），再加上5个特征两两组合得到的$\binom{5}{2} = 10$个特征对各做4种运算（40个）。

In [12]:
# import libraries
import os
import sys
import numpy as np
import pandas as pd
# Seed NumPy's global RNG once, up front, so repeated runs draw the same numbers.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [13]:
# Working directory of the running kernel; every data and metrics path used
# later in the notebook is built relative to it.
current_path = os.getcwd()
current_path

'c:\\Users\\Fortyfour\\Desktop\\graduation_design\\New_Scripts\\Bg_pre2'

In [14]:
# Load the inputs: the DFT train/test splits and the feature-importance table.
file_dir = os.path.join(current_path, './Data')

# Read both splits with the same call; train first, then test.
dft_train, dft_test = (
    pd.read_csv(os.path.join(file_dir, split_dir, csv_name))
    for split_dir, csv_name in (
        ('rfe_train_data', 'mp_plus_train_set.csv'),
        ('rfe_test_data', 'mp_plus_test_set.csv'),
    )
)

# XGBoost feature-importance table for mp_plus (read from the metrics folder).
feat_importance_dir = os.path.join(current_path, 'metrics', 'feature_importance', 'mp_plus')
feat_importance = pd.read_csv(
    os.path.join(feat_importance_dir, 'plus_XGBoost_feature_importance.csv')
)

In [15]:
# Per-feature flag: does the training set contain any exact zero in this column?
dft_train[feat_importance['Feature']].eq(0).any()

mode ThermalConductivity       False
mode HeatVaporization          False
maximum Electronegativity      False
minimum HeatVaporization       False
minimum ThermalConductivity    False
range NfUnfilled                True
minimum GSestBCClatcnt         False
minimum AtomicVolume           False
minimum MolarVolume            False
maximum NUnfilled               True
dtype: bool

In [16]:
# Feature engineering 1: unary transforms of every feature listed in feat_importance.
def add_unary_features(frame, columns, exp_scales, eps=1e-6):
    """Add log / exp / reciprocal / square / sqrt columns to ``frame`` in place.

    For every name ``col`` in ``columns`` five columns are written:
    ``col_log``, ``col_exp``, ``col_reciprocal``, ``col_square`` and ``col_sqrt``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that already contains every column in ``columns``; it is mutated.
    columns : iterable of str
        Names of the source columns to transform.
    exp_scales : mapping of str to float
        Divisor applied to each column before ``exp`` so the exponent stays small.
        The same mapping must be passed for every split so that a given raw value
        always produces the same ``_exp`` feature.
    eps : float, default 1e-6
        Offset added to the operand of ``log`` and of the reciprocal so that an
        exact zero does not produce an infinity.
    """
    for col in columns:
        values = frame[col]
        # log(|x| + eps): eps keeps log(0) from evaluating to -inf.
        frame[col + '_log'] = np.log(np.abs(values) + eps)
        # exp of the rescaled value; rescaling keeps the exponential from blowing up.
        frame[col + '_exp'] = np.exp(values / exp_scales[col])
        # 1 / (x + eps): eps keeps an exact zero from dividing by zero.
        frame[col + '_reciprocal'] = 1 / (values + eps)
        frame[col + '_square'] = values ** 2
        # sqrt(|x|): the absolute value keeps the argument non-negative (no eps is needed).
        frame[col + '_sqrt'] = np.sqrt(np.abs(values))


unary_columns = feat_importance['Feature'].tolist()

# The exp scale is derived from the TRAINING set only and reused for the test set.
# Scaling each split by its own maximum would map the same raw value to different
# feature values in train and test.
exp_scales = {}
for col in unary_columns:
    train_abs_max = np.abs(dft_train[col]).max()
    # An all-zero column would otherwise give 0 / 0 = NaN; fall back to no scaling.
    exp_scales[col] = train_abs_max if train_abs_max != 0 else 1.0

add_unary_features(dft_train, unary_columns, exp_scales)
add_unary_features(dft_test, unary_columns, exp_scales)

In [17]:
# Preview the first rows of the training frame after the unary transforms.
dft_train.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,minimum MolarVolume_log,minimum MolarVolume_exp,minimum MolarVolume_reciprocal,minimum MolarVolume_square,minimum MolarVolume_sqrt,maximum NUnfilled_log,maximum NUnfilled_exp,maximum NUnfilled_reciprocal,maximum NUnfilled_square,maximum NUnfilled_sqrt
0,Ag2Au4F16,22,4,14,312.794323,6.941567,14.217924,-1.966392,-14.614052,-1.201868,...,2.323368,1.487307,0.097943,104.2441,3.195309,9.999995e-07,1.046503,0.999999,1.0,1.0
1,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,...,1.479329,1.186116,0.22779,19.2721,2.095233,1.609438,1.255172,0.2,25.0,2.236068
2,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,...,2.329227,1.490781,0.097371,105.4729,3.204684,0.6931477,1.095169,0.5,4.0,1.414214
3,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,...,2.329227,1.490781,0.097371,105.4729,3.204684,1.098613,1.146099,0.333333,9.0,1.732051
4,Ag8Bi4O12,24,3,34,366.571761,8.565436,15.273823,3.0267,-4.987277,-0.917311,...,2.329227,1.490781,0.097371,105.4729,3.204684,1.098613,1.146099,0.333333,9.0,1.732051


In [18]:
# Names of the first five rows of the feature-importance table, as a plain list.
important_features_five = feat_importance['Feature'].iloc[:5].tolist()
important_features_five

['mode ThermalConductivity',
 'mode HeatVaporization',
 'maximum Electronegativity',
 'minimum HeatVaporization',
 'minimum ThermalConductivity']

In [19]:
# Feature engineering 2: pairwise combinations of the selected top features.
def add_pairwise_features(frame, columns, eps=1e-6):
    """Add sum / absolute difference / product / ratio columns to ``frame`` in place.

    Every unordered pair ``(a, b)`` with ``a`` listed before ``b`` in ``columns``
    yields four columns named ``a_b_add``, ``a_b_sub``, ``a_b_mul`` and ``a_b_div``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that already contains every column in ``columns``; it is mutated.
    columns : list of str
        Source column names. Any length is accepted; fewer than two names
        simply produces no new columns.
    eps : float, default 1e-6
        Offset added to the denominator of the ratio so that an exact zero
        does not divide by zero.
    """
    for i, i_col in enumerate(columns):
        # Only partners listed after i_col, so each pair is produced exactly once.
        for j_col in columns[i + 1:]:
            prefix = i_col + '_' + j_col
            left, right = frame[i_col], frame[j_col]
            frame[prefix + '_add'] = left + right
            # The absolute value makes the difference independent of pair order.
            frame[prefix + '_sub'] = np.abs(left - right)
            frame[prefix + '_mul'] = left * right
            frame[prefix + '_div'] = left / (right + eps)


# Iterating over the list itself (instead of a fixed range of 5) avoids an
# IndexError when the importance table holds fewer than five features.
add_pairwise_features(dft_train, important_features_five)
add_pairwise_features(dft_test, important_features_five)

In [20]:
# Preview the first rows of the training frame after the pairwise features.
dft_train.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,maximum Electronegativity_minimum HeatVaporization_mul,maximum Electronegativity_minimum HeatVaporization_div,maximum Electronegativity_minimum ThermalConductivity_add,maximum Electronegativity_minimum ThermalConductivity_sub,maximum Electronegativity_minimum ThermalConductivity_mul,maximum Electronegativity_minimum ThermalConductivity_div,minimum HeatVaporization_minimum ThermalConductivity_add,minimum HeatVaporization_minimum ThermalConductivity_sub,minimum HeatVaporization_minimum ThermalConductivity_mul,minimum HeatVaporization_minimum ThermalConductivity_div
0,Ag2Au4F16,22,4,14,312.794323,6.941567,14.217924,-1.966392,-14.614052,-1.201868,...,13.0146,1.217125,4.0077,3.9523,0.110246,143.677124,3.2977,3.2423,0.090579,118.04628
1,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,...,32.232,0.309804,3.1689,3.1511,0.028124,355.01629,10.2089,10.1911,0.09078,1145.938659
2,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,...,43.808,0.2,3.08,2.84,0.3552,24.666461,14.92,14.68,1.776,123.332306
3,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,...,30.968,0.322449,3.1689,3.1511,0.028124,355.01629,9.8089,9.7911,0.08722,1100.999888
4,Ag8Bi4O12,24,3,34,366.571761,8.565436,15.273823,3.0267,-4.987277,-0.917311,...,11.7304,1.008797,3.46658,3.41342,0.091435,129.415748,3.43658,3.38342,0.090638,128.287122


In [21]:
# Preview the first rows of the test frame after the pairwise features.
dft_test.head(n=5)

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,maximum Electronegativity_minimum HeatVaporization_mul,maximum Electronegativity_minimum HeatVaporization_div,maximum Electronegativity_minimum ThermalConductivity_add,maximum Electronegativity_minimum ThermalConductivity_sub,maximum Electronegativity_minimum ThermalConductivity_mul,maximum Electronegativity_minimum ThermalConductivity_div,minimum HeatVaporization_minimum ThermalConductivity_add,minimum HeatVaporization_minimum ThermalConductivity_sub,minimum HeatVaporization_minimum ThermalConductivity_mul,minimum HeatVaporization_minimum ThermalConductivity_div
0,Ag12Ge6S36O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,-1.599755,...,11.7304,1.008797,3.46658,3.41342,0.091435,129.415748,3.43658,3.38342,0.090638,128.287122
1,Ag8H16S12O48,84,3,33,1057.273255,3.191155,12.586586,0.517819,-5.785262,-1.449207,...,1.55488,7.610603,3.46658,3.41342,0.091435,129.415748,0.47858,0.42542,0.012014,17.004627
2,Ag16P8Se24,48,3,19,1108.881842,5.79337,23.101705,2.942116,-15.94416,-0.169408,...,31.62,0.205645,2.786,2.314,0.6018,10.805039,12.636,12.164,2.9264,52.54215
3,Ag12Te6O24,42,4,15,571.396405,7.102537,13.604676,3.100366,-5.053196,-0.964478,...,11.7304,1.008797,3.46658,3.41342,0.091435,129.415748,3.43658,3.38342,0.090638,128.287122
4,Ag3As1F12,16,5,2,257.772769,4.035878,16.110798,-1.540041,-4.061584,-1.801261,...,13.0146,1.217125,4.0077,3.9523,0.110246,143.677124,3.2977,3.2423,0.090579,118.04628


In [22]:
# Save the engineered train/test frames as CSV files without the index column.
train_save_dir = os.path.join(current_path, 'Data', 'rfe_train_data')
test_save_dir = os.path.join(current_path, 'Data', 'rfe_test_data')

# Write train first, then test, each into its own split directory.
for out_frame, out_dir, out_name in (
    (dft_train, train_save_dir, 'mp_plus_engineered_train_set.csv'),
    (dft_test, test_save_dir, 'mp_plus_engineered_test_set.csv'),
):
    out_frame.to_csv(os.path.join(out_dir, out_name), index=False)