# Generate features by composition 

In [2]:
import pandas as pd

# Load the composition / band-gap tables of the mp and exp datasets from CSV.
mp_df = pd.read_csv("./Data/composition_data/mp_band_gap.csv")
exp_df = pd.read_csv("./Data/composition_data/exp_band_gap.csv")

In [3]:
# Display (rows, columns) of both freshly loaded tables.
mp_df.shape, exp_df.shape

((10342, 2), (3130, 2))

## Feature Engineering

### 1. 基于元素组成进行特征化

In [4]:
# Rename the raw 'composition' column to 'composition_str' in both tables;
# the name 'composition' is reused below as the StrToComposition target column.
composition_rename = {'composition': 'composition_str'}
mp_df = mp_df.rename(columns=composition_rename)
exp_df = exp_df.rename(columns=composition_rename)

In [5]:
# Preview the first rows of the mp table after the column rename.
mp_df.head()

Unnamed: 0,composition_str,band_gap
0,Ac16S24,2.2962
1,Ac1Ag1Te2,0.0794
2,Ac1Cr1O3,2.0031
3,Ac1Fe1O3,0.9888
4,Ac1Ga1O3,2.8959


In [6]:
# Convert each 'composition_str' entry with matminer's StrToComposition and
# store the result in a new 'composition' column of the mp table.
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
mp_df = stc.featurize_dataframe(mp_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/10342 [00:00<?, ?it/s]

In [7]:
# Apply the same StrToComposition converter (`stc`) to the exp table.
exp_df = stc.featurize_dataframe(exp_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/3130 [00:00<?, ?it/s]

In [8]:
# Preview the mp table, now including the converted 'composition' column.
mp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Ac16S24,2.2962,"(Ac, S)"
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)"
2,Ac1Cr1O3,2.0031,"(Ac, Cr, O)"
3,Ac1Fe1O3,0.9888,"(Ac, Fe, O)"
4,Ac1Ga1O3,2.8959,"(Ac, Ga, O)"


In [9]:
# Preview the exp table, now including the converted 'composition' column.
exp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Hg0.7Cd0.3Te1,0.35,"(Hg, Cd, Te)"
1,Lu1P1,1.3,"(Lu, P)"
2,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)"
3,Pt1Sb2,0.08,"(Pt, Sb)"
4,Zn1In2S4,2.68,"(Zn, In, S)"


In [10]:
# Composition-based features: per-element properties aggregated over each composition.
from matminer.featurizers.composition import ElementProperty

# "BulkModulus1" is not available for some elements, so it is left out.
# 43 element properties x 6 statistics = 258 feature columns in total.
features = ["Number","AtomicRadius","AtomicVolume","AtomicWeight","MolarVolume","SpaceGroupNumber","CovalentRadius","Density",
            "n_ws^third","Column","Electronegativity","ElectronAffinity","FirstIonizationEnergy","phi",
            "DipolePolarizability","Polarizability","BoilingT","MeltingT","ThermalConductivity","LogThermalConductivity",
            "HeatVaporization","HeatFusion","FusionEnthalpy","GSbandgap","GSenergy_pa","GSestBCClatcnt","GSestFCClatcnt","GSvolume_pa",
            "GSmagmom","NUnfilled","NValence","NdUnfilled","NdValence","NfUnfilled","NfValence","NpUnfilled","NpValence","NsUnfilled",
            "NsValence","ZungerPP-r_p","ZungerPP-r_pi","ZungerPP-r_s","ZungerPP-r_sigma"]
stats = ["minimum", "maximum", "mean", "avg_dev", "range", "mode"]

# Element properties are looked up in a predefined data source (e.g. magpie, jarvis, etc.).
# impute_nan is pinned to False on purpose: matminer announces that its default
# will become True in a future release, and the NaN analysis in the following
# cells relies on missing element data surfacing as NaN rather than being imputed.
featurizer = ElementProperty(
    data_source='magpie',
    features=features,
    stats=stats,
    impute_nan=False,
)
mp_featurized_df = featurizer.featurize_dataframe(mp_df, col_id='composition')

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


ElementProperty:   0%|          | 0/10342 [00:00<?, ?it/s]

In [11]:
# Preview the featurized mp table.
mp_featurized_df.head()

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData range Number,MagpieData mode Number,MagpieData minimum AtomicRadius,...,MagpieData mean ZungerPP-r_s,MagpieData avg_dev ZungerPP-r_s,MagpieData range ZungerPP-r_s,MagpieData mode ZungerPP-r_s,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
0,Ac16S24,2.2962,"(Ac, S)",16.0,89.0,45.2,35.04,73.0,16.0,1.0,...,,,,0.54,1.1,3.12,1.908,0.9696,2.02,1.1
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)",47.0,89.0,60.0,14.5,42.0,52.0,1.4,...,,,,0.79,1.67,3.12,2.20875,0.53875,1.45,1.67
2,Ac1Cr1O3,2.0031,"(Ac, Cr, O)",8.0,89.0,27.4,24.64,81.0,8.0,0.6,...,,,,0.285,0.465,3.12,1.391,1.1112,2.655,0.465
3,Ac1Fe1O3,0.9888,"(Ac, Fe, O)",8.0,89.0,27.8,24.48,81.0,8.0,0.6,...,,,,0.285,0.465,3.12,1.325,1.032,2.655,0.465
4,Ac1Ga1O3,2.8959,"(Ac, Ga, O)",8.0,89.0,28.8,24.96,81.0,8.0,0.6,...,,,,0.285,0.465,3.12,1.242,0.9324,2.655,0.465


In [12]:
# Featurize the exp table with the same ElementProperty featurizer and preview the result.
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')
exp_featurized_df.head()

ElementProperty:   0%|          | 0/3130 [00:00<?, ?it/s]

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData range Number,MagpieData mode Number,MagpieData minimum AtomicRadius,...,MagpieData mean ZungerPP-r_s,MagpieData avg_dev ZungerPP-r_s,MagpieData range ZungerPP-r_s,MagpieData mode ZungerPP-r_s,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
0,Hg0.7Cd0.3Te1,0.35,"(Hg, Cd, Te)",48.0,80.0,61.2,13.16,32.0,52.0,1.4,...,0.91725,0.12725,0.28,0.79,1.67,2.41,2.01075,0.34075,0.74,1.67
1,Lu1P1,1.3,"(Lu, P)",15.0,71.0,43.0,28.0,56.0,15.0,1.0,...,,,,,1.24,3.37,2.305,1.065,2.13,1.24
2,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)",29.0,51.0,34.25,4.1875,22.0,34.0,1.15,...,0.74125,0.12625,0.265,0.615,1.285,2.04,1.628125,0.343125,0.755,1.285
3,Pt1Sb2,0.08,"(Pt, Sb)",51.0,78.0,60.0,12.0,27.0,51.0,1.35,...,0.966667,0.182222,0.41,0.83,1.765,2.7,2.076667,0.415556,0.935,1.765
4,Zn1In2S4,2.68,"(Zn, In, S)",16.0,49.0,27.428571,13.061224,33.0,16.0,1.0,...,0.694286,0.176327,0.4,0.54,1.1,2.05,1.482857,0.437551,0.95,1.1


#### NaN值处理

In [13]:
# List the mp feature columns in which more than 10% of the rows are NaN.
mp_nan_fraction = mp_featurized_df.isna().mean()
mp_nan = mp_nan_fraction[mp_nan_fraction > 0.1].index.tolist()
mp_nan

['MagpieData minimum n_ws^third',
 'MagpieData maximum n_ws^third',
 'MagpieData mean n_ws^third',
 'MagpieData avg_dev n_ws^third',
 'MagpieData range n_ws^third',
 'MagpieData mode n_ws^third',
 'MagpieData minimum phi',
 'MagpieData maximum phi',
 'MagpieData mean phi',
 'MagpieData avg_dev phi',
 'MagpieData range phi',
 'MagpieData mode phi',
 'MagpieData minimum ZungerPP-r_p',
 'MagpieData maximum ZungerPP-r_p',
 'MagpieData mean ZungerPP-r_p',
 'MagpieData avg_dev ZungerPP-r_p',
 'MagpieData range ZungerPP-r_p',
 'MagpieData minimum ZungerPP-r_pi',
 'MagpieData maximum ZungerPP-r_pi',
 'MagpieData mean ZungerPP-r_pi',
 'MagpieData avg_dev ZungerPP-r_pi',
 'MagpieData range ZungerPP-r_pi',
 'MagpieData minimum ZungerPP-r_s',
 'MagpieData maximum ZungerPP-r_s',
 'MagpieData mean ZungerPP-r_s',
 'MagpieData avg_dev ZungerPP-r_s',
 'MagpieData range ZungerPP-r_s']

In [14]:
# List the exp feature columns in which more than 10% of the rows are NaN
# (the threshold used is 0.1, the same as for the mp table).
exp_nan_fraction = exp_featurized_df.isna().mean()
exp_nan = exp_nan_fraction[exp_nan_fraction > 0.1].index.tolist()
exp_nan

['MagpieData minimum n_ws^third',
 'MagpieData maximum n_ws^third',
 'MagpieData mean n_ws^third',
 'MagpieData avg_dev n_ws^third',
 'MagpieData range n_ws^third',
 'MagpieData mode n_ws^third',
 'MagpieData minimum phi',
 'MagpieData maximum phi',
 'MagpieData mean phi',
 'MagpieData avg_dev phi',
 'MagpieData range phi',
 'MagpieData mode phi',
 'MagpieData minimum ZungerPP-r_p',
 'MagpieData maximum ZungerPP-r_p',
 'MagpieData mean ZungerPP-r_p',
 'MagpieData avg_dev ZungerPP-r_p',
 'MagpieData range ZungerPP-r_p',
 'MagpieData minimum ZungerPP-r_pi',
 'MagpieData maximum ZungerPP-r_pi',
 'MagpieData mean ZungerPP-r_pi',
 'MagpieData avg_dev ZungerPP-r_pi',
 'MagpieData range ZungerPP-r_pi',
 'MagpieData minimum ZungerPP-r_s',
 'MagpieData maximum ZungerPP-r_s',
 'MagpieData mean ZungerPP-r_s',
 'MagpieData avg_dev ZungerPP-r_s',
 'MagpieData range ZungerPP-r_s']

In [15]:
# n_ws^third / phi / ZungerPP-r_p / ZungerPP-r_pi / ZungerPP-r_s have more than
# 10% NaN values, so these 5 properties are dropped and the features are re-extracted.
# Composition-based features:
from matminer.featurizers.composition import ElementProperty

# "BulkModulus1" is not available for some elements, so it is left out as well.
# 38 element properties x 6 statistics = 228 feature columns in total.
features = ["Number","AtomicRadius","AtomicVolume","AtomicWeight","MolarVolume","SpaceGroupNumber","CovalentRadius","Density",
            "Column","Electronegativity","ElectronAffinity","FirstIonizationEnergy",
            "DipolePolarizability","Polarizability","BoilingT","MeltingT","ThermalConductivity","LogThermalConductivity",
            "HeatVaporization","HeatFusion","FusionEnthalpy","GSbandgap","GSenergy_pa","GSestBCClatcnt","GSestFCClatcnt","GSvolume_pa",
            "GSmagmom","NUnfilled","NValence","NdUnfilled","NdValence","NfUnfilled","NfValence","NpUnfilled","NpValence","NsUnfilled",
            "NsValence","ZungerPP-r_sigma"]
stats = ["minimum", "maximum", "mean", "avg_dev", "range", "mode"]

# Element properties are looked up in a predefined data source (e.g. magpie, jarvis, etc.).
# impute_nan is pinned to False on purpose: matminer announces that its default
# will become True in a future release, and the row-wise NaN filtering below
# relies on missing element data surfacing as NaN rather than being imputed.
featurizer = ElementProperty(
    data_source='magpie',
    features=features,
    stats=stats,
    impute_nan=False,
)
mp_featurized_df = featurizer.featurize_dataframe(mp_df, col_id='composition')
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')

# Sanity check: featurization appends one column per (property, statistic) pair.
expected_new_cols = len(features) * len(stats)
assert mp_featurized_df.shape[1] == mp_df.shape[1] + expected_new_cols
assert exp_featurized_df.shape[1] == exp_df.shape[1] + expected_new_cols

mp_featurized_df.shape, exp_featurized_df.shape

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


ElementProperty:   0%|          | 0/10342 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/3130 [00:00<?, ?it/s]

((10342, 231), (3130, 231))

In [16]:
# Fraction of NaN entries per column of the mp table, restricted to the
# columns that contain at least one NaN.
mp_nan = mp_featurized_df.isna().mean().loc[lambda frac: frac > 0]
mp_nan

MagpieData minimum AtomicRadius        0.005898
MagpieData maximum AtomicRadius        0.005898
MagpieData mean AtomicRadius           0.005898
MagpieData avg_dev AtomicRadius        0.005898
MagpieData range AtomicRadius          0.005898
MagpieData minimum ElectronAffinity    0.042738
MagpieData maximum ElectronAffinity    0.042738
MagpieData mean ElectronAffinity       0.042738
MagpieData avg_dev ElectronAffinity    0.042738
MagpieData range ElectronAffinity      0.042738
MagpieData mode ElectronAffinity       0.001644
MagpieData minimum HeatFusion          0.000774
MagpieData maximum HeatFusion          0.000774
MagpieData mean HeatFusion             0.000774
MagpieData avg_dev HeatFusion          0.000774
MagpieData range HeatFusion            0.000774
MagpieData minimum FusionEnthalpy      0.000774
MagpieData maximum FusionEnthalpy      0.000774
MagpieData mean FusionEnthalpy         0.000774
MagpieData avg_dev FusionEnthalpy      0.000774
MagpieData range FusionEnthalpy        0

In [17]:
# Count the rows that contain at least one NaN, for the mp and exp tables.
mp_na_row = mp_featurized_df.isna().sum(axis=1).loc[lambda counts: counts > 0]
exp_na_row = exp_featurized_df.isna().sum(axis=1).loc[lambda counts: counts > 0]
len(mp_na_row), len(exp_na_row)

(510, 21)

In [18]:
# Discard every row that has a NaN in any column, in both tables.
mp_featurized_df = mp_featurized_df.dropna(axis=0, how="any")
exp_featurized_df = exp_featurized_df.dropna(axis=0, how="any")
mp_featurized_df.shape, exp_featurized_df.shape

((9832, 231), (3109, 231))

In [19]:
# Re-check: total number of NaN entries left in each table.
tuple(frame.isna().sum().sum() for frame in (mp_featurized_df, exp_featurized_df))

(np.int64(0), np.int64(0))

In [20]:
# Replace the converted 'composition' column by the original formula string:
# drop 'composition', then rename 'composition_str' -> 'composition'.
# Done as a guarded, non-in-place chain so that re-running this cell is a
# no-op; the in-place drop + rename would delete the freshly renamed string
# column on a second run and raise KeyError on a third.
if 'composition_str' in mp_featurized_df.columns:
    mp_featurized_df = (
        mp_featurized_df
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )
mp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData range Number,MagpieData mode Number,MagpieData minimum AtomicRadius,MagpieData maximum AtomicRadius,...,MagpieData mean NsValence,MagpieData avg_dev NsValence,MagpieData range NsValence,MagpieData mode NsValence,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
11,Ag2Au4F16,0.2585,9.0,79.0,25.181818,23.53719,70.0,9.0,0.5,1.6,...,1.727273,0.396694,1.0,2.0,0.405,2.66,0.994091,0.85686,2.255,0.405
12,Ag8B48Cl48,2.7028,5.0,47.0,13.769231,8.094675,42.0,5.0,0.85,1.6,...,1.923077,0.142012,1.0,2.0,0.795,2.375,1.015769,0.209112,1.58,0.795
13,Ag40Te16Br12,0.8722,35.0,52.0,46.058824,3.903114,17.0,47.0,1.15,1.6,...,1.411765,0.484429,1.0,1.0,1.2,2.375,2.001765,0.4391,1.175,2.375
14,Ag30P8S32Cl6,1.2888,15.0,47.0,28.210526,14.833795,32.0,16.0,1.0,1.6,...,1.605263,0.477839,1.0,2.0,1.01,2.375,1.610921,0.60322,1.365,1.1
15,Ag8Bi4O12,0.3872,8.0,83.0,33.5,25.5,75.0,8.0,0.6,1.6,...,1.666667,0.444444,1.0,2.0,0.465,2.375,1.357,0.892,1.91,0.465


In [21]:
# Replace the converted 'composition' column by the original formula string:
# drop 'composition', then rename 'composition_str' -> 'composition'.
# Done as a guarded, non-in-place chain so that re-running this cell is a
# no-op; the in-place drop + rename would delete the freshly renamed string
# column on a second run and raise KeyError on a third.
if 'composition_str' in exp_featurized_df.columns:
    exp_featurized_df = (
        exp_featurized_df
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )
exp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData range Number,MagpieData mode Number,MagpieData minimum AtomicRadius,MagpieData maximum AtomicRadius,...,MagpieData mean NsValence,MagpieData avg_dev NsValence,MagpieData range NsValence,MagpieData mode NsValence,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
0,Hg0.7Cd0.3Te1,0.35,48.0,80.0,61.2,13.16,32.0,52.0,1.4,1.55,...,2.0,0.0,0.0,2.0,1.67,2.41,2.01075,0.34075,0.74,1.67
1,Lu1P1,1.3,15.0,71.0,43.0,28.0,56.0,15.0,1.0,1.75,...,2.0,0.0,0.0,2.0,1.24,3.37,2.305,1.065,2.13,1.24
2,Cu3Sb1Se4,0.4,29.0,51.0,34.25,4.1875,22.0,34.0,1.15,1.45,...,1.625,0.46875,1.0,2.0,1.285,2.04,1.628125,0.343125,0.755,1.285
3,Pt1Sb2,0.08,51.0,78.0,60.0,12.0,27.0,51.0,1.35,1.45,...,1.666667,0.444444,1.0,2.0,1.765,2.7,2.076667,0.415556,0.935,1.765
4,Zn1In2S4,2.68,16.0,49.0,27.428571,13.061224,33.0,16.0,1.0,1.55,...,2.0,0.0,0.0,2.0,1.1,2.05,1.482857,0.437551,0.95,1.1


In [22]:
# Strip the "MagpieData " prefix from the column labels of both feature tables.
magpie_prefix = 'MagpieData '
mp_featurized_df.columns = [
    label.replace(magpie_prefix, '') for label in mp_featurized_df.columns
]
exp_featurized_df.columns = [
    label.replace(magpie_prefix, '') for label in exp_featurized_df.columns
]
mp_featurized_df.head()

Unnamed: 0,composition,band_gap,minimum Number,maximum Number,mean Number,avg_dev Number,range Number,mode Number,minimum AtomicRadius,maximum AtomicRadius,...,mean NsValence,avg_dev NsValence,range NsValence,mode NsValence,minimum ZungerPP-r_sigma,maximum ZungerPP-r_sigma,mean ZungerPP-r_sigma,avg_dev ZungerPP-r_sigma,range ZungerPP-r_sigma,mode ZungerPP-r_sigma
11,Ag2Au4F16,0.2585,9.0,79.0,25.181818,23.53719,70.0,9.0,0.5,1.6,...,1.727273,0.396694,1.0,2.0,0.405,2.66,0.994091,0.85686,2.255,0.405
12,Ag8B48Cl48,2.7028,5.0,47.0,13.769231,8.094675,42.0,5.0,0.85,1.6,...,1.923077,0.142012,1.0,2.0,0.795,2.375,1.015769,0.209112,1.58,0.795
13,Ag40Te16Br12,0.8722,35.0,52.0,46.058824,3.903114,17.0,47.0,1.15,1.6,...,1.411765,0.484429,1.0,1.0,1.2,2.375,2.001765,0.4391,1.175,2.375
14,Ag30P8S32Cl6,1.2888,15.0,47.0,28.210526,14.833795,32.0,16.0,1.0,1.6,...,1.605263,0.477839,1.0,2.0,1.01,2.375,1.610921,0.60322,1.365,1.1
15,Ag8Bi4O12,0.3872,8.0,83.0,33.5,25.5,75.0,8.0,0.6,1.6,...,1.666667,0.444444,1.0,2.0,0.465,2.375,1.357,0.892,1.91,0.465


In [23]:
# Preview the exp feature table after the prefix has been removed from the column labels.
exp_featurized_df.head()

Unnamed: 0,composition,band_gap,minimum Number,maximum Number,mean Number,avg_dev Number,range Number,mode Number,minimum AtomicRadius,maximum AtomicRadius,...,mean NsValence,avg_dev NsValence,range NsValence,mode NsValence,minimum ZungerPP-r_sigma,maximum ZungerPP-r_sigma,mean ZungerPP-r_sigma,avg_dev ZungerPP-r_sigma,range ZungerPP-r_sigma,mode ZungerPP-r_sigma
0,Hg0.7Cd0.3Te1,0.35,48.0,80.0,61.2,13.16,32.0,52.0,1.4,1.55,...,2.0,0.0,0.0,2.0,1.67,2.41,2.01075,0.34075,0.74,1.67
1,Lu1P1,1.3,15.0,71.0,43.0,28.0,56.0,15.0,1.0,1.75,...,2.0,0.0,0.0,2.0,1.24,3.37,2.305,1.065,2.13,1.24
2,Cu3Sb1Se4,0.4,29.0,51.0,34.25,4.1875,22.0,34.0,1.15,1.45,...,1.625,0.46875,1.0,2.0,1.285,2.04,1.628125,0.343125,0.755,1.285
3,Pt1Sb2,0.08,51.0,78.0,60.0,12.0,27.0,51.0,1.35,1.45,...,1.666667,0.444444,1.0,2.0,1.765,2.7,2.076667,0.415556,0.935,1.765
4,Zn1In2S4,2.68,16.0,49.0,27.428571,13.061224,33.0,16.0,1.0,1.55,...,2.0,0.0,0.0,2.0,1.1,2.05,1.482857,0.437551,0.95,1.1


### 2. MP数据加入其他物理特征

In [24]:
# Load the extended mp table (composition plus additional per-material columns) from CSV.
mp_plus_df = pd.read_csv('./Data/plus_data/mp_band_gap_plus.csv')

In [25]:
# Preview the first rows of the extended mp table.
mp_plus_df.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,is_gap_direct,band_gap,is_oxide
0,mp-32800,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,0.0,2.2962,0.0
1,mp-867311,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,0.0,0.0794,0.0
2,mp-866101,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,0.0,2.0031,1.0
3,mp-861502,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,0.0,0.9888,1.0
4,mp-1183053,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,0.0,2.8959,1.0


In [26]:
# Rename the raw 'composition' column of the extended mp table to 'composition_str'.
mp_plus_df = mp_plus_df.rename(columns={'composition': 'composition_str'})

In [27]:
# Convert the 'composition_str' entries of the extended mp table with
# StrToComposition and store the result in a new 'composition' column.
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
mp_plus_df = stc.featurize_dataframe(mp_plus_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/10342 [00:00<?, ?it/s]

In [28]:
# Composition-based features for the extended mp table, computed with the
# `featurizer` object that was created earlier in this notebook.
mp_featurized_df2 = featurizer.featurize_dataframe(mp_plus_df, col_id='composition')

ElementProperty:   0%|          | 0/10342 [00:00<?, ?it/s]

In [29]:
# Preview the featurized extended mp table.
mp_featurized_df2.head()    

Unnamed: 0,material_id,composition_str,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData mean NsValence,MagpieData avg_dev NsValence,MagpieData range NsValence,MagpieData mode NsValence,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
0,mp-32800,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,...,2.0,0.0,0.0,2.0,1.1,3.12,1.908,0.9696,2.02,1.1
1,mp-867311,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,...,1.75,0.375,1.0,2.0,1.67,3.12,2.20875,0.53875,1.45,1.67
2,mp-866101,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,...,1.8,0.32,1.0,2.0,0.465,3.12,1.391,1.1112,2.655,0.465
3,mp-861502,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,...,2.0,0.0,0.0,2.0,0.465,3.12,1.325,1.032,2.655,0.465
4,mp-1183053,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,...,2.0,0.0,0.0,2.0,0.465,3.12,1.242,0.9324,2.655,0.465


In [30]:
# Count the rows of the extended mp feature table that contain at least one NaN.
mp_na_row = mp_featurized_df2.isna().sum(axis=1).loc[lambda counts: counts > 0]
len(mp_na_row)

510

In [31]:
# Discard every row of the extended mp feature table that has a NaN in any column.
mp_featurized_df2 = mp_featurized_df2.dropna(axis=0, how="any")
mp_featurized_df2.shape

(9832, 243)

In [32]:
# Write the space-free string form of each converted composition into the
# 'composition_str' slot (which keeps its column position), drop the converted
# 'composition' column, then rename 'composition_str' -> 'composition'.
# Built as one guarded, non-in-place chain so that re-running the cell is a
# no-op; the in-place version re-created 'composition_str' at the end of the
# frame on a second run, which silently reordered the columns.
if 'composition_str' in mp_featurized_df2.columns:
    mp_featurized_df2 = (
        mp_featurized_df2
        .assign(
            composition_str=lambda df: df['composition'].apply(
                lambda x: str(x).replace(" ", "")
            )
        )
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )
mp_featurized_df2.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData mean NsValence,MagpieData avg_dev NsValence,MagpieData range NsValence,MagpieData mode NsValence,MagpieData minimum ZungerPP-r_sigma,MagpieData maximum ZungerPP-r_sigma,MagpieData mean ZungerPP-r_sigma,MagpieData avg_dev ZungerPP-r_sigma,MagpieData range ZungerPP-r_sigma,MagpieData mode ZungerPP-r_sigma
11,mp-18125,Ag2Au4F16,22,4,14,312.794323,6.941567,14.217924,-1.966392,-14.614052,...,1.727273,0.396694,1.0,2.0,0.405,2.66,0.994091,0.85686,2.255,0.405
12,mp-567334,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,...,1.923077,0.142012,1.0,2.0,0.795,2.375,1.015769,0.209112,1.58,0.795
13,mp-568392,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,...,1.411765,0.484429,1.0,1.0,1.2,2.375,2.001765,0.4391,1.175,2.375
14,mp-560328,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,...,1.605263,0.477839,1.0,2.0,1.01,2.375,1.610921,0.60322,1.365,1.1
15,mp-23558,Ag8Bi4O12,24,3,34,366.571761,8.565436,15.273823,3.0267,-4.987277,...,1.666667,0.444444,1.0,2.0,0.465,2.375,1.357,0.892,1.91,0.465


In [33]:
# Remove the 'material_id' column from the extended mp feature table.
# Non-in-place with errors='ignore' so that re-running the cell does not raise
# KeyError once the column is already gone.
mp_featurized_df2 = mp_featurized_df2.drop(columns=['material_id'], errors='ignore')

In [34]:
# Strip the "MagpieData " prefix from the column labels of the extended mp feature table.
mp_featurized_df2.columns = [
    label.replace('MagpieData ', '') for label in mp_featurized_df2.columns
]
mp_featurized_df2.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,mean NsValence,avg_dev NsValence,range NsValence,mode NsValence,minimum ZungerPP-r_sigma,maximum ZungerPP-r_sigma,mean ZungerPP-r_sigma,avg_dev ZungerPP-r_sigma,range ZungerPP-r_sigma,mode ZungerPP-r_sigma
11,Ag2Au4F16,22,4,14,312.794323,6.941567,14.217924,-1.966392,-14.614052,-1.201868,...,1.727273,0.396694,1.0,2.0,0.405,2.66,0.994091,0.85686,2.255,0.405
12,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,...,1.923077,0.142012,1.0,2.0,0.795,2.375,1.015769,0.209112,1.58,0.795
13,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,...,1.411765,0.484429,1.0,1.0,1.2,2.375,2.001765,0.4391,1.175,2.375
14,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,...,1.605263,0.477839,1.0,2.0,1.01,2.375,1.610921,0.60322,1.365,1.1
15,Ag8Bi4O12,24,3,34,366.571761,8.565436,15.273823,3.0267,-4.987277,-0.917311,...,1.666667,0.444444,1.0,2.0,0.465,2.375,1.357,0.892,1.91,0.465


## Save Data

In [None]:
# Write the three feature tables as CSV files (without the index) into
# ./Data/featured_data under the current working directory, creating the
# directory if it does not exist yet.
import os

file_path = os.path.join(os.getcwd(), "./Data/featured_data")
os.makedirs(file_path, exist_ok=True)
for csv_name, frame in (
    ('mp_elements_feat.csv', mp_featurized_df),
    ('exp_elements_feat.csv', exp_featurized_df),
    ('mp_elements_plus_feat.csv', mp_featurized_df2),
):
    frame.to_csv(os.path.join(file_path, csv_name), index=False)