# Generate features by composition 

In [1]:
import pandas as pd

# Read the mp and exp band-gap tables from their CSV files.
mp_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/composition_data/mp_band_gap.csv",
        "./Data/composition_data/exp_band_gap.csv",
    )
)

In [2]:
# Display the (rows, columns) shape of both raw tables.
mp_df.shape, exp_df.shape

((10342, 2), (3130, 2))

## Feature Engineering

### 1. 基于元素组成进行特征化

In [3]:
# Free the name 'composition' for the parsed composition objects created later:
# the raw formula text is kept under 'composition_str'.
formula_column_map = {'composition': 'composition_str'}
mp_df = mp_df.rename(columns=formula_column_map)
exp_df = exp_df.rename(columns=formula_column_map)

In [4]:
# Preview the mp table after the column rename.
mp_df.head()

Unnamed: 0,composition_str,band_gap
0,Ac16S24,2.2962
1,Ac1Ag1Te2,0.0794
2,Ac1Cr1O3,2.0031
3,Ac1Fe1O3,0.9888
4,Ac1Ga1O3,2.8959


In [6]:
# Convert the formula text in 'composition_str' into composition objects,
# stored in a new 'composition' column (the text column is kept).
from matminer.featurizers.conversions import StrToComposition

# `stc` is reused for the exp table in the next cell.
stc = StrToComposition(target_col_id='composition')
mp_df = stc.featurize_dataframe(mp_df, col_id="composition_str", pbar=True)

  from .autonotebook import tqdm as notebook_tqdm
StrToComposition: 100%|██████████| 10342/10342 [00:01<00:00, 5251.14it/s]



In [7]:
# Apply the same string-to-composition conversion to the exp table.
exp_df = stc.featurize_dataframe(exp_df, col_id="composition_str", pbar=True)

StrToComposition: 100%|██████████| 3130/3130 [00:00<00:00, 3842.44it/s] 


In [8]:
# Preview the mp table with the new 'composition' column.
mp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Ac16S24,2.2962,"(Ac, S)"
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)"
2,Ac1Cr1O3,2.0031,"(Ac, Cr, O)"
3,Ac1Fe1O3,0.9888,"(Ac, Fe, O)"
4,Ac1Ga1O3,2.8959,"(Ac, Ga, O)"


In [9]:
# Preview the exp table with the new 'composition' column.
exp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Hg0.7Cd0.3Te1,0.35,"(Hg, Cd, Te)"
1,Lu1P1,1.3,"(Lu, P)"
2,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)"
3,Pt1Sb2,0.08,"(Pt, Sb)"
4,Zn1In2S4,2.68,"(Zn, In, S)"


In [10]:
# composition-based features
from matminer.featurizers.composition import ElementProperty

# Build the featurizer from a predefined element-property preset
# (e.g. magpie, jarvis, etc.).
# impute_nan is passed explicitly: the library warns that its default will
# become True in a future release, which would silently change the generated
# features. With False, missing element data stays NaN and is caught by the
# NaN checks later in this notebook.
# `featurizer` is reused for the exp table and the mp "plus" table below.
featurizer = ElementProperty.from_preset(preset_name='magpie', impute_nan=False)
mp_featurized_df = featurizer.featurize_dataframe(mp_df, col_id='composition')

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
ElementProperty:   0%|          | 0/10342 [00:00<?, ?it/s]

ElementProperty: 100%|██████████| 10342/10342 [00:17<00:00, 600.64it/s]


In [11]:
# Preview the featurized mp table.
mp_featurized_df.head()

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ac16S24,2.2962,"(Ac, S)",16.0,89.0,73.0,45.2,35.04,16.0,14.0,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)",47.0,89.0,42.0,60.0,14.5,52.0,14.0,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,Ac1Cr1O3,2.0031,"(Ac, Cr, O)",8.0,89.0,81.0,27.4,24.64,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,Ac1Fe1O3,0.9888,"(Ac, Fe, O)",8.0,89.0,81.0,27.8,24.48,8.0,14.0,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,Ac1Ga1O3,2.8959,"(Ac, Ga, O)",8.0,89.0,81.0,28.8,24.96,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


In [12]:
# Generate the same element-property features for the exp table, then preview it.
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')
exp_featurized_df.head()

ElementProperty: 100%|██████████| 3130/3130 [00:05<00:00, 578.59it/s]


Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Hg0.7Cd0.3Te1,0.35,"(Hg, Cd, Te)",48.0,80.0,32.0,61.2,13.16,52.0,70.0,...,0.0,0.0,0.0,0.0,152.0,194.0,42.0,163.2,11.2,152.0
1,Lu1P1,1.3,"(Lu, P)",15.0,71.0,56.0,43.0,28.0,15.0,41.0,...,0.002247,0.001124,0.001124,0.0,2.0,194.0,192.0,98.0,96.0,2.0
2,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)",29.0,51.0,22.0,34.25,4.1875,34.0,64.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,112.125,98.125,14.0
3,Pt1Sb2,0.08,"(Pt, Sb)",51.0,78.0,27.0,60.0,12.0,51.0,63.0,...,0.0,0.0,0.0,0.0,166.0,225.0,59.0,185.666667,26.222222,166.0
4,Zn1In2S4,2.68,"(Zn, In, S)",16.0,49.0,33.0,27.428571,13.061224,16.0,69.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,107.428571,42.77551,70.0


#### NaN 值处理

In [13]:
# List the mp features whose share of NaN values is above 10%.
# The mean of the boolean NaN mask is the per-column NaN fraction.
mp_nan = mp_featurized_df.isna().mean()
mp_nan = mp_nan.loc[mp_nan.gt(0.1)].index.tolist()
mp_nan

[]

In [14]:
# List the exp features whose share of NaN values is above 10%.
# (The previous comment said 20%, but the threshold applied has always been 0.1,
# the same as for the mp table.)
# The mean of the boolean NaN mask is the per-column NaN fraction.
exp_nan = exp_featurized_df.isna().mean()
exp_nan = exp_nan.loc[exp_nan.gt(0.1)].index.tolist()
exp_nan

[]

In [15]:
# Show every mp column that contains at least one NaN, with its NaN fraction.
# The result is kept as a Series (column name -> fraction), not a list.
mp_nan = mp_featurized_df.isna().mean()
mp_nan = mp_nan.loc[mp_nan.gt(0)]
mp_nan

Series([], dtype: float64)

In [16]:
# Count the rows that contain at least one NaN in each featurized table.
# Each *_na_row Series holds the per-row NaN count, restricted to rows where it is > 0.
mp_na_row = mp_featurized_df.isna().sum(axis=1).loc[lambda nan_count: nan_count > 0]
exp_na_row = exp_featurized_df.isna().sum(axis=1).loc[lambda nan_count: nan_count > 0]
len(mp_na_row), len(exp_na_row)

(0, 0)

In [17]:
# Drop every row that contains at least one NaN, then show the resulting shapes.
mp_featurized_df = mp_featurized_df.dropna(axis=0, how='any')
exp_featurized_df = exp_featurized_df.dropna(axis=0, how='any')
mp_featurized_df.shape, exp_featurized_df.shape

((10342, 135), (3130, 135))

In [18]:
# Remove the composition-object column and give the formula text its original
# column name 'composition' back.
# NOTE: not idempotent - running this cell a second time would drop the text column.
mp_featurized_df = (
    mp_featurized_df
    .drop(columns=['composition'])
    .rename(columns={'composition_str': 'composition'})
)
mp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ac16S24,2.2962,16.0,89.0,73.0,45.2,35.04,16.0,14.0,88.0,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,Ac1Ag1Te2,0.0794,47.0,89.0,42.0,60.0,14.5,52.0,14.0,90.0,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,Ac1Cr1O3,2.0031,8.0,89.0,81.0,27.4,24.64,8.0,14.0,87.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,Ac1Fe1O3,0.9888,8.0,89.0,81.0,27.8,24.48,8.0,14.0,87.0,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,Ac1Ga1O3,2.8959,8.0,89.0,81.0,28.8,24.96,8.0,14.0,87.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


In [19]:
# Remove the composition-object column and give the formula text its original
# column name 'composition' back.
# NOTE: not idempotent - running this cell a second time would drop the text column.
exp_featurized_df = (
    exp_featurized_df
    .drop(columns=['composition'])
    .rename(columns={'composition_str': 'composition'})
)
exp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Hg0.7Cd0.3Te1,0.35,48.0,80.0,32.0,61.2,13.16,52.0,70.0,90.0,...,0.0,0.0,0.0,0.0,152.0,194.0,42.0,163.2,11.2,152.0
1,Lu1P1,1.3,15.0,71.0,56.0,43.0,28.0,15.0,41.0,83.0,...,0.002247,0.001124,0.001124,0.0,2.0,194.0,192.0,98.0,96.0,2.0
2,Cu3Sb1Se4,0.4,29.0,51.0,22.0,34.25,4.1875,34.0,64.0,89.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,112.125,98.125,14.0
3,Pt1Sb2,0.08,51.0,78.0,27.0,60.0,12.0,51.0,63.0,85.0,...,0.0,0.0,0.0,0.0,166.0,225.0,59.0,185.666667,26.222222,166.0
4,Zn1In2S4,2.68,16.0,49.0,33.0,27.428571,13.061224,16.0,69.0,88.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,107.428571,42.77551,70.0


In [20]:
# Remove the 'MagpieData ' prefix from the column names of both tables.
# regex=False makes the literal match explicit, so the result does not depend on
# the pandas version's default for `regex`.
mp_featurized_df.columns = mp_featurized_df.columns.str.replace('MagpieData ', '', regex=False)
exp_featurized_df.columns = exp_featurized_df.columns.str.replace('MagpieData ', '', regex=False)
mp_featurized_df.head()

Unnamed: 0,composition,band_gap,minimum Number,maximum Number,range Number,mean Number,avg_dev Number,mode Number,minimum MendeleevNumber,maximum MendeleevNumber,...,range GSmagmom,mean GSmagmom,avg_dev GSmagmom,mode GSmagmom,minimum SpaceGroupNumber,maximum SpaceGroupNumber,range SpaceGroupNumber,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber
0,Ac16S24,2.2962,16.0,89.0,73.0,45.2,35.04,16.0,14.0,88.0,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,Ac1Ag1Te2,0.0794,47.0,89.0,42.0,60.0,14.5,52.0,14.0,90.0,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,Ac1Cr1O3,2.0031,8.0,89.0,81.0,27.4,24.64,8.0,14.0,87.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,Ac1Fe1O3,0.9888,8.0,89.0,81.0,27.8,24.48,8.0,14.0,87.0,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,Ac1Ga1O3,2.8959,8.0,89.0,81.0,28.8,24.96,8.0,14.0,87.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


In [21]:
# Preview the exp table with the shortened column names.
exp_featurized_df.head()

Unnamed: 0,composition,band_gap,minimum Number,maximum Number,range Number,mean Number,avg_dev Number,mode Number,minimum MendeleevNumber,maximum MendeleevNumber,...,range GSmagmom,mean GSmagmom,avg_dev GSmagmom,mode GSmagmom,minimum SpaceGroupNumber,maximum SpaceGroupNumber,range SpaceGroupNumber,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber
0,Hg0.7Cd0.3Te1,0.35,48.0,80.0,32.0,61.2,13.16,52.0,70.0,90.0,...,0.0,0.0,0.0,0.0,152.0,194.0,42.0,163.2,11.2,152.0
1,Lu1P1,1.3,15.0,71.0,56.0,43.0,28.0,15.0,41.0,83.0,...,0.002247,0.001124,0.001124,0.0,2.0,194.0,192.0,98.0,96.0,2.0
2,Cu3Sb1Se4,0.4,29.0,51.0,22.0,34.25,4.1875,34.0,64.0,89.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,112.125,98.125,14.0
3,Pt1Sb2,0.08,51.0,78.0,27.0,60.0,12.0,51.0,63.0,85.0,...,0.0,0.0,0.0,0.0,166.0,225.0,59.0,185.666667,26.222222,166.0
4,Zn1In2S4,2.68,16.0,49.0,33.0,27.428571,13.061224,16.0,69.0,88.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,107.428571,42.77551,70.0


### 2. MP数据加入其他物理特征

In [22]:
# Load the mp table that carries additional per-material columns.
mp_plus_df = pd.read_csv('./Data/plus_data/mp_band_gap_plus.csv')

In [23]:
# Preview the loaded "plus" table.
mp_plus_df.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,is_gap_direct,band_gap,is_oxide
0,mp-32800,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,0.0,2.2962,0.0
1,mp-867311,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,0.0,0.0794,0.0
2,mp-866101,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,0.0,2.0031,1.0
3,mp-861502,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,0.0,0.9888,1.0
4,mp-1183053,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,0.0,2.8959,1.0


In [24]:
# Keep the formula text under 'composition_str' so that 'composition' can hold
# the parsed composition objects.
mp_plus_df = mp_plus_df.rename(columns={'composition': 'composition_str'})

In [25]:
# Convert the formula text of the "plus" table into composition objects,
# stored in a new 'composition' column.
# NOTE(review): this import and the `stc` construction repeat an earlier cell.
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
mp_plus_df = stc.featurize_dataframe(mp_plus_df, col_id="composition_str", pbar=True)

StrToComposition: 100%|██████████| 10342/10342 [00:02<00:00, 3979.76it/s]


In [26]:
# composition-based features
# Reuses the `featurizer` built earlier in the notebook for the first mp table.
mp_featurized_df2 = featurizer.featurize_dataframe(mp_plus_df, col_id='composition')

ElementProperty: 100%|██████████| 10342/10342 [00:17<00:00, 579.08it/s]


In [27]:
# Preview the featurized "plus" table.
mp_featurized_df2.head()

Unnamed: 0,material_id,composition_str,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,mp-32800,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,mp-867311,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,mp-866101,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,mp-861502,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,mp-1183053,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


In [28]:
# Count the rows of the featurized "plus" table that contain at least one NaN.
# mp_na_row holds the per-row NaN count, restricted to rows where it is > 0.
mp_na_row = mp_featurized_df2.isna().sum(axis=1).loc[lambda nan_count: nan_count > 0]
len(mp_na_row)

0

In [29]:
# Drop every row that contains at least one NaN, then show the resulting shape.
mp_featurized_df2 = mp_featurized_df2.dropna(axis=0, how='any')
mp_featurized_df2.shape

(10342, 147)

In [30]:
# Rebuild the text formula column from the composition objects: stringify each
# object, strip the spaces, store the result where 'composition_str' sits, then
# drop the object column and rename the text column to 'composition'.
# NOTE(review): this overwrites the raw formula text read from the CSV, whereas
# the mp/exp tables above keep the raw text - confirm the difference is intended.
formula_text = mp_featurized_df2['composition'].map(lambda comp: str(comp).replace(" ", ""))
mp_featurized_df2 = (
    mp_featurized_df2
    .assign(composition_str=formula_text)
    .drop(columns=['composition'])
    .rename(columns={'composition_str': 'composition'})
)
mp_featurized_df2.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,mp-32800,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,mp-867311,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,mp-866101,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,mp-861502,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,mp-1183053,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


In [31]:
# Remove the 'material_id' column from the "plus" feature table.
mp_featurized_df2 = mp_featurized_df2.drop(columns=['material_id'])

In [32]:
# Remove the 'MagpieData ' prefix from the column names.
# regex=False makes the literal match explicit, so the result does not depend on
# the pandas version's default for `regex`.
mp_featurized_df2.columns = mp_featurized_df2.columns.str.replace('MagpieData ', '', regex=False)
mp_featurized_df2.head()

Unnamed: 0,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,...,range GSmagmom,mean GSmagmom,avg_dev GSmagmom,mode GSmagmom,minimum SpaceGroupNumber,maximum SpaceGroupNumber,range SpaceGroupNumber,mean SpaceGroupNumber,avg_dev SpaceGroupNumber,mode SpaceGroupNumber
0,Ac16S24,40,1,122,1118.407852,6.535149,27.960196,5.79182,-34.768478,-2.492486,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,Ac1Ag1Te2,4,0,225,122.518406,7.997421,30.629602,6.031096,-36.203183,-0.996232,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,Ac1Cr1O3,5,0,221,61.362845,8.848788,12.272569,6.364737,-8.862593,-3.138972,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
3,Ac1Fe1O3,5,0,221,61.797311,8.889999,12.359462,6.509045,-8.258555,-2.771539,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,Ac1Ga1O3,5,0,221,61.455078,9.314495,12.291016,5.476935,-7.461883,-3.063253,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,65.0,64.0,12.0


## Save Data

In [33]:
# Write the three feature tables as CSV files (row index omitted) into
# <current working directory>/./Data/featured_data, creating the folder if needed.
import os

file_path = os.path.join(os.getcwd(), "./Data/featured_data")
os.makedirs(file_path, exist_ok=True)
for csv_name, feature_table in (
    ('mp_elements_feat.csv', mp_featurized_df),
    ('exp_elements_feat.csv', exp_featurized_df),
    ('mp_elements_plus_feat.csv', mp_featurized_df2),
):
    feature_table.to_csv(os.path.join(file_path, csv_name), index=False)