# Generate features by composition 

In [1]:
import pandas as pd

# Read the mp and exp band-gap tables from their CSV files (mp first, then exp).
mp_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/composition_data/mp_band_gap.csv",
        "./Data/composition_data/exp_band_gap.csv",
    )
)

In [2]:
# Display the (rows, columns) shape of both tables as a pair.
mp_df.shape, exp_df.shape

((5472, 2), (1930, 2))

## Feature Engineering

### 1. Featurization based on elemental composition (基于元素组成进行特征化)

In [3]:
# Rename the raw formula column to 'composition_str' in both tables, so the
# name 'composition' is free for the column created by the conversion step.
for band_gap_df in (mp_df, exp_df):
    band_gap_df.rename(columns={'composition': 'composition_str'}, inplace=True)

In [4]:
# Preview the first rows to confirm the column rename.
mp_df.head()

Unnamed: 0,composition_str,band_gap
0,Ag8B48Cl48,2.7028
1,Ag40Te16Br12,0.8722
2,Ag30P8S32Cl6,1.2888
3,Ag4C2O6,0.4736
4,Ag12Ge6S36O126,2.8072


In [5]:
# Convert the formula strings in `composition_str` into a new `composition`
# column (name set via target_col_id); a progress bar is shown while converting.
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
mp_df = stc.featurize_dataframe(mp_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/5472 [00:00<?, ?it/s]

In [6]:
# Apply the same string-to-composition conversion to the exp table, reusing `stc`.
exp_df = stc.featurize_dataframe(exp_df, col_id="composition_str", pbar=True)

StrToComposition: 100%|██████████| 1930/1930 [00:00<00:00, 7888.94it/s] 



In [7]:
# Preview the mp table with the newly added `composition` column.
mp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Ag8B48Cl48,2.7028,"(Ag, B, Cl)"
1,Ag40Te16Br12,0.8722,"(Ag, Te, Br)"
2,Ag30P8S32Cl6,1.2888,"(Ag, P, S, Cl)"
3,Ag4C2O6,0.4736,"(Ag, C, O)"
4,Ag12Ge6S36O126,2.8072,"(Ag, Ge, S, O)"


In [8]:
# Preview the exp table with the newly added `composition` column.
exp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)"
1,Zn1In2S4,2.68,"(Zn, In, S)"
2,K2Cd3Te4,2.26,"(K, Cd, Te)"
3,In1Sb1,0.22,"(In, Sb)"
4,K2Cu1Nb1S4,2.82,"(K, Cu, Nb, S)"


In [None]:
# Composition-based features.
# The featurizer is built from the predefined 'magpie' preset (other presets
# exist, e.g. jarvis) and reads its input from the `composition` column.
# NOTE(review): matminer warns that impute_nan will default to True in a future
# release; consider passing it explicitly to keep the features reproducible.
from matminer.featurizers.composition import ElementProperty
featurizer = ElementProperty.from_preset('magpie')
mp_featurized_df = featurizer.featurize_dataframe(mp_df, col_id='composition')

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
ElementProperty: 100%|██████████| 5520/5520 [00:09<00:00, 596.61it/s]


In [12]:
# Preview the featurized mp table (original columns plus the MagpieData features).
mp_featurized_df.head()

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ag8B48Cl48,2.7028,"(Ag, B, Cl)",5.0,47.0,42.0,13.769231,8.094675,5.0,65.0,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,123.461538,54.887574,64.0
1,Ag40Te16Br12,0.8722,"(Ag, Te, Br)",35.0,52.0,17.0,46.058824,3.903114,47.0,65.0,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,179.411765,53.633218,225.0
2,Ag30P8S32Cl6,1.2888,"(Ag, P, S, Cl)",15.0,47.0,32.0,28.210526,14.833795,16.0,65.0,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,123.552632,80.090028,70.0
3,Ag4C2O6,0.4736,"(Ag, C, O)",6.0,47.0,41.0,20.666667,17.555556,8.0,65.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,113.333333,101.333333,12.0
4,Ag12Ge6S36O126,2.8072,"(Ag, Ge, S, O)",8.0,47.0,39.0,13.0,7.0,8.0,65.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,44.9,46.06,12.0


In [13]:
# Featurize the exp table with the same `featurizer` instance used for the mp
# table, then preview the result.
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')
exp_featurized_df.head()

ElementProperty: 100%|██████████| 1930/1930 [00:03<00:00, 637.67it/s]


Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Cu3Sb1Se4,0.4,"(Cu, Sb, Se)",29.0,51.0,22.0,34.25,4.1875,34.0,64.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,112.125,98.125,14.0
1,Zn1In2S4,2.68,"(Zn, In, S)",16.0,49.0,33.0,27.428571,13.061224,16.0,69.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,107.428571,42.77551,70.0
2,K2Cd3Te4,2.26,"(K, Cd, Te)",19.0,52.0,33.0,43.333333,10.814815,52.0,3.0,...,0.0,0.0,0.0,0.0,152.0,229.0,77.0,183.111111,27.654321,152.0
3,In1Sb1,0.22,"(In, Sb)",49.0,51.0,2.0,50.0,1.0,49.0,75.0,...,0.0,0.0,0.0,0.0,139.0,166.0,27.0,152.5,13.5,139.0
4,K2Cu1Nb1S4,2.82,"(K, Cu, Nb, S)",16.0,41.0,25.0,21.5,6.75,16.0,3.0,...,0.0,0.0,0.0,0.0,70.0,229.0,159.0,149.0,79.0,70.0


In [20]:
# Count how many columns of the mp table contain at least one null value.
mp_featurized_df.isnull().any().sum()

np.int64(0)

In [21]:
# Count how many columns of the exp table contain at least one null value.
exp_featurized_df.isnull().any().sum()

np.int64(0)

In [22]:
# Drop the parsed `composition` objects and give the original formula strings
# (`composition_str`) the `composition` name back.
# The guard makes this cell safe to re-run: without it, a second run would drop
# the just-renamed formula-string column and the rename would silently do nothing.
if 'composition_str' in mp_featurized_df.columns:
    mp_featurized_df = (
        mp_featurized_df
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )
mp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ag8B48Cl48,2.7028,5.0,47.0,42.0,13.769231,8.094675,5.0,65.0,94.0,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,123.461538,54.887574,64.0
1,Ag40Te16Br12,0.8722,35.0,52.0,17.0,46.058824,3.903114,47.0,65.0,95.0,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,179.411765,53.633218,225.0
2,Ag30P8S32Cl6,1.2888,15.0,47.0,32.0,28.210526,14.833795,16.0,65.0,94.0,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,123.552632,80.090028,70.0
3,Ag4C2O6,0.4736,6.0,47.0,41.0,20.666667,17.555556,8.0,65.0,87.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,113.333333,101.333333,12.0
4,Ag12Ge6S36O126,2.8072,8.0,47.0,39.0,13.0,7.0,8.0,65.0,88.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,44.9,46.06,12.0


In [23]:
# Drop the parsed `composition` objects and give the original formula strings
# (`composition_str`) the `composition` name back.
# The guard makes this cell safe to re-run: without it, a second run would drop
# the just-renamed formula-string column and the rename would silently do nothing.
if 'composition_str' in exp_featurized_df.columns:
    exp_featurized_df = (
        exp_featurized_df
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )
exp_featurized_df.head()

Unnamed: 0,composition,band_gap,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Cu3Sb1Se4,0.4,29.0,51.0,22.0,34.25,4.1875,34.0,64.0,89.0,...,0.0,0.0,0.0,0.0,14.0,225.0,211.0,112.125,98.125,14.0
1,Zn1In2S4,2.68,16.0,49.0,33.0,27.428571,13.061224,16.0,69.0,88.0,...,0.0,0.0,0.0,0.0,70.0,194.0,124.0,107.428571,42.77551,70.0
2,K2Cd3Te4,2.26,19.0,52.0,33.0,43.333333,10.814815,52.0,3.0,90.0,...,0.0,0.0,0.0,0.0,152.0,229.0,77.0,183.111111,27.654321,152.0
3,In1Sb1,0.22,49.0,51.0,2.0,50.0,1.0,49.0,75.0,85.0,...,0.0,0.0,0.0,0.0,139.0,166.0,27.0,152.5,13.5,139.0
4,K2Cu1Nb1S4,2.82,16.0,41.0,25.0,21.5,6.75,16.0,3.0,88.0,...,0.0,0.0,0.0,0.0,70.0,229.0,159.0,149.0,79.0,70.0


In [24]:
# Write both featurized tables to CSV (index omitted) in the featured_data
# folder under the current working directory, creating the folder if needed.
import os

file_path = os.path.join(os.getcwd(), "./Data/featured_data")
os.makedirs(file_path, exist_ok=True)
for feat_df, csv_name in (
    (mp_featurized_df, 'mp_elements_feat.csv'),
    (exp_featurized_df, 'exp_elements_feat.csv'),
):
    feat_df.to_csv(os.path.join(file_path, csv_name), index=False)

### 2. Adding other physical features to the MP data (MP数据加入其他物理特征)

In [None]:
# load the data
# Imported here as well so this section also runs on a fresh kernel, matching
# the matminer re-imports in the cells below.
import pandas as pd

# Load the MP table that carries the additional per-material columns.
mp_eda_df = pd.read_csv('./Data/mp_eda_data.csv')

In [2]:
# Preview the first rows of the loaded table.
mp_eda_df.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,formation_energy_per_atom,is_gap_direct,band_gap,is_oxide
0,mp-567334,Ag8 B48 Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,-0.736951,1.0,2.7028,0.0
1,mp-568392,Ag40 Te16 Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,-0.366432,0.0,0.8722,0.0
2,mp-560328,Ag30 P8 S32 Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,-0.481548,0.0,1.2888,0.0
3,mp-4691,Ag4 C2 O6,12,4,11,150.041421,6.103459,12.503452,1.709619,-6.049623,-1.089885,0.0,0.4736,1.0
4,mp-1196546,Ag12 Ge6 S36 O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,-1.599755,0.0,2.8072,1.0


In [3]:
# Rename the formula column to 'composition_str' so the name 'composition' is
# free for the column created by the conversion step.
mp_eda_df.rename(columns={'composition': 'composition_str'}, inplace=True)

In [4]:
# Convert the formula strings in `composition_str` into a new `composition`
# column (name set via target_col_id); a progress bar is shown while converting.
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
mp_eda_df = stc.featurize_dataframe(mp_eda_df, col_id="composition_str", pbar=True)

  from .autonotebook import tqdm as notebook_tqdm
StrToComposition: 100%|██████████| 5520/5520 [00:00<00:00, 5591.83it/s] 



In [5]:
# Composition-based features.
# The featurizer is built from the predefined 'magpie' preset (other presets
# exist, e.g. jarvis) and reads its input from the `composition` column.
# NOTE(review): matminer warns that impute_nan will default to True in a future
# release; consider passing it explicitly to keep the features reproducible.
from matminer.featurizers.composition import ElementProperty
featurizer = ElementProperty.from_preset('magpie')
mp_featurized_df2 = featurizer.featurize_dataframe(mp_eda_df, col_id='composition')

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
ElementProperty: 100%|██████████| 5520/5520 [00:09<00:00, 602.60it/s]



In [6]:
# Preview the featurized table (loaded columns plus the MagpieData features).
mp_featurized_df2.head()    

Unnamed: 0,material_id,composition_str,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,mp-567334,Ag8 B48 Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,123.461538,54.887574,64.0
1,mp-568392,Ag40 Te16 Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,179.411765,53.633218,225.0
2,mp-560328,Ag30 P8 S32 Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,123.552632,80.090028,70.0
3,mp-4691,Ag4 C2 O6,12,4,11,150.041421,6.103459,12.503452,1.709619,-6.049623,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,113.333333,101.333333,12.0
4,mp-1196546,Ag12 Ge6 S36 O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,44.9,46.06,12.0


In [7]:
# Drop the parsed `composition` objects and give the original formula strings
# (`composition_str`) the `composition` name back.
# The guard makes this cell safe to re-run: without it, a second run would drop
# the just-renamed formula-string column and the rename would silently do nothing.
if 'composition_str' in mp_featurized_df2.columns:
    mp_featurized_df2 = (
        mp_featurized_df2
        .drop(columns=['composition'])
        .rename(columns={'composition_str': 'composition'})
    )

# Remove the spaces inside each formula ("Ag8 B48 Cl48" -> "Ag8B48Cl48").
# Vectorised literal replacement; repeating it on already-stripped text is a no-op.
mp_featurized_df2['composition'] = mp_featurized_df2['composition'].str.replace(" ", "", regex=False)
mp_featurized_df2.head()

Unnamed: 0,material_id,composition,nsites,crystal_system,space_group,volume,density,density_atomic,efermi,energy_per_atom,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,mp-567334,Ag8B48Cl48,104,0,205,1920.521818,2.666186,18.466556,0.446,-8.609923,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,123.461538,54.887574,64.0
1,mp-568392,Ag40Te16Br12,68,3,63,1688.336988,7.194734,24.828485,2.536429,-3.067222,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,179.411765,53.633218,225.0
2,mp-560328,Ag30P8S32Cl6,76,0,220,1709.591255,4.587131,22.494622,2.458781,-4.062215,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,123.552632,80.090028,70.0
3,mp-4691,Ag4C2O6,12,4,11,150.041421,6.103459,12.503452,1.709619,-6.049623,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,113.333333,101.333333,12.0
4,mp-1196546,Ag12Ge6S36O126,180,6,147,2544.837686,3.197654,14.137987,0.457826,-6.23463,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,44.9,46.06,12.0


In [8]:
# Remove the material_id column before saving (presumably because it is an
# identifier rather than a feature -- confirm).
# NOTE: in-place drop; re-running this cell raises KeyError once the column is gone.
mp_featurized_df2.drop(columns=['material_id'], inplace=True)

In [None]:
# Save the data to a CSV file (index omitted).
# The output folder is resolved and created here rather than relying on `os`
# and `file_path` left over from an earlier cell, so this section also works
# when it is run on a fresh kernel.
import os

file_path = os.path.join(os.getcwd(), "./Data/featured_data")
os.makedirs(file_path, exist_ok=True)
mp_featurized_df2.to_csv(os.path.join(file_path, 'mp_elements_plus_feat.csv'), index=False)