# Featurization

In [27]:
# Load the cleaned dataset from disk into a DataFrame
# (per the file name, outliers were presumably removed upstream).
import pandas as pd

data = pd.read_csv(filepath_or_buffer='./data/cleaned_data_without_outliers.csv')

In [28]:
# Peek at the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,material_id,formula_pretty,composition,crystal_system,space_group,point_group,band_gap,volume,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic
0,mp-28967,Ba(PdS2)2,Ba2 Pd4 S8,4,11,2/m,0.7792,316.341404,5.022717,22.595815,-1.124079,17.048334,7.118969,9.929365
1,mp-766094,NbO2F,Nb4 O8 F4,3,19,222,2.898,253.915299,3.764366,15.869706,-3.099174,17.57201,13.469477,4.102533
2,mp-36577,Sr(AsS2)2,Sr1 As2 S4,5,1,1,1.7212,196.220495,3.094976,28.031499,-0.7661,18.488667,12.443616,6.045051
3,mp-1102092,NaFe(SO4)2,Na4 Fe4 S8 O32,4,12,-1,2.0944,620.336826,2.90126,12.923684,-1.948264,9.596025,6.499905,3.09612
4,mp-720391,BH4NF4,B4 H16 N4 F16,3,62,mmm,7.4812,374.200384,1.860992,9.35501,-1.970766,6.216546,4.350314,1.866232


In [29]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(7262, 14)

In [30]:
# Rename the raw string column so that the name 'composition' is free for the
# parsed composition column created later. The result is rebound instead of
# using inplace=True, so the frame is never mutated in place across cells.
data = data.rename(columns={'composition': 'composition_str'})

In [31]:
# Convert the string column 'composition_str' into composition objects,
# stored in a new column named 'composition' (the name given by target_col_id).
from matminer.featurizers.conversions import StrToComposition

stc = StrToComposition(target_col_id='composition')
# pbar=True displays a progress bar while the rows are converted
data = stc.featurize_dataframe(data, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/7262 [00:00<?, ?it/s]

In [32]:
# Inspect the frame again: 'composition' should now appear as the last column
data.head(n=5)

Unnamed: 0,material_id,formula_pretty,composition_str,crystal_system,space_group,point_group,band_gap,volume,density,density_atomic,formation_energy_per_atom,e_total,e_ionic,e_electronic,composition
0,mp-28967,Ba(PdS2)2,Ba2 Pd4 S8,4,11,2/m,0.7792,316.341404,5.022717,22.595815,-1.124079,17.048334,7.118969,9.929365,"(Ba, Pd, S)"
1,mp-766094,NbO2F,Nb4 O8 F4,3,19,222,2.898,253.915299,3.764366,15.869706,-3.099174,17.57201,13.469477,4.102533,"(Nb, O, F)"
2,mp-36577,Sr(AsS2)2,Sr1 As2 S4,5,1,1,1.7212,196.220495,3.094976,28.031499,-0.7661,18.488667,12.443616,6.045051,"(Sr, As, S)"
3,mp-1102092,NaFe(SO4)2,Na4 Fe4 S8 O32,4,12,-1,2.0944,620.336826,2.90126,12.923684,-1.948264,9.596025,6.499905,3.09612,"(Na, Fe, S, O)"
4,mp-720391,BH4NF4,B4 H16 N4 F16,3,62,mmm,7.4812,374.200384,1.860992,9.35501,-1.970766,6.216546,4.350314,1.866232,"(B, H, N, F)"


In [33]:
# Show the string form of the first parsed composition. The column is selected
# by name rather than by position (-1), so the check stays valid even if more
# columns are appended to the frame later.
str(data['composition'].iloc[0])

'Ba2 Pd4 S8'

In [34]:
# 1. Composition-based features
from matminer.featurizers.composition import ElementProperty

# Build the featurizer from the predefined 'magpie' preset
# (features are extracted from a predefined database).
featurizer = ElementProperty.from_preset('magpie')
# Compute the features from the 'composition' column; the result carries the
# original columns plus the generated 'MagpieData ...' columns.
data_featurized = featurizer.featurize_dataframe(data, col_id='composition')

ElementProperty:   0%|          | 0/7262 [00:00<?, ?it/s]

In [35]:
# Preview the featurized frame (original columns followed by 'MagpieData ...' columns)
data_featurized.head(n=5)

Unnamed: 0,material_id,formula_pretty,composition_str,crystal_system,space_group,point_group,band_gap,volume,density,density_atomic,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,mp-28967,Ba(PdS2)2,Ba2 Pd4 S8,4,11,2/m,0.7792,316.341404,5.022717,22.595815,...,0.0,0.0,0.0,0.0,70.0,229.0,159.0,137.0,76.571429,70.0
1,mp-766094,NbO2F,Nb4 O8 F4,3,19,222,2.898,253.915299,3.764366,15.869706,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,67.0,81.0,12.0
2,mp-36577,Sr(AsS2)2,Sr1 As2 S4,5,1,1,1.7212,196.220495,3.094976,28.031499,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,119.571429,56.653061,70.0
3,mp-1102092,NaFe(SO4)2,Na4 Fe4 S8 O32,4,12,-1,2.0944,620.336826,2.90126,12.923684,...,2.110663,0.175889,0.322462,0.0,12.0,229.0,217.0,57.833333,61.111111,12.0
4,mp-720391,BH4NF4,B4 H16 N4 F16,3,62,mmm,7.4812,374.200384,1.860992,9.35501,...,0.0,0.0,0.0,0.0,15.0,194.0,179.0,119.6,83.68,15.0


#### The features based on physical knowledge:
- Atomic type --- composition
- Crystal structure --- `crystal_system`, `space_group`; e.g. `symmetry=SymmetryData(crystal_system=<CrystalSystem.trig: 'Trigonal'>, symbol='P3_121', number=152, point_group='32', symprec=0.1, version='1.16.5')`
- band_gap
- Atomic density per unit volume, ionic pair density per unit volume
      --- density + density_atomic + volume
- Electronic polarizability $\alpha_e$
  - Electron cloud radius $r_0$ --- 'MagpieData minimum CovalentRadius', 'MagpieData maximum CovalentRadius', 'MagpieData range CovalentRadius', 'MagpieData mean CovalentRadius'
  - Number of electrons --- 'MagpieData minimum Number', 'MagpieData maximum Number', 'MagpieData range Number', 'MagpieData mean Number'
- Covalent electron polarization
  - electronegativity --- 'MagpieData minimum Electronegativity', 'MagpieData maximum Electronegativity', 'MagpieData range Electronegativity', 'MagpieData mean Electronegativity'
  - Number of electrons

In [36]:
# Columns taken directly from the dataset (structure, band gap, densities, energy).
base_features = [
    'crystal_system',
    'space_group',
    'band_gap',
    'volume',
    'density',
    'density_atomic',
    'formation_energy_per_atom',
]

# Magpie descriptors: four statistics for each of three elemental properties.
# Labels follow the pattern 'MagpieData <statistic> <property>'.
magpie_stats = ('minimum', 'maximum', 'range', 'mean')
magpie_properties = ('Number', 'CovalentRadius', 'Electronegativity')

domain_features = base_features + [
    f'MagpieData {stat} {prop}'
    for prop in magpie_properties
    for stat in magpie_stats
]
domain_features

['crystal_system',
 'space_group',
 'band_gap',
 'volume',
 'density',
 'density_atomic',
 'formation_energy_per_atom',
 'MagpieData minimum Number',
 'MagpieData maximum Number',
 'MagpieData range Number',
 'MagpieData mean Number',
 'MagpieData minimum CovalentRadius',
 'MagpieData maximum CovalentRadius',
 'MagpieData range CovalentRadius',
 'MagpieData mean CovalentRadius',
 'MagpieData minimum Electronegativity',
 'MagpieData maximum Electronegativity',
 'MagpieData range Electronegativity',
 'MagpieData mean Electronegativity']

In [37]:
# Keep the hand-picked domain features followed by the three e_* columns.
domain_data = data_featurized.loc[
    :, [*domain_features, 'e_electronic', 'e_ionic', 'e_total']
]

In [38]:
# Preview the domain-feature table
domain_data.head(n=5)

Unnamed: 0,crystal_system,space_group,band_gap,volume,density,density_atomic,formation_energy_per_atom,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,...,MagpieData maximum CovalentRadius,MagpieData range CovalentRadius,MagpieData mean CovalentRadius,MagpieData minimum Electronegativity,MagpieData maximum Electronegativity,MagpieData range Electronegativity,MagpieData mean Electronegativity,e_electronic,e_ionic,e_total
0,4,11,0.7792,316.341404,5.022717,22.595815,-1.124079,16.0,56.0,40.0,...,215.0,110.0,130.428571,0.89,2.58,1.69,2.23,9.929365,7.118969,17.048334
1,3,19,2.898,253.915299,3.764366,15.869706,-3.099174,8.0,41.0,33.0,...,164.0,107.0,88.25,1.6,3.98,2.38,3.115,4.102533,13.469477,17.57201
2,5,1,1.7212,196.220495,3.094976,28.031499,-0.7661,16.0,38.0,22.0,...,195.0,90.0,121.857143,0.95,2.58,1.63,2.232857,6.045051,12.443616,18.488667
3,4,12,2.0944,620.336826,2.90126,12.923684,-1.948264,8.0,26.0,18.0,...,166.0,100.0,86.333333,0.93,3.44,2.51,2.953333,3.09612,6.499905,9.596025
4,3,62,7.4812,374.200384,1.860992,9.35501,-1.970766,1.0,9.0,8.0,...,84.0,53.0,50.7,2.04,3.98,1.94,2.98,1.866232,4.350314,6.216546


In [43]:
# Collect every Magpie-generated feature column. The columns are selected by
# their 'MagpieData' label prefix instead of a hard-coded positional offset
# (columns[15:]), so the selection does not silently shift when a column is
# added to or removed from the frame upstream.
magpie_mask = data_featurized.columns.str.startswith('MagpieData')
composition_features = data_featurized.columns[magpie_mask]
composition_features

Index(['MagpieData minimum Number', 'MagpieData maximum Number',
       'MagpieData range Number', 'MagpieData mean Number',
       'MagpieData avg_dev Number', 'MagpieData mode Number',
       'MagpieData minimum MendeleevNumber',
       'MagpieData maximum MendeleevNumber',
       'MagpieData range MendeleevNumber', 'MagpieData mean MendeleevNumber',
       ...
       'MagpieData range GSmagmom', 'MagpieData mean GSmagmom',
       'MagpieData avg_dev GSmagmom', 'MagpieData mode GSmagmom',
       'MagpieData minimum SpaceGroupNumber',
       'MagpieData maximum SpaceGroupNumber',
       'MagpieData range SpaceGroupNumber', 'MagpieData mean SpaceGroupNumber',
       'MagpieData avg_dev SpaceGroupNumber',
       'MagpieData mode SpaceGroupNumber'],
      dtype='object', length=132)

In [40]:
# Keep all composition-derived features followed by the three e_* columns.
composition_data = data_featurized.loc[
    :, [*composition_features, 'e_electronic', 'e_ionic', 'e_total']
]

In [41]:
# Preview the composition-feature table
composition_data.head(n=5)

Unnamed: 0,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,MagpieData maximum MendeleevNumber,MagpieData range MendeleevNumber,MagpieData mean MendeleevNumber,...,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber,e_electronic,e_ionic,e_total
0,16.0,56.0,40.0,30.285714,16.326531,16.0,9.0,88.0,79.0,69.285714,...,0.0,70.0,229.0,159.0,137.0,76.571429,70.0,9.929365,7.118969,17.048334
1,8.0,41.0,33.0,16.5,12.25,8.0,47.0,93.0,46.0,78.5,...,0.0,12.0,229.0,217.0,67.0,81.0,12.0,4.102533,13.469477,17.57201
2,16.0,38.0,22.0,24.0,9.142857,16.0,8.0,88.0,80.0,75.428571,...,0.0,70.0,225.0,155.0,119.571429,56.653061,70.0,6.045051,12.443616,18.488667
3,8.0,26.0,18.0,11.083333,4.125,8.0,2.0,88.0,86.0,77.416667,...,0.0,12.0,229.0,217.0,57.833333,61.111111,12.0,3.09612,6.499905,9.596025
4,1.0,9.0,8.0,5.2,3.4,1.0,72.0,93.0,21.0,89.4,...,0.0,15.0,194.0,179.0,119.6,83.68,15.0,1.866232,4.350314,6.216546


In [42]:
# Persist both feature tables as CSV files; index=False leaves the row index
# out of the written files.
domain_data.to_csv(path_or_buf='./data/domain_based_data.csv', index=False)
composition_data.to_csv(path_or_buf='./data/composition_based_data.csv', index=False)