# Generate features by composition 

In [1]:
import pandas as pd

# Load the band-gap tables for the three sources (MP, AFLOW, experiment).
mp_df, aflow_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in ("mp_band_gap.csv", "aflow_band_gap.csv", "exp_band_gap.csv")
)

In [2]:
# Compare MP and AFLOW on the compositions both sources contain.
# The inner merge keeps only shared compositions; pandas suffixes the two
# clashing band_gap columns with _x (left, MP) and _y (right, AFLOW).
same_df = (
    mp_df
    .merge(aflow_df, on='composition', how='inner')
    .rename(columns={'band_gap_x': 'mp_band_gap', 'band_gap_y': 'aflow_band_gap'})
)
# Absolute disagreement between the two band-gap values.
same_df['diff'] = (same_df['mp_band_gap'] - same_df['aflow_band_gap']).abs()
same_df.head(10)

Unnamed: 0,composition,mp_band_gap,aflow_band_gap,diff
0,F4,2.8962,1.763767,1.132433
1,O2,1.3227,1.5477,0.225
2,O8,2.285,1.3985,0.8865
3,Cl4,2.5434,2.377857,0.165543
4,Cl4F4,2.0897,2.0695,0.0202
5,Cl4F12,2.6321,2.5712,0.0609
6,Cl4O12,1.2103,1.1917,0.0186
7,Cl4O14,3.6753,3.677,0.0017
8,Cl8F24,2.5175,2.7251,0.2076
9,Cl8O16,0.9646,0.98196,0.01736


In [3]:
# Summary statistics of the two band-gap columns and their absolute difference.
same_df.describe()

Unnamed: 0,mp_band_gap,aflow_band_gap,diff
count,1479.0,1479.0,1479.0
mean,1.746205,1.83492,0.226988
std,1.2417,1.218115,0.420932
min,0.0001,0.0871,0.0001
25%,0.78025,0.89205,0.0197
50%,1.5096,1.60355,0.084
75%,2.38545,2.49103,0.251325
max,5.9894,5.993091,4.2459


In [4]:
# Count the shared compositions whose MP/AFLOW disagreement exceeds 1.0,
# and report that count as a fraction of all shared compositions.
n_large_diff = int((same_df['diff'] > 1.0).sum())
print("Num of diff > 1.0: ", n_large_diff)
print("Ratio of diff > 1.0: ", n_large_diff / len(same_df))

Num of diff > 1.0:  70
Ratio of diff > 1.0:  0.04732927653820149


In [5]:
# Drop from both DFT sources every composition on which they disagree by more than 1.0.
outlier_compositions = same_df.loc[same_df['diff'] > 1.0, 'composition']
mp_df = mp_df[~mp_df['composition'].isin(outlier_compositions)]
aflow_df = aflow_df[~aflow_df['composition'].isin(outlier_compositions)]
print("Num of mp_df: ", len(mp_df))
print("Num of aflow_df: ", len(aflow_df))

Num of mp_df:  45996
Num of aflow_df:  13556


In [6]:
# MP and AFLOW are both DFT-calculated data, so stack them into one frame.
dft_sources = [mp_df, aflow_df]
dft_df = pd.concat(dft_sources, ignore_index=True)
dft_df.shape

(59552, 2)

In [7]:
# A composition present in both MP and AFLOW now has two band-gap rows;
# collapse them to one row per composition using the mean value.
dft_df = (
    dft_df
    .groupby("composition")["band_gap"]
    .mean()
    .reset_index()
)
dft_df.shape

(58143, 2)

## Feature Engineering

In [8]:
# Rename the raw formula column so the name `composition` is free for the
# parsed composition objects added later.
formula_column_rename = {'composition': 'composition_str'}
dft_df = dft_df.rename(columns=formula_column_rename)
exp_df = exp_df.rename(columns=formula_column_rename)

In [9]:
# Preview the averaged DFT frame after the column rename.
dft_df.head()

Unnamed: 0,composition_str,band_gap
0,Ac16S24,2.2962
1,Ac1Ag1Te2,0.0794
2,Ac1Al1O3,4.1024
3,Ac1Cr1O3,2.0031
4,Ac1Fe1O3,0.9888


In [10]:
# Parse the formula strings in `composition_str` into composition objects,
# stored in a new `composition` column (set via target_col_id).
from matminer.featurizers.conversions import StrToComposition

# `stc` is reused in a later cell to convert the experimental frame as well.
stc = StrToComposition(target_col_id='composition')
dft_df = stc.featurize_dataframe(dft_df, col_id="composition_str", pbar=True)

  from .autonotebook import tqdm as notebook_tqdm
StrToComposition: 100%|██████████| 58143/58143 [00:15<00:00, 3760.06it/s]


In [11]:
# Apply the same string-to-composition conversion to the experimental data.
exp_df = stc.featurize_dataframe(exp_df, col_id="composition_str", pbar=True)

StrToComposition: 100%|██████████| 2417/2417 [00:00<00:00, 6226.87it/s] 


In [12]:
# Check that the parsed `composition` column now sits alongside `composition_str`.
dft_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Ac16S24,2.2962,"(Ac, S)"
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)"
2,Ac1Al1O3,4.1024,"(Ac, Al, O)"
3,Ac1Cr1O3,2.0031,"(Ac, Cr, O)"
4,Ac1Fe1O3,0.9888,"(Ac, Fe, O)"


In [13]:
# Same check for the experimental frame.
exp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Se1S1,1.9,"(Se, S)"
1,C1Br4,3.7,"(C, Br)"
2,C1I4,2.26,"(C, I)"
3,W1O3,2.8,"(W, O)"
4,W1Se2,1.45,"(W, Se)"


In [14]:
# Composition-based features from the predefined 'magpie' elemental-property
# preset (other presets, e.g. jarvis, also exist).
# NOTE: slowest cell in the notebook (about two minutes for ~58k rows).
from matminer.featurizers.composition import ElementProperty

# Set impute_nan explicitly: matminer warns that its default will switch to
# True in a future release, which would silently change the generated feature
# values. Pinning False keeps the current behaviour and makes the saved
# features reproducible across matminer versions.
featurizer = ElementProperty.from_preset('magpie', impute_nan=False)
dft_featurized_df = featurizer.featurize_dataframe(dft_df, col_id='composition')

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
ElementProperty: 100%|██████████| 58143/58143 [01:52<00:00, 516.31it/s]


In [15]:
# Preview the DFT frame with the MagpieData feature columns appended.
dft_featurized_df.head()

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ac16S24,2.2962,"(Ac, S)",16.0,89.0,73.0,45.2,35.04,16.0,14.0,...,0.0,0.0,0.0,0.0,70.0,225.0,155.0,132.0,74.4,70.0
1,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)",47.0,89.0,42.0,60.0,14.5,52.0,14.0,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
2,Ac1Al1O3,4.1024,"(Ac, Al, O)",8.0,89.0,81.0,25.2,25.52,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,97.2,102.24,12.0
3,Ac1Cr1O3,2.0031,"(Ac, Cr, O)",8.0,89.0,81.0,27.4,24.64,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.0,103.2,12.0
4,Ac1Fe1O3,0.9888,"(Ac, Fe, O)",8.0,89.0,81.0,27.8,24.48,8.0,14.0,...,2.110663,0.422133,0.675412,0.0,12.0,229.0,217.0,98.0,103.2,12.0


In [16]:
# Featurize the experimental data with the same featurizer instance so both
# frames share an identical feature set, then preview the result.
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')
exp_featurized_df.head()

ElementProperty: 100%|██████████| 2417/2417 [00:04<00:00, 541.51it/s]


Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Se1S1,1.9,"(Se, S)",16.0,34.0,18.0,25.0,9.0,16.0,88.0,...,0.0,0.0,0.0,0.0,14.0,70.0,56.0,42.0,28.0,14.0
1,C1Br4,3.7,"(C, Br)",6.0,35.0,29.0,29.2,9.28,35.0,77.0,...,0.0,0.0,0.0,0.0,64.0,194.0,130.0,90.0,41.6,64.0
2,C1I4,2.26,"(C, I)",6.0,53.0,47.0,43.6,15.04,53.0,77.0,...,0.0,0.0,0.0,0.0,64.0,194.0,130.0,90.0,41.6,64.0
3,W1O3,2.8,"(W, O)",8.0,74.0,66.0,24.5,24.75,8.0,51.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,66.25,81.375,12.0
4,W1Se2,1.45,"(W, Se)",34.0,74.0,40.0,47.333333,17.777778,34.0,51.0,...,0.0,0.0,0.0,0.0,14.0,229.0,215.0,85.666667,95.555556,14.0


In [17]:
# Save both featurized frames as CSV files under ./feature_data.
import os

# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check (which can race with another process) with a single
# idempotent call.
os.makedirs("./feature_data", exist_ok=True)
dft_featurized_df.to_csv("./feature_data/dft_featurized.csv", index=False)
exp_featurized_df.to_csv("./feature_data/exp_featurized.csv", index=False)

In [18]:
# Distribution of the number of elements per composition -- DFT data.
dft_element_counts = dft_featurized_df['composition'].apply(lambda comp: len(comp.elements))
dft_element_counts.describe()

count    58143.000000
mean         3.759765
std          0.954599
min          1.000000
25%          3.000000
50%          4.000000
75%          4.000000
max          8.000000
Name: composition, dtype: float64

In [19]:
# Distribution of the number of elements per composition -- experimental data.
exp_element_counts = exp_featurized_df['composition'].apply(lambda comp: len(comp.elements))
exp_element_counts.describe()

count    2417.000000
mean        3.186181
std         0.762375
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: composition, dtype: float64

- DFT calculation data: each composition contains between 1 and 8 elements.
- Experimental data: each composition contains between 2 and 4 elements.