# Generate features by composition 

In [1]:
import pandas as pd

# Load the mp, aflow and exp band-gap tables from their CSV files (read in this order).
mp_df, aflow_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in ("mp_band_gap.csv", "aflow_band_gap.csv", "exp_band_gap.csv")
)

In [2]:
# Compare mp and aflow on the compositions that appear in both tables.
same_df = (
    mp_df.merge(aflow_df, on='composition', how='inner')
    .rename(columns={'band_gap_x': 'mp_band_gap', 'band_gap_y': 'aflow_band_gap'})
)
# Absolute disagreement between the two band gaps for each shared composition.
same_df['diff'] = (same_df['mp_band_gap'] - same_df['aflow_band_gap']).abs()
same_df.head(10)

Unnamed: 0,composition,mp_band_gap,aflow_band_gap,diff
0,F2,2.8962,1.763767,1.132433
1,O2,1.426,1.3985,0.0275
2,Cl1F1,2.0897,2.0695,0.0202
3,Cl1F3,2.5175,2.7251,0.2076
4,Cl1O2,0.9646,0.98196,0.01736
5,Cl1O3,1.2103,1.1917,0.0186
6,Cl2,2.5434,2.377857,0.165543
7,Cl2O7,3.6753,3.677,0.0017
8,N1O2,2.781,2.4494,0.3316
9,N2O5,0.6751,0.6446,0.0305


In [3]:
# Summary statistics of the mp/aflow band gaps and their absolute difference.
same_df.describe()

Unnamed: 0,mp_band_gap,aflow_band_gap,diff
count,1290.0,1290.0,1290.0
mean,1.69077,1.776453,0.251554
std,1.218918,1.195196,0.447116
min,0.0001,0.0871,0.0001
25%,0.739975,0.864507,0.022825
50%,1.4566,1.54255,0.0961
75%,2.324425,2.38351,0.294466
max,5.9647,5.993091,4.7034


In [4]:
# Count the shared compositions whose mp and aflow band gaps differ by more than 1.0,
# and report that count as a fraction of all shared compositions.
large_diff_mask = same_df['diff'] > 1.0
n_large_diff = int(large_diff_mask.sum())
print("Num of diff > 1.0: ", n_large_diff)
print("Ratio of diff > 1.0: ", n_large_diff / len(same_df))

Num of diff > 1.0:  66
Ratio of diff > 1.0:  0.05116279069767442


In [5]:
# Remove from both mp_df and aflow_df every composition whose mp/aflow band gaps
# differ by more than 1.0.
outlier_compositions = same_df.loc[same_df['diff'] > 1.0, 'composition']
mp_df = mp_df[~mp_df['composition'].isin(outlier_compositions)]
aflow_df = aflow_df[~aflow_df['composition'].isin(outlier_compositions)]
print("Num of mp_df: ", len(mp_df))
print("Num of aflow_df: ", len(aflow_df))

Num of mp_df:  40586
Num of aflow_df:  12551


In [6]:
# mp and aflow are both DFT-calculated data sets, so stack them into a single table.
dft_sources = [mp_df, aflow_df]
dft_df = pd.concat(dft_sources, ignore_index=True)
dft_df.shape

(53137, 2)

In [7]:
# A composition present in both mp and aflow carries two different band gaps;
# collapse such duplicates into one row holding their mean.
dft_df = dft_df.groupby("composition")["band_gap"].mean().reset_index()
dft_df.shape

(51913, 2)

## Feature Engineering

In [8]:
# Rename the 'composition' column to 'composition_str' in both tables.
composition_rename = {'composition': 'composition_str'}
dft_df = dft_df.rename(columns=composition_rename)
exp_df = exp_df.rename(columns=composition_rename)

In [9]:
# Preview the first rows of the combined DFT table after the column rename.
dft_df.head()

Unnamed: 0,composition_str,band_gap
0,Ac1Ag1Te2,0.0794
1,Ac1Al1O3,4.1024
2,Ac1Br1O1,4.241
3,Ac1Br3,4.10585
4,Ac1Cl1O1,4.4451


In [10]:
# Convert the 'composition_str' strings into a new 'composition' column
# using matminer's StrToComposition converter.
from matminer.featurizers.conversions import StrToComposition

# target_col_id names the column that the converter adds to the dataframe.
stc = StrToComposition(target_col_id='composition')
# pbar=True displays a progress bar while the rows are converted.
# NOTE(review): re-running this cell on an already-converted dft_df may fail or
# duplicate the 'composition' column — confirm against matminer's behaviour.
dft_df = stc.featurize_dataframe(dft_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/51913 [00:00<?, ?it/s]

In [11]:
# Apply the same string-to-composition conversion (stc from the previous code cell)
# to the experimental table.
exp_df = stc.featurize_dataframe(exp_df, col_id="composition_str", pbar=True)

StrToComposition:   0%|          | 0/2414 [00:00<?, ?it/s]

In [12]:
# Preview the DFT table, which now includes the 'composition' column.
dft_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)"
1,Ac1Al1O3,4.1024,"(Ac, Al, O)"
2,Ac1Br1O1,4.241,"(Ac, Br, O)"
3,Ac1Br3,4.10585,"(Ac, Br)"
4,Ac1Cl1O1,4.4451,"(Ac, Cl, O)"


In [13]:
# Preview the experimental table, which now includes the 'composition' column.
exp_df.head()

Unnamed: 0,composition_str,band_gap,composition
0,Se1S1,1.9,"(Se, S)"
1,C1Br4,3.7,"(C, Br)"
2,C1I4,2.26,"(C, I)"
3,W1O3,2.8,"(W, O)"
4,W1Se2,1.45,"(W, Se)"


In [14]:
# Composition-based features for the DFT table.
# NOTE: this cell takes a very long time to run.
from matminer.featurizers.composition import ElementProperty
# Build the featurizer from a predefined preset (e.g. magpie, jarvis, etc.); 'magpie' is used here.
featurizer = ElementProperty.from_preset('magpie')
# Featurize using the 'composition' column of dft_df.
dft_featurized_df = featurizer.featurize_dataframe(dft_df, col_id='composition')

ElementProperty:   0%|          | 0/51913 [00:00<?, ?it/s]

In [15]:
# Preview the featurized DFT table (the original columns plus the MagpieData feature columns).
dft_featurized_df.head()

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)",47.0,89.0,42.0,60.0,14.5,52.0,14.0,...,0.0,0.0,0.0,0.0,152.0,225.0,73.0,188.5,36.5,152.0
1,Ac1Al1O3,4.1024,"(Ac, Al, O)",8.0,89.0,81.0,25.2,25.52,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,97.2,102.24,12.0
2,Ac1Br1O1,4.241,"(Ac, Br, O)",8.0,89.0,81.0,44.0,30.0,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,100.333333,83.111111,12.0
3,Ac1Br3,4.10585,"(Ac, Br)",35.0,89.0,54.0,48.5,20.25,35.0,14.0,...,0.0,0.0,0.0,0.0,64.0,225.0,161.0,104.25,60.375,64.0
4,Ac1Cl1O1,4.4451,"(Ac, Cl, O)",8.0,89.0,81.0,38.0,34.0,8.0,14.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,100.333333,83.111111,12.0


In [16]:
# Compute the same features for the experimental table, reusing the
# featurizer created in the previous featurization cell.
exp_featurized_df = featurizer.featurize_dataframe(exp_df, col_id='composition')
exp_featurized_df.head()

ElementProperty:   0%|          | 0/2414 [00:00<?, ?it/s]

Unnamed: 0,composition_str,band_gap,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,MagpieData mode Number,MagpieData minimum MendeleevNumber,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,Se1S1,1.9,"(Se, S)",16.0,34.0,18.0,25.0,9.0,16.0,88.0,...,0.0,0.0,0.0,0.0,14.0,70.0,56.0,42.0,28.0,14.0
1,C1Br4,3.7,"(C, Br)",6.0,35.0,29.0,29.2,9.28,35.0,77.0,...,0.0,0.0,0.0,0.0,64.0,194.0,130.0,90.0,41.6,64.0
2,C1I4,2.26,"(C, I)",6.0,53.0,47.0,43.6,15.04,53.0,77.0,...,0.0,0.0,0.0,0.0,64.0,194.0,130.0,90.0,41.6,64.0
3,W1O3,2.8,"(W, O)",8.0,74.0,66.0,24.5,24.75,8.0,51.0,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,66.25,81.375,12.0
4,W1Se2,1.45,"(W, Se)",34.0,74.0,40.0,47.333333,17.777778,34.0,51.0,...,0.0,0.0,0.0,0.0,14.0,229.0,215.0,85.666667,95.555556,14.0


In [17]:
# Save both featurized tables as CSV files under ./feature_data.
import os
# exist_ok=True creates the directory when it is missing and does nothing when it
# already exists, replacing the separate check-then-create os.path.exists() step.
os.makedirs("./feature_data", exist_ok=True)
# index=False: do not write the DataFrame index as an extra column.
dft_featurized_df.to_csv("./feature_data/dft_featurized.csv", index=False)
exp_featurized_df.to_csv("./feature_data/exp_featurized.csv", index=False)

In [18]:
# Number of elements in each composition of the DFT data, summarised with describe().
dft_element_counts = dft_featurized_df['composition'].apply(lambda comp: len(comp.elements))
dft_element_counts.describe()

count    51913.000000
mean         3.814960
std          0.944345
min          1.000000
25%          3.000000
50%          4.000000
75%          4.000000
max          8.000000
Name: composition, dtype: float64

In [19]:
# Number of elements in each composition of the experimental data, summarised with describe().
exp_element_counts = exp_featurized_df['composition'].apply(lambda comp: len(comp.elements))
exp_element_counts.describe()

count    2414.000000
mean        3.186827
std         0.761904
min         2.000000
25%         3.000000
50%         3.000000
75%         4.000000
max         4.000000
Name: composition, dtype: float64

- The DFT-calculated compositions contain between 1 and 8 distinct elements.
- The experimental compositions contain between 2 and 4 distinct elements.