# Matching Materials Project (MP) Band-Gap Data to Experimental Data

In [110]:
import os

import pandas as pd
from matminer.featurizers.conversions import StrToComposition
from pymatgen.core.composition import Composition

In [3]:
# Load the two band-gap tables (MP values and experimental values) in one pass.
mp_df, exp_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./mp_band_gap.csv', './exp_band_gap.csv')
)

In [4]:
# Peek at the first five MP rows.
mp_df.head(n=5)

Unnamed: 0,composition,band_gap
0,Ac1Ag1Te2,0.0794
1,Ac1Al1O3,4.1024
2,Ac1Cr1O3,2.0031
3,Ac1Fe1O3,0.9888
4,Ac1Ga1O3,2.8959


In [28]:
# Parse the formula strings in the `composition` column into pymatgen
# Composition objects, written to a new `composition_type` column.
# StrToComposition is imported in the notebook's import cell.
# NOTE(review): with ignore_errors=True a formula that fails to parse does not
# raise here; per the matminer docs the row gets NaN instead — confirm and
# check for missing values before relying on `composition_type`.
stc = StrToComposition(target_col_id='composition_type')
mp_trans_df = stc.featurize_dataframe(mp_df, 'composition', ignore_errors=True)

StrToComposition:   0%|          | 0/46071 [00:00<?, ?it/s]

In [29]:
# Keep the original formula string under `composition_` so that the name
# `composition` is free for the reduced composition assigned in a later cell.
# The guard keeps the cell safe to re-run: once `composition` has been
# re-created, renaming it again would leave two columns called `composition_`.
if 'composition_' not in mp_trans_df.columns:
    mp_trans_df = mp_trans_df.rename(columns={'composition': 'composition_'})

In [91]:
def _reduce(comp):
    """Return the reduced form of a pymatgen Composition."""
    return comp.reduced_composition


# Join key: the reduced composition of every parsed MP entry.
mp_trans_df['composition'] = mp_trans_df['composition_type'].map(_reduce)

In [None]:
# Column guide for mp_trans_df:
#   composition_     -> full composition (the original formula string)
#   composition_type -> pymatgen composition object
#   composition      -> reduced composition
mp_trans_df.head(n=30)

Unnamed: 0,composition_,band_gap,composition_type,composition
0,Ac1Ag1Te2,0.0794,"(Ac, Ag, Te)","(Ac, Ag, Te)"
1,Ac1Al1O3,4.1024,"(Ac, Al, O)","(Ac, Al, O)"
2,Ac1Cr1O3,2.0031,"(Ac, Cr, O)","(Ac, Cr, O)"
3,Ac1Fe1O3,0.9888,"(Ac, Fe, O)","(Ac, Fe, O)"
4,Ac1Ga1O3,2.8959,"(Ac, Ga, O)","(Ac, Ga, O)"
5,Ac1Ga1Te2,0.491,"(Ac, Ga, Te)","(Ac, Ga, Te)"
6,Ac1H3,0.6407,"(Ac, H)","(Ac, H)"
7,Ac1In1Te2,0.3799,"(Ac, In, Te)","(Ac, In, Te)"
8,Ac1Mg149,0.2168,"(Ac, Mg)","(Ac, Mg)"
9,Ac1O1F1,4.2566,"(Ac, O, F)","(Ac, O, F)"


In [102]:
# How many MP entries share each reduced composition.
mp_composition_counts = mp_trans_df['composition'].value_counts()
mp_composition_counts

composition
(Si, O)              31
(Cd, I)              24
(Zn, S)              24
(Si, C)              23
(S)                  14
                     ..
(Zr, P, W, O)         1
(Zr, Te, Br)          1
(Zr, V, O)            1
(Zr, Zn, P, S, O)     1
(Ac, Ga, Te)          1
Name: count, Length: 40652, dtype: int64

In [128]:
# Number of distinct reduced compositions (missing values, if any, count as one).
mp_trans_df['composition'].nunique(dropna=False)

40652

In [5]:
# Peek at the first five experimental rows.
exp_df.head(n=5)

Unnamed: 0,composition,band_gap
0,Ag0.1Cd0.8In2.1Te4,0.29
1,Ag0.25Cd0.5In2.25Te4,0.39
2,Ag0.2Cd0.75In2.1Te4,0.54
3,Ag0.4Cd0.2In2.4Te4,0.42
4,Ag0.4Cd0.5In2.2Te4,0.19


In [55]:
# (rows, columns) of the experimental table.
exp_df.shape

(2422, 2)

- Convert the composition strings to pymatgen `Composition` objects, then match again

In [66]:
# Reuse the StrToComposition featurizer created earlier (`stc`) to parse the
# experimental formula strings into its `composition_type` target column.
exp_trans_df = stc.featurize_dataframe(
    df=exp_df,
    col_id='composition',
    ignore_errors=True,
)

StrToComposition:   0%|          | 0/2422 [00:00<?, ?it/s]

In [95]:
# Keep the raw formula string as `composition_` and expose the parsed pymatgen
# object as `composition`, the column later used as the merge key.  The guard
# makes the cell safe to re-run after the rename has already happened.
if 'composition_type' in exp_trans_df.columns:
    exp_trans_df = exp_trans_df.rename(
        columns={'composition': 'composition_', 'composition_type': 'composition'}
    )

# mp_trans_df['composition'] holds *reduced* compositions, so the experimental
# key is reduced as well; otherwise an experimental formula with a common
# integer factor would never match its MP counterpart.  Non-Composition values
# (e.g. NaN left by ignore_errors=True) are passed through unchanged.
exp_trans_df['composition'] = exp_trans_df['composition'].map(
    lambda comp: comp.reduced_composition if isinstance(comp, Composition) else comp
)

In [96]:
# Peek at the first five rows of the converted experimental table.
exp_trans_df.head(n=5)

Unnamed: 0,composition_,band_gap,composition
0,Ag0.1Cd0.8In2.1Te4,0.29,"(Ag, Cd, In, Te)"
1,Ag0.25Cd0.5In2.25Te4,0.39,"(Ag, Cd, In, Te)"
2,Ag0.2Cd0.75In2.1Te4,0.54,"(Ag, Cd, In, Te)"
3,Ag0.4Cd0.2In2.4Te4,0.42,"(Ag, Cd, In, Te)"
4,Ag0.4Cd0.5In2.2Te4,0.19,"(Ag, Cd, In, Te)"


In [107]:
# How many experimental entries share each composition.
exp_composition_counts = exp_trans_df['composition'].value_counts()
exp_composition_counts

composition
(Ge, Sn)           2
(In, As, Ga, P)    2
(Pb, Se, Sn)       2
(Pb, Se, Sn)       2
(Pb, Se, Sn)       2
                  ..
(Ag, Al, Te)       1
(Ag, As, S)        1
(Ag, As, Se)       1
(Ag, As, Te)       1
(Zn, In, Se)       1
Name: count, Length: 2413, dtype: int64

In [112]:
# Inner-join the MP and experimental tables on the `composition` column.
# pandas appends `_x` (left frame: MP) and `_y` (right frame: experiment) to the
# clashing column names, so the two band-gap columns are renamed to say which
# source they come from.
match_df = (
    mp_trans_df
    .merge(exp_trans_df, how='inner', on='composition')
    .rename(columns={'band_gap_x': 'mp_bandgap', 'band_gap_y': 'exp_bandgap'})
)

In [113]:
# (rows, columns) of the merged MP/experimental table.
match_df.shape

(1612, 6)

In [119]:
# One group per matched composition.
groups = match_df.groupby(by='composition')

In [127]:
 # Display every matched composition group that has more than MIN_GROUP_ROWS rows.
# (This is a size threshold, not a "top 5" selection.)
MIN_GROUP_ROWS = 10
for name, group in groups:
    if len(group) > MIN_GROUP_ROWS:
        display(group)


Unnamed: 0,composition__x,mp_bandgap,composition_type,composition,composition__y,exp_bandgap
314,Cd1I2,2.4195,"(Cd, I)","(Cd, I)",CdI2,3.8
320,Cd10I20,2.375475,"(Cd, I)","(Cd, I)",CdI2,3.8
321,Cd11I22,2.34564,"(Cd, I)","(Cd, I)",CdI2,3.8
323,Cd12I24,2.377973,"(Cd, I)","(Cd, I)",CdI2,3.8
324,Cd13I26,2.40514,"(Cd, I)","(Cd, I)",CdI2,3.8
325,Cd14I28,2.338283,"(Cd, I)","(Cd, I)",CdI2,3.8
326,Cd15I30,2.352333,"(Cd, I)","(Cd, I)",CdI2,3.8
327,Cd16I32,2.410933,"(Cd, I)","(Cd, I)",CdI2,3.8
328,Cd17I34,2.37065,"(Cd, I)","(Cd, I)",CdI2,3.8
329,Cd18I36,2.3743,"(Cd, I)","(Cd, I)",CdI2,3.8


Unnamed: 0,composition__x,mp_bandgap,composition_type,composition,composition__y,exp_bandgap
1541,Zn1S1,2.0192,"(Zn, S)","(Zn, S)",ZnS,3.704
1544,Zn10S10,2.02594,"(Zn, S)","(Zn, S)",ZnS,3.704
1546,Zn12S12,2.0224,"(Zn, S)","(Zn, S)",ZnS,3.704
1547,Zn14S14,1.73975,"(Zn, S)","(Zn, S)",ZnS,3.704
1549,Zn16S16,2.0283,"(Zn, S)","(Zn, S)",ZnS,3.704
1550,Zn18S18,2.00845,"(Zn, S)","(Zn, S)",ZnS,3.704
1562,Zn2S2,2.0701,"(Zn, S)","(Zn, S)",ZnS,3.704
1570,Zn20S20,2.021067,"(Zn, S)","(Zn, S)",ZnS,3.704
1571,Zn22S22,2.011233,"(Zn, S)","(Zn, S)",ZnS,3.704
1574,Zn24S24,2.022154,"(Zn, S)","(Zn, S)",ZnS,3.704
