In [24]:
cd

/Users/emiljaffal


In [46]:
import pandas as pd

file_path = "desktop/data.xlsx"
data = pd.read_excel(file_path)
data

Unnamed: 0,Formula,Entry,Entry prototype,Structure class,Inv. center,"Unit cell volume, A^3",Density (calc.) [Mg m‰…]
0,CeGa,261037,1,1,1,210.590,6.62
1,CeGa,526690,1,1,1,212.790,6.55
2,CeGa,450659,1,1,1,212.790,6.55
3,CeGa,450525,1,1,1,215.020,6.48
4,CeGa,528215,1,1,1,215.030,6.48
...,...,...,...,...,...,...,...
1447,NiPt,1627941,10,4,1,26.500,15.91
1448,NiPt,455847,10,4,1,26.473,15.92
1449,SnYb,450594,10,4,1,54.147,8.95
1450,SnYb,1823352,10,4,1,54.800,8.84


## How to train chemical formula only datasets

For each chemical formula, we can obtain 438 descriptors consisting of features such as average electronegativity, average boiling points of elements etc. An example of getting descriptors is shown below:


In [47]:
import numpy as np
from jarvis.core.composition import Composition
from jarvis.core.specie import Specie
from jarvis.ai.pkgs.lgbm.regression import regression
from jarvis.ai.descriptors.cfid import get_chem_only_descriptors

formula_unitcell = data[['Formula', 'Unit cell volume, A^3']]

# Convert to NumPy array
X = []
Y = []
IDs = []
for ii, i in enumerate(formula_unitcell):
    X.append(get_chem_only_descriptors(i[0]))
    Y.append(i[1])
    IDs.append(ii)

X = np.array(X)
Y = np.array(Y).reshape(-1, 1)
IDs = np.array(IDs)

In [48]:
X

array([[['42.515', '85.03255', '11.48', '-7.798403199999999', '1.0',
         '1.5756031999999998', '2.0', '212.78211583999996',
         '126.35379061371842', '-50.03', '0.2785', '13.054400000000001',
         '5.093', '0.0', '3.4723', '1932.490974729242', '7.26', '96.23',
         '0.0', '7.2', '13.4', '4.5', '0.0', '1.5', '0.0', '2.9', '4.5',
         '15.1', '50.25', '3.28255', '-14.142799999999998', '-11.1723',
         '-7.219999999999999', '17.42025', '20.9228',
         '0.9170665458873933', '-0.05700000000000005', '2.047',
         '3.2463', '1.618', '62.314762496', '22.2784032',
         '-3.2523000000000004', '0.02631800129391927',
         '0.14285714285714285', '-1.0', '9.7044996',
         '15.294285714285715', '4392.156862745098',
         '2.8176052185269973', '-53.5023', '5.0', '10.7',
         '196.078431372549', '53.53', '0.50255', '22.4984032',
         '37.9968064', '0.0051', '2.21686', '17.4228', '89.01',
         '-0.4723', '13.922799999999999', '16.8658', '-3.97

### Names of the 1557 chemical force-field inspired descriptors

### Further breakdown here: 
    https://jarvis-tools.readthedocs.io/en/master/autoapi/jarvis/ai/descriptors/cfid/index.html 

In [50]:
import jarvis
jarvis.ai.descriptors.cfid.feat_names()

['bp_mult_atom_rad',
 'hfus_add_bp',
 'elec_aff_mult_voro_coord',
 'mol_vol_subs_atom_mass',
 'is_halogen',
 'atom_mass_subs_first_ion_en',
 'row',
 'mol_vol_mult_atom_mass',
 'voro_coord_divi_therm_cond',
 'voro_coord_subs_mp',
 'polzbl_mult_atom_rad',
 'elec_aff_mult_X',
 'GV',
 'nfunfill',
 'voro_coord_subs_therm_cond',
 'mp_divi_therm_cond',
 'elec_aff_add_X',
 'mol_vol_add_bp',
 'C-9',
 'C-8',
 'C-7',
 'C-6',
 'C-5',
 'C-4',
 'C-3',
 'C-2',
 'C-1',
 'C-0',
 'mp_subs_elec_aff',
 'hfus_add_elec_aff',
 'elec_aff_subs_first_ion_en',
 'therm_cond_subs_mol_vol',
 'X_subs_mol_vol',
 'first_ion_en_subs_hfus',
 'first_ion_en_add_voro_coord',
 'first_ion_en_divi_atom_mass',
 'atom_rad_subs_polzbl',
 'me1',
 'me3',
 'me2',
 'elec_aff_mult_atom_mass',
 'elec_aff_add_atom_mass',
 'therm_cond_subs_elec_aff',
 'atom_rad_divi_atom_mass',
 'atom_rad_divi_voro_coord',
 'max_oxid_s',
 'polzbl_mult_first_ion_en',
 'mp_divi_voro_coord',
 'mol_vol_divi_hfus',
 'mp_divi_atom_mass',
 'therm_cond_subs_mp'

In [51]:
# Documentation says this should exist...
jarvis.ai.descriptors.elemental.get_element_fraction_desc(formula='SiO2', max_nelements=103)

AttributeError: module 'jarvis.ai.descriptors' has no attribute 'elemental'

## Using with CBFV

***Authors: Steven Kauwe, Andrew Falkowski, Anthony Wang***

Following tutorial here:
    
    https://pypi.org/project/CBFV/

Repo here:

    https://github.com/Kaaiian/CBFV/blob/master/README.md

In [37]:
formula_unitcell

Unnamed: 0,Formula,"Unit cell volume, A^3"
0,CeGa,210.590
1,CeGa,212.790
2,CeGa,212.790
3,CeGa,215.020
4,CeGa,215.030
...,...,...
1447,NiPt,26.500
1448,NiPt,26.473
1449,SnYb,54.147
1450,SnYb,54.800


In [40]:
import pandas as pd

data = {
    'formula': formula_unitcell['Formula'].tolist(),
    'target': formula_unitcell['Unit cell volume, A^3'].tolist()
}

df = pd.DataFrame(data)
print(df)

     formula   target
0       CeGa  210.590
1       CeGa  212.790
2       CeGa  212.790
3       CeGa  215.020
4       CeGa  215.030
...      ...      ...
1447    NiPt   26.500
1448    NiPt   26.473
1449    SnYb   54.147
1450    SnYb   54.800
1451    SnYb   54.000

[1452 rows x 2 columns]


In [44]:
from CBFV import composition
X, y, formulae, skipped = composition.generate_features(formula_unitcell,
                                                        elem_prop='JARVIS',
                                                        drop_duplicates=False,
                                                        extend_features=True,
                                                        sum_feat=True)

ValueError: list.remove(x): x not in list

## Discrepancy with # of features...

3066 with CBFV vs. 1557 + unknown from element & coloumb matrix features

see documentation: 
    
    https://jarvis-tools.readthedocs.io/en/master/autoapi/jarvis/ai/descriptors/index.html

In [68]:
X #features

Unnamed: 0,sum_C-0,sum_C-1,sum_C-10,sum_C-11,sum_C-12,sum_C-13,sum_C-14,sum_C-15,sum_C-16,sum_C-17,...,mode_voro_coord_subs_atom_mass,mode_voro_coord_subs_atom_rad,mode_voro_coord_subs_bp,mode_voro_coord_subs_elec_aff,mode_voro_coord_subs_first_ion_en,mode_voro_coord_subs_hfus,mode_voro_coord_subs_mol_vol,mode_voro_coord_subs_mp,mode_voro_coord_subs_polzbl,mode_voro_coord_subs_therm_cond
0,749.1,361.7,0.0,0.0,333.0,333.0,797.5,0.0,0.0,0.0,...,-86.0,8.65,-4526.0,9.494,3.25381,9.785,1.68,-2418.0,-2.4,-39.0
1,175.8,124.2,0.0,0.0,124.2,124.2,175.8,0.0,0.0,0.0,...,-150.5,10.25,-3188.0,10.816,4.27362,11.864,-7.01,-1668.0,-12.5,-388.0
2,188.5,176.2,0.0,0.0,106.9,106.9,135.4,0.0,0.0,0.0,...,-102.411,8.45,-1030.0,10.0,1.00618,9.9379,-3.0,-584.22,2.68,-87.0


In [69]:
y #target

0    248.5390
1     66.8444
2     91.5034
Name: target, dtype: float64

In [70]:
formulae #Formulas

0     Tc1V1
1    Cu1Dy1
2     Cd3N2
Name: formula, dtype: object