In [1]:
from matminer.datasets import load_dataset
import matminer
import pymatgen
import ast
import pandas as pd
import numpy as np
matminer.datasets.get_available_datasets()

boltztrap_mp: Effective mass and thermoelectric properties of 8924 compounds in The  Materials Project database that are calculated by the BoltzTraP software package run on the GGA-PBE or GGA+U density functional theory calculation results. The properties are reported at the temperature of 300 Kelvin and the carrier concentration of 1e18 1/cm3.

brgoch_superhard_training: 2574 materials used for training regressors that predict shear and bulk modulus.

castelli_perovskites: 18,928 perovskites generated with ABX combinatorics, calculating gllbsc band gap and pbe structure, and also reporting absolute band edge positions and heat of formation.

citrine_thermal_conductivity: Thermal conductivity of 872 compounds measured experimentally and retrieved from Citrine database from various references. The reported values are measured at various temperatures of which 295 are at room temperature.

dielectric_constant: 1,056 structures with dielectric properties, calculated with DFPT-PBE.

double_

['boltztrap_mp',
 'brgoch_superhard_training',
 'castelli_perovskites',
 'citrine_thermal_conductivity',
 'dielectric_constant',
 'double_perovskites_gap',
 'double_perovskites_gap_lumo',
 'elastic_tensor_2015',
 'expt_formation_enthalpy',
 'expt_formation_enthalpy_kingsbury',
 'expt_gap',
 'expt_gap_kingsbury',
 'flla',
 'glass_binary',
 'glass_binary_v2',
 'glass_ternary_hipt',
 'glass_ternary_landolt',
 'heusler_magnetic',
 'jarvis_dft_2d',
 'jarvis_dft_3d',
 'jarvis_ml_dft_training',
 'm2ax',
 'matbench_dielectric',
 'matbench_expt_gap',
 'matbench_expt_is_metal',
 'matbench_glass',
 'matbench_jdft2d',
 'matbench_log_gvrh',
 'matbench_log_kvrh',
 'matbench_mp_e_form',
 'matbench_mp_gap',
 'matbench_mp_is_metal',
 'matbench_perovskites',
 'matbench_phonons',
 'matbench_steels',
 'mp_all_20181018',
 'mp_nostruct_20181018',
 'phonon_dielectric_mp',
 'piezoelectric_tensor',
 'ricci_boltztrap_mp_tabular',
 'steel_strength',
 'wolverton_oxides']

In [2]:
# data set load

data = load_dataset("glass_ternary_hipt")

Reading file /opt/netapps/anaconda/2019.10/lib/python3.7/site-packages/matminer/datasets/glass_ternary_hipt.json.gz: 0it [00:00, ?it/s]0, ?it/s]


In [3]:
data

Unnamed: 0,formula,system,processing,phase,gfa
0,Co8.040000Fe16.187000Zr75.773000,CoFeZr,sputtering,CR,0
1,Co7.831000Fe14.814000Zr77.355000,CoFeZr,sputtering,CR,0
2,Co7.613000Fe13.548000Zr78.839000,CoFeZr,sputtering,CR,0
3,Co7.387000Fe12.380000Zr80.233000,CoFeZr,sputtering,CR,0
4,Co7.157000Fe11.308000Zr81.535000,CoFeZr,sputtering,CR,0
...,...,...,...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,FeTiNb,sputtering,CR,0
5166,Fe58.032000Ti28.646000Nb13.321000,FeTiNb,sputtering,CR,0
5167,Fe57.965000Ti27.980000Nb14.056000,FeTiNb,sputtering,CR,0
5168,Fe57.861000Ti27.313000Nb14.826000,FeTiNb,sputtering,CR,0


In [4]:
df = data
df.info()
df['formula']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5170 entries, 0 to 5169
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   formula     5170 non-null   object
 1   system      5170 non-null   object
 2   processing  5170 non-null   object
 3   phase       5170 non-null   object
 4   gfa         5170 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 242.3+ KB


0        Co8.040000Fe16.187000Zr75.773000
1        Co7.831000Fe14.814000Zr77.355000
2        Co7.613000Fe13.548000Zr78.839000
3        Co7.387000Fe12.380000Zr80.233000
4        Co7.157000Fe11.308000Zr81.535000
                      ...                
5165    Fe58.065000Ti29.314000Nb12.621000
5166    Fe58.032000Ti28.646000Nb13.321000
5167    Fe57.965000Ti27.980000Nb14.056000
5168    Fe57.861000Ti27.313000Nb14.826000
5169    Fe57.720000Ti26.647000Nb15.632000
Name: formula, Length: 5170, dtype: object

In [5]:
df

Unnamed: 0,formula,system,processing,phase,gfa
0,Co8.040000Fe16.187000Zr75.773000,CoFeZr,sputtering,CR,0
1,Co7.831000Fe14.814000Zr77.355000,CoFeZr,sputtering,CR,0
2,Co7.613000Fe13.548000Zr78.839000,CoFeZr,sputtering,CR,0
3,Co7.387000Fe12.380000Zr80.233000,CoFeZr,sputtering,CR,0
4,Co7.157000Fe11.308000Zr81.535000,CoFeZr,sputtering,CR,0
...,...,...,...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,FeTiNb,sputtering,CR,0
5166,Fe58.032000Ti28.646000Nb13.321000,FeTiNb,sputtering,CR,0
5167,Fe57.965000Ti27.980000Nb14.056000,FeTiNb,sputtering,CR,0
5168,Fe57.861000Ti27.313000Nb14.826000,FeTiNb,sputtering,CR,0


### All function for sepration of clean formula 

In [6]:
import re

def extract_element_concentration(formula):
    """ formula convert simple format"""
    element_concentration_pattern = re.compile(r"([A-Z][a-z]*)([0-9]*)")
    elem_conc_pairs = element_concentration_pattern.findall(formula)
    result = [(elem, conc) if conc else (elem, '1') for elem, conc in elem_conc_pairs]
    return result

def process_chemical_formula(formula):
    """ formula convert in set"""
    # Handle parentheses
    formula = re.sub(r"\((.*?)\)([0-9]*)", lambda x: x.group(1)*int(x.group(2) or 1), formula)
    return extract_element_concentration(formula)
def consolidate_element_quantities(elem_conc_pairs):
    """ formula convert in list"""
    consolidated_dict = {}
    
    for elem, conc in elem_conc_pairs:
        if elem in consolidated_dict:
            consolidated_dict[elem] += int(conc)
        else:
            consolidated_dict[elem] = int(conc)
    
    consolidated_list = [(elem, str(conc)) for elem, conc in consolidated_dict.items()]
    return consolidated_list
def formula_list(clean_formula_1):
    """formula convert process in amount"""
    elem_conc_pairs = process_chemical_formula(clean_formula_1)

    element_concentration_pairs = consolidate_element_quantities(elem_conc_pairs)

    element_concentration_dict = {element: float(concentration) for element, concentration in element_concentration_pairs}
    return element_concentration_dict

def composition(composition_string):
    element_dict = {}

# Iterate over the string and extract elements and values
    i = 0
    while i < len(composition_string):
        element = ''
        value = ''
    
    # Extract element symbol
        while i < len(composition_string) and not composition_string[i].isdigit():
            element += composition_string[i]
            i += 1
    
    # Extract element value
        while i < len(composition_string) and (composition_string[i].isdigit() or composition_string[i] == '.'):
            value += composition_string[i]
            i += 1
    
    # Add the element and its value to the dictionary
        element_dict[element] = float(value)
    return element_dict

In [7]:
df['composition'] = df['formula'].apply(composition) # applying function for selected columns

In [8]:
df['composition1'] = df['system'].apply(formula_list)

In [9]:
df

Unnamed: 0,formula,system,processing,phase,gfa,composition,composition1
0,Co8.040000Fe16.187000Zr75.773000,CoFeZr,sputtering,CR,0,"{'Co': 8.04, 'Fe': 16.187, 'Zr': 75.773}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}"
1,Co7.831000Fe14.814000Zr77.355000,CoFeZr,sputtering,CR,0,"{'Co': 7.831, 'Fe': 14.814, 'Zr': 77.355}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}"
2,Co7.613000Fe13.548000Zr78.839000,CoFeZr,sputtering,CR,0,"{'Co': 7.613, 'Fe': 13.548, 'Zr': 78.839}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}"
3,Co7.387000Fe12.380000Zr80.233000,CoFeZr,sputtering,CR,0,"{'Co': 7.387, 'Fe': 12.38, 'Zr': 80.233}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}"
4,Co7.157000Fe11.308000Zr81.535000,CoFeZr,sputtering,CR,0,"{'Co': 7.157, 'Fe': 11.308, 'Zr': 81.535}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}"
...,...,...,...,...,...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,FeTiNb,sputtering,CR,0,"{'Fe': 58.065, 'Ti': 29.314, 'Nb': 12.621}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}"
5166,Fe58.032000Ti28.646000Nb13.321000,FeTiNb,sputtering,CR,0,"{'Fe': 58.032, 'Ti': 28.646, 'Nb': 13.321}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}"
5167,Fe57.965000Ti27.980000Nb14.056000,FeTiNb,sputtering,CR,0,"{'Fe': 57.965, 'Ti': 27.98, 'Nb': 14.056}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}"
5168,Fe57.861000Ti27.313000Nb14.826000,FeTiNb,sputtering,CR,0,"{'Fe': 57.861, 'Ti': 27.313, 'Nb': 14.826}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}"


### counting of elements

In [10]:
all_elements = set()
for comp_dict in df["composition"]:
    all_elements.update(comp_dict.keys())

In [11]:
all_elements

{'Co', 'Fe', 'Nb', 'Ti', 'V', 'Zr'}

In [12]:
X_formula = df


In [13]:
for elem in all_elements:
    X_formula[elem] = X_formula["composition"].map(lambda comp_dict: comp_dict.get(elem,0.0))

In [14]:
formula_df = X_formula.iloc[:,7:]

In [15]:
formula_df # new processed data 

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti
0,8.040,75.773,0.0,16.187,0.000,0.000
1,7.831,77.355,0.0,14.814,0.000,0.000
2,7.613,78.839,0.0,13.548,0.000,0.000
3,7.387,80.233,0.0,12.380,0.000,0.000
4,7.157,81.535,0.0,11.308,0.000,0.000
...,...,...,...,...,...,...
5165,0.000,0.000,0.0,58.065,12.621,29.314
5166,0.000,0.000,0.0,58.032,13.321,28.646
5167,0.000,0.000,0.0,57.965,14.056,27.980
5168,0.000,0.000,0.0,57.861,14.826,27.313


In [16]:
X_system = df

In [17]:
for elem in all_elements:
    X_system[elem] = X_system["composition1"].map(lambda comp_dict: comp_dict.get(elem,0.0))

In [18]:
sys_df = X_formula.iloc[:,7:]

In [19]:
sys_df # new data for system (e.g (Fe,Co,Zr) --> (0,1,1,0,1,0))

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti
0,1.0,1.0,0.0,1.0,0.0,0.0
1,1.0,1.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0
4,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...
5165,0.0,0.0,0.0,1.0,1.0,1.0
5166,0.0,0.0,0.0,1.0,1.0,1.0
5167,0.0,0.0,0.0,1.0,1.0,1.0
5168,0.0,0.0,0.0,1.0,1.0,1.0


In [20]:
formula_df

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti
0,8.040,75.773,0.0,16.187,0.000,0.000
1,7.831,77.355,0.0,14.814,0.000,0.000
2,7.613,78.839,0.0,13.548,0.000,0.000
3,7.387,80.233,0.0,12.380,0.000,0.000
4,7.157,81.535,0.0,11.308,0.000,0.000
...,...,...,...,...,...,...
5165,0.000,0.000,0.0,58.065,12.621,29.314
5166,0.000,0.000,0.0,58.032,13.321,28.646
5167,0.000,0.000,0.0,57.965,14.056,27.980
5168,0.000,0.000,0.0,57.861,14.826,27.313


In [21]:
formula_df[all_elements].sum(axis=1)

0       100.000
1       100.000
2       100.000
3       100.000
4       100.000
         ...   
5165    100.000
5166     99.999
5167    100.001
5168    100.000
5169     99.999
Length: 5170, dtype: float64

In [22]:
processed_formula_df = pd.concat([formula_df[all_elements],df[["processing","phase","gfa"]]],axis=1)

In [23]:
processed_formula_df # combind with old data

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti,processing,phase,gfa
0,8.040,75.773,0.0,16.187,0.000,0.000,sputtering,CR,0
1,7.831,77.355,0.0,14.814,0.000,0.000,sputtering,CR,0
2,7.613,78.839,0.0,13.548,0.000,0.000,sputtering,CR,0
3,7.387,80.233,0.0,12.380,0.000,0.000,sputtering,CR,0
4,7.157,81.535,0.0,11.308,0.000,0.000,sputtering,CR,0
...,...,...,...,...,...,...,...,...,...
5165,0.000,0.000,0.0,58.065,12.621,29.314,sputtering,CR,0
5166,0.000,0.000,0.0,58.032,13.321,28.646,sputtering,CR,0
5167,0.000,0.000,0.0,57.965,14.056,27.980,sputtering,CR,0
5168,0.000,0.000,0.0,57.861,14.826,27.313,sputtering,CR,0


In [24]:
processed_sys_df = pd.concat([sys_df[all_elements],df[["processing","phase","gfa"]]],axis=1)

In [25]:
processed_sys_df # combined with old data

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti,processing,phase,gfa
0,1.0,1.0,0.0,1.0,0.0,0.0,sputtering,CR,0
1,1.0,1.0,0.0,1.0,0.0,0.0,sputtering,CR,0
2,1.0,1.0,0.0,1.0,0.0,0.0,sputtering,CR,0
3,1.0,1.0,0.0,1.0,0.0,0.0,sputtering,CR,0
4,1.0,1.0,0.0,1.0,0.0,0.0,sputtering,CR,0
...,...,...,...,...,...,...,...,...,...
5165,0.0,0.0,0.0,1.0,1.0,1.0,sputtering,CR,0
5166,0.0,0.0,0.0,1.0,1.0,1.0,sputtering,CR,0
5167,0.0,0.0,0.0,1.0,1.0,1.0,sputtering,CR,0
5168,0.0,0.0,0.0,1.0,1.0,1.0,sputtering,CR,0


In [26]:
#save for all data into the CSV

In [27]:
processed_sys_df.to_csv("processed_sys_df.csv",index=None)
processed_formula_df.to_csv("processed_formula_df.csv",index=None)

### featurization with matminer 

In [28]:
df

Unnamed: 0,formula,system,processing,phase,gfa,composition,composition1,Co,Zr,V,Fe,Nb,Ti
0,Co8.040000Fe16.187000Zr75.773000,CoFeZr,sputtering,CR,0,"{'Co': 8.04, 'Fe': 16.187, 'Zr': 75.773}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0
1,Co7.831000Fe14.814000Zr77.355000,CoFeZr,sputtering,CR,0,"{'Co': 7.831, 'Fe': 14.814, 'Zr': 77.355}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0
2,Co7.613000Fe13.548000Zr78.839000,CoFeZr,sputtering,CR,0,"{'Co': 7.613, 'Fe': 13.548, 'Zr': 78.839}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0
3,Co7.387000Fe12.380000Zr80.233000,CoFeZr,sputtering,CR,0,"{'Co': 7.387, 'Fe': 12.38, 'Zr': 80.233}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0
4,Co7.157000Fe11.308000Zr81.535000,CoFeZr,sputtering,CR,0,"{'Co': 7.157, 'Fe': 11.308, 'Zr': 81.535}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,FeTiNb,sputtering,CR,0,"{'Fe': 58.065, 'Ti': 29.314, 'Nb': 12.621}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}",0.0,0.0,0.0,1.0,1.0,1.0
5166,Fe58.032000Ti28.646000Nb13.321000,FeTiNb,sputtering,CR,0,"{'Fe': 58.032, 'Ti': 28.646, 'Nb': 13.321}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}",0.0,0.0,0.0,1.0,1.0,1.0
5167,Fe57.965000Ti27.980000Nb14.056000,FeTiNb,sputtering,CR,0,"{'Fe': 57.965, 'Ti': 27.98, 'Nb': 14.056}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}",0.0,0.0,0.0,1.0,1.0,1.0
5168,Fe57.861000Ti27.313000Nb14.826000,FeTiNb,sputtering,CR,0,"{'Fe': 57.861, 'Ti': 27.313, 'Nb': 14.826}","{'Fe': 1.0, 'Ti': 1.0, 'Nb': 1.0}",0.0,0.0,0.0,1.0,1.0,1.0


In [29]:
from matminer.featurizers.conversions import StrToComposition

composition_df = StrToComposition(target_col_id="elements").featurize_dataframe(df[["formula"]], col_id = "formula" )
composition_df

Decoding objects from /opt/netapps/anaconda/2019.10/lib/python3.7/site-packages/matminer/datasets/glass_ternary_hipt.json.gz: 0it [00:01, ?it/s]


StrToComposition:   0%|          | 0/5170 [00:00<?, ?it/s]

Unnamed: 0,formula,elements
0,Co8.040000Fe16.187000Zr75.773000,"(Co, Fe, Zr)"
1,Co7.831000Fe14.814000Zr77.355000,"(Co, Fe, Zr)"
2,Co7.613000Fe13.548000Zr78.839000,"(Co, Fe, Zr)"
3,Co7.387000Fe12.380000Zr80.233000,"(Co, Fe, Zr)"
4,Co7.157000Fe11.308000Zr81.535000,"(Co, Fe, Zr)"
...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,"(Fe, Ti, Nb)"
5166,Fe58.032000Ti28.646000Nb13.321000,"(Fe, Ti, Nb)"
5167,Fe57.965000Ti27.980000Nb14.056000,"(Fe, Ti, Nb)"
5168,Fe57.861000Ti27.313000Nb14.826000,"(Fe, Ti, Nb)"


In [30]:
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf

f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("matminer")])
X_matminer = f.featurize_many(composition_df['elements'])
X_matminer=pd.DataFrame(X_matminer, columns=f.feature_labels())
#print(X_matminer.columns)

f_X = X_matminer[['PymatgenData mean thermal_conductivity','PymatgenData mean melting_point']]

combined_df = pd.concat([df, f_X], axis=1)
combined_df.head()

MultipleFeaturizer:   0%|          | 0/5170 [00:00<?, ?it/s]

Unnamed: 0,formula,system,processing,phase,gfa,composition,composition1,Co,Zr,V,Fe,Nb,Ti,PymatgenData mean thermal_conductivity,PymatgenData mean melting_point
0,Co8.040000Fe16.187000Zr75.773000,CoFeZr,sputtering,CR,0,"{'Co': 8.04, 'Fe': 16.187, 'Zr': 75.773}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0,38.41739,2047.74321
1,Co7.831000Fe14.814000Zr77.355000,CoFeZr,sputtering,CR,0,"{'Co': 7.831, 'Fe': 14.814, 'Zr': 77.355}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0,37.47385,2052.84802
2,Co7.613000Fe13.548000Zr78.839000,CoFeZr,sputtering,CR,0,"{'Co': 7.613, 'Fe': 13.548, 'Zr': 78.839}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0,36.58437,2057.64604
3,Co7.387000Fe12.380000Zr80.233000,CoFeZr,sputtering,CR,0,"{'Co': 7.387, 'Fe': 12.38, 'Zr': 80.233}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0,35.74459,2062.1622
4,Co7.157000Fe11.308000Zr81.535000,CoFeZr,sputtering,CR,0,"{'Co': 7.157, 'Fe': 11.308, 'Zr': 81.535}","{'Co': 1.0, 'Fe': 1.0, 'Zr': 1.0}",1.0,1.0,0.0,1.0,0.0,0.0,34.95645,2066.38844


In [31]:
combined_df.to_csv("featurization_glass_ternary_hipt.csv",index=None) # featurization data seved into csv

#### periodic_data

In [32]:
df = pd.read_csv("periodic_data.csv", skipinitialspace=True)


In [33]:
df["symbol"] =  df["symbol"].map(lambda s:s.strip())
number_df = df.select_dtypes(include=['float64','int'])

In [34]:
df

Unnamed: 0,atomicNumber,symbol,name,atomicMass,cpkHexColor,electronicConfiguration,electronegativity,atomicRadius,ionRadius,vanDelWaalsRadius,ionizationEnergy,electronAffinity,oxidationStates,standardState,bondingType,meltingPoint,boilingPoint,density,groupBlock,yearDiscovered
0,1,H,Hydrogen,1.00794(4),FFFFFF,1s1,2.20,37.0,,120.0,1312.0,-73.0,"-1, 1",gas,diatomic,14.0,20.0,0.000090,nonmetal,1766
1,2,He,Helium,4.002602(2),D9FFFF,1s2,,32.0,,140.0,2372.0,0.0,,gas,atomic,,4.0,0.000179,noble gas,1868
2,3,Li,Lithium,6.941(2),CC80FF,[He] 2s1,0.98,134.0,76 (+1),182.0,520.0,-60.0,1,solid,metallic,454.0,1615.0,0.535000,alkali metal,1817
3,4,Be,Beryllium,9.012182(3),C2FF00,[He] 2s2,1.57,90.0,45 (+2),,900.0,0.0,2,solid,metallic,1560.0,2743.0,1.848000,alkaline earth metal,1798
4,5,B,Boron,10.811(7),FFB5B5,[He] 2s2 2p1,2.04,82.0,27 (+3),,801.0,-27.0,"1, 2, 3",solid,covalent network,2348.0,4273.0,2.460000,metalloid,1807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,114,Fl,Flerovium,[289],,[Rn] 5f14 6d10 7s2 7p2,,,,,,,,,,,,,post-transition metal,1998
114,115,Mc,Moscovium,[288],,[Rn] 5f14 6d10 7s2 7p3,,,,,,,,,,,,,post-transition metal,2003
115,116,Lv,Livermorium,[293],,[Rn] 5f14 6d10 7s2 7p4,,,,,,,,,,,,,post-transition metal,2000
116,117,Ts,Tennessine,[294],,[Rn] 5f14 6d10 7s2 7p5,,,,,,,,,,,,,post-transition metal,2010


In [35]:
number_df.index=df["symbol"]

In [36]:
number_df.head()

Unnamed: 0_level_0,atomicNumber,electronegativity,atomicRadius,vanDelWaalsRadius,ionizationEnergy,electronAffinity,meltingPoint,boilingPoint,density
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
H,1,2.2,37.0,120.0,1312.0,-73.0,14.0,20.0,9e-05
He,2,,32.0,140.0,2372.0,0.0,,4.0,0.000179
Li,3,0.98,134.0,182.0,520.0,-60.0,454.0,1615.0,0.535
Be,4,1.57,90.0,,900.0,0.0,1560.0,2743.0,1.848
B,5,2.04,82.0,,801.0,-27.0,2348.0,4273.0,2.46


In [37]:
missed_values_properties = number_df.loc[all_elements].isna().any()
missed_values_properties

atomicNumber         False
electronegativity    False
atomicRadius         False
vanDelWaalsRadius     True
ionizationEnergy     False
electronAffinity     False
meltingPoint         False
boilingPoint         False
density              False
dtype: bool

In [38]:
available_elemental_properties = missed_values_properties[~missed_values_properties].index.tolist()
available_elemental_properties

['atomicNumber',
 'electronegativity',
 'atomicRadius',
 'ionizationEnergy',
 'electronAffinity',
 'meltingPoint',
 'boilingPoint',
 'density']

In [39]:
elemental_property_name = available_elemental_properties[0]
elemental_property_name

'atomicNumber'

In [40]:
prop_values=number_df.loc[all_elements,elemental_property_name]
prop_values

symbol
Co    27
Zr    40
V     23
Fe    26
Nb    41
Ti    22
Name: atomicNumber, dtype: int64

In [41]:
X = formula_df[all_elements]

In [42]:
X.mul(prop_values, axis=1)

Unnamed: 0,Co,Zr,V,Fe,Nb,Ti
0,217.080,3030.92,0.0,420.862,0.000,0.000
1,211.437,3094.20,0.0,385.164,0.000,0.000
2,205.551,3153.56,0.0,352.248,0.000,0.000
3,199.449,3209.32,0.0,321.880,0.000,0.000
4,193.239,3261.40,0.0,294.008,0.000,0.000
...,...,...,...,...,...,...
5165,0.000,0.00,0.0,1509.690,517.461,644.908
5166,0.000,0.00,0.0,1508.832,546.161,630.212
5167,0.000,0.00,0.0,1507.090,576.296,615.560
5168,0.000,0.00,0.0,1504.386,607.866,600.886


In [43]:
Y = sys_df[all_elements]

In [44]:
Y.mul(prop_values, axis=1).sum()

Co    104328.0
Zr    154560.0
V      29624.0
Fe     67626.0
Nb     53546.0
Ti     56914.0
dtype: float64

In [45]:
X.mul(prop_values, axis=1).sum(axis=1)

0       3668.862
1       3690.801
2       3711.359
3       3730.649
4       3748.647
          ...   
5165    2672.059
5166    2685.205
5167    2698.946
5168    2713.138
5169    2727.866
Length: 5170, dtype: float64

In [46]:
Y.mul(prop_values, axis=1).sum(axis=1)

0       93.0
1       93.0
2       93.0
3       93.0
4       93.0
        ... 
5165    89.0
5166    89.0
5167    89.0
5168    89.0
5169    89.0
Length: 5170, dtype: float64

In [47]:
formula_composition_features_dict={}
for elemental_property_name in available_elemental_properties:
    print(elemental_property_name)
    prop_values=number_df.loc[all_elements,elemental_property_name]
    formula_composition_features_dict["average_"+elemental_property_name] = X.mul(prop_values, axis=1).sum(axis=1)

atomicNumber
electronegativity
atomicRadius
ionizationEnergy
electronAffinity
meltingPoint
boilingPoint
density


In [48]:
formula_composition_features_df = pd.DataFrame(formula_composition_features_dict)
formula_composition_features_df

Unnamed: 0,average_atomicNumber,average_electronegativity,average_atomicRadius,average_ionizationEnergy,average_electronAffinity,average_meltingPoint,average_boilingPoint,average_density
0,3668.862,145.51550,14250.819,66955.801,-3880.245,204774.321,431227.244,692.370441
1,3690.801,144.71405,14286.996,66761.842,-3909.763,205284.802,433662.386,689.999741
2,3711.359,143.96115,14320.910,66579.964,-3936.399,205764.604,435945.230,687.753381
3,3730.649,143.25285,14352.746,66409.180,-3960.401,206216.220,438088.226,685.621483
4,3748.647,142.59035,14382.462,66249.724,-3981.911,206638.844,440088.542,683.610877
...,...,...,...,...,...,...,...,...
5165,2672.059,171.59611,12973.906,71850.413,-2248.958,196761.939,349653.107,697.483978
5166,2685.205,171.62700,12974.833,71841.422,-2303.286,197330.588,350683.505,700.212460
5167,2698.946,171.65475,12976.577,71830.627,-2360.096,197937.795,351790.062,702.982190
5168,2713.138,171.66925,12978.355,71813.762,-2419.316,198572.304,352952.696,705.756025


In [49]:
sys_composition_features_dict={}
for elemental_property_name in available_elemental_properties:
    print(elemental_property_name)
    prop_values=number_df.loc[all_elements,elemental_property_name]
    sys_composition_features_dict["average_"+elemental_property_name] = Y.mul(prop_values, axis=1).sum(axis=1)

atomicNumber
electronegativity
atomicRadius
ionizationEnergy
electronAffinity
meltingPoint
boilingPoint
density


In [50]:
sys_composition_features_df = pd.DataFrame(sys_composition_features_dict)
sys_composition_features_df

Unnamed: 0,average_atomicNumber,average_electronegativity,average_atomicRadius,average_ionizationEnergy,average_electronAffinity,average_meltingPoint,average_boilingPoint,average_density
0,93.0,5.04,399.0,2163.0,-121.0,5707.0,11016.0,23.285
1,93.0,5.04,399.0,2163.0,-121.0,5707.0,11016.0,23.285
2,93.0,5.04,399.0,2163.0,-121.0,5707.0,11016.0,23.285
3,93.0,5.04,399.0,2163.0,-121.0,5707.0,11016.0,23.285
4,93.0,5.04,399.0,2163.0,-121.0,5707.0,11016.0,23.285
...,...,...,...,...,...,...,...,...
5165,89.0,4.97,398.0,2074.0,-110.0,6502.0,11711.0,20.951
5166,89.0,4.97,398.0,2074.0,-110.0,6502.0,11711.0,20.951
5167,89.0,4.97,398.0,2074.0,-110.0,6502.0,11711.0,20.951
5168,89.0,4.97,398.0,2074.0,-110.0,6502.0,11711.0,20.951


In [51]:
df = data

from matminer.featurizers.conversions import StrToComposition

composition_df = StrToComposition(target_col_id="composition").featurize_dataframe(df[["formula"]], col_id = "formula" )

StrToComposition:   0%|          | 0/5170 [00:00<?, ?it/s]

In [52]:
composition_df

Unnamed: 0,formula,composition
0,Co8.040000Fe16.187000Zr75.773000,"(Co, Fe, Zr)"
1,Co7.831000Fe14.814000Zr77.355000,"(Co, Fe, Zr)"
2,Co7.613000Fe13.548000Zr78.839000,"(Co, Fe, Zr)"
3,Co7.387000Fe12.380000Zr80.233000,"(Co, Fe, Zr)"
4,Co7.157000Fe11.308000Zr81.535000,"(Co, Fe, Zr)"
...,...,...
5165,Fe58.065000Ti29.314000Nb12.621000,"(Fe, Ti, Nb)"
5166,Fe58.032000Ti28.646000Nb13.321000,"(Fe, Ti, Nb)"
5167,Fe57.965000Ti27.980000Nb14.056000,"(Fe, Ti, Nb)"
5168,Fe57.861000Ti27.313000Nb14.826000,"(Fe, Ti, Nb)"


In [53]:
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf

In [54]:
f =  MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie")])

In [55]:
X_matminer = f.featurize_many(composition_df['composition'])

MultipleFeaturizer:   0%|          | 0/5170 [00:00<?, ?it/s]

In [56]:
X_matminer=pd.DataFrame(X_matminer, columns=f.feature_labels())

In [57]:
X_matminer

Unnamed: 0,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,3,0.778987,0.760484,0.757799,0.757732,0.757730,26.0,40.0,14.0,36.688620,...,2.110663,0.466150,0.706432,0.000000,194.0,229.0,35.0,199.665450,9.496767,194.0
1,3,0.791491,0.775623,0.773591,0.773551,0.773550,26.0,40.0,14.0,36.908010,...,2.110663,0.433934,0.671340,0.000000,194.0,229.0,35.0,199.184900,8.833618,194.0
2,3,0.803560,0.789957,0.788415,0.788391,0.788390,26.0,40.0,14.0,37.113590,...,2.110663,0.403838,0.636763,0.000000,194.0,229.0,35.0,198.741800,8.198762,194.0
3,3,0.815179,0.803519,0.802345,0.802330,0.802330,26.0,40.0,14.0,37.306490,...,2.110663,0.375686,0.602848,0.000000,194.0,229.0,35.0,198.333000,7.593149,194.0
4,3,0.826260,0.816258,0.815359,0.815350,0.815350,26.0,40.0,14.0,37.486470,...,2.110663,0.349498,0.569926,0.000000,194.0,229.0,35.0,197.957800,7.020504,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5165,3,0.662582,0.606385,0.584464,0.581343,0.580712,22.0,41.0,19.0,26.720590,...,2.110663,1.225563,1.027866,2.110663,194.0,229.0,35.0,218.740100,14.504626,229.0
5166,3,0.660745,0.604879,0.583760,0.580919,0.580376,22.0,41.0,19.0,26.852319,...,2.110663,1.224879,1.028087,2.110663,194.0,229.0,35.0,218.973800,14.308132,229.0
5167,3,0.658810,0.603163,0.582746,0.580153,0.579684,22.0,41.0,19.0,26.989190,...,2.110663,1.223440,1.028547,2.110663,194.0,229.0,35.0,219.207098,14.105751,229.0
5168,3,0.656788,0.601245,0.581423,0.579047,0.578642,22.0,41.0,19.0,27.131380,...,2.110663,1.221257,1.029238,2.110663,194.0,229.0,35.0,219.440450,13.897100,229.0


In [58]:
X_matminer.to_csv("matminer_X.csv", index=None)

In [59]:
sys_composition_features_df.to_csv("sys_composition_features_df.csv", index=None)

In [60]:
formula_composition_features_df.to_csv("formula_composition_features_df.csv", index=None)

In [61]:
data.to_csv("glass_ternary_hipt.csv", index=None)

## brgoch_superhard_training

In [199]:
data = load_dataset("brgoch_superhard_training")


Decoding objects from /opt/netapps/anaconda/2019.10/lib/python3.7/site-packages/matminer/datasets/brgoch_superhard_training.json.gz: 100%|##########| 5114/5114 [3:22:18<00:00,  2.37s/it]
Reading file /opt/netapps/anaconda/2019.10/lib/python3.7/site-packages/matminer/datasets/brgoch_superhard_training.json.gz: 5114it [00:02, 2005.10it/s] 18/5114 [00:02<00:00, 3231.74it/s]


In [200]:
df = data
df

Unnamed: 0,formula,bulk_modulus,shear_modulus,composition,material_id,structure,brgoch_feats,suspect_value
0,AlPt3,225.230461,91.197748,"(Al, Pt)",mp-188,"[[0. 0. 0.] Al, [0. 1.96140395 1.96140...","{'atomic_number_feat_1': 123.5, 'atomic_number...",False
1,Mn2Nb,232.696340,74.590157,"(Mn, Nb)",mp-12659,[[-2.23765223e-08 1.42974191e+00 5.92614104e...,"{'atomic_number_feat_1': 45.5, 'atomic_number_...",False
2,HfO2,204.573433,98.564374,"(Hf, O)",mp-352,"[[2.24450185 3.85793022 4.83390736] O, [2.7788...","{'atomic_number_feat_1': 44.0, 'atomic_number_...",False
3,Cu3Pt,159.312640,51.778816,"(Cu, Pt)",mp-12086,"[[0. 1.86144248 1.86144248] Cu, [1.861...","{'atomic_number_feat_1': 82.5, 'atomic_number_...",False
4,Mg3Pt,69.637565,27.588765,"(Mg, Pt)",mp-18707,"[[0. 0. 2.73626461] Mg, [0. ...","{'atomic_number_feat_1': 57.0, 'atomic_number_...",False
...,...,...,...,...,...,...,...,...
2569,MgAsPt5,181.857661,70.362793,"(Mg, As, Pt)",mp-865146,"[[1.99379423 1.99379423 0. ] Mg, [0. ...","{'atomic_number_feat_1': 145.0, 'atomic_number...",False
2570,YCBr,22.970769,9.733656,"(Y, C, Br)",mp-643367,"[[0.66584097 1.92338676 8.4823332 ] Y, [-0.645...","{'atomic_number_feat_1': 26.6666666666667, 'at...",False
2571,NaTiO2,98.358957,70.565034,"(Na, Ti, O)",mp-7002,"[[ 3.31024633 2.00832854 11.99124842] O, [1.0...","{'atomic_number_feat_1': 16.3333333333333, 'at...",False
2572,KCuSe,34.111565,21.586576,"(K, Cu, Se)",mp-7435,"[[0. 0. 0.] K, [0. 0. 4.903557...","{'atomic_number_feat_1': 27.3333333333333, 'at...",False


In [201]:
#df = densityf.featurize_dataframe(df, "structure")

#df = strtc.featurize_dataframe(df, "formula")

#df = f.featurize_dataframe(df,"composition")

In [202]:
df.columns
df

Unnamed: 0,formula,bulk_modulus,shear_modulus,composition,material_id,structure,brgoch_feats,suspect_value
0,AlPt3,225.230461,91.197748,"(Al, Pt)",mp-188,"[[0. 0. 0.] Al, [0. 1.96140395 1.96140...","{'atomic_number_feat_1': 123.5, 'atomic_number...",False
1,Mn2Nb,232.696340,74.590157,"(Mn, Nb)",mp-12659,[[-2.23765223e-08 1.42974191e+00 5.92614104e...,"{'atomic_number_feat_1': 45.5, 'atomic_number_...",False
2,HfO2,204.573433,98.564374,"(Hf, O)",mp-352,"[[2.24450185 3.85793022 4.83390736] O, [2.7788...","{'atomic_number_feat_1': 44.0, 'atomic_number_...",False
3,Cu3Pt,159.312640,51.778816,"(Cu, Pt)",mp-12086,"[[0. 1.86144248 1.86144248] Cu, [1.861...","{'atomic_number_feat_1': 82.5, 'atomic_number_...",False
4,Mg3Pt,69.637565,27.588765,"(Mg, Pt)",mp-18707,"[[0. 0. 2.73626461] Mg, [0. ...","{'atomic_number_feat_1': 57.0, 'atomic_number_...",False
...,...,...,...,...,...,...,...,...
2569,MgAsPt5,181.857661,70.362793,"(Mg, As, Pt)",mp-865146,"[[1.99379423 1.99379423 0. ] Mg, [0. ...","{'atomic_number_feat_1': 145.0, 'atomic_number...",False
2570,YCBr,22.970769,9.733656,"(Y, C, Br)",mp-643367,"[[0.66584097 1.92338676 8.4823332 ] Y, [-0.645...","{'atomic_number_feat_1': 26.6666666666667, 'at...",False
2571,NaTiO2,98.358957,70.565034,"(Na, Ti, O)",mp-7002,"[[ 3.31024633 2.00832854 11.99124842] O, [1.0...","{'atomic_number_feat_1': 16.3333333333333, 'at...",False
2572,KCuSe,34.111565,21.586576,"(K, Cu, Se)",mp-7435,"[[0. 0. 0.] K, [0. 0. 4.903557...","{'atomic_number_feat_1': 27.3333333333333, 'at...",False


In [203]:
m_df = df['brgoch_feats'].apply(pd.Series)
m_df

Unnamed: 0,atomic_number_feat_1,atomic_number_feat_2,atomic_number_feat_3,atomic_number_feat_4,atomic_weight_feat_1,atomic_weight_feat_2,atomic_weight_feat_3,atomic_weight_feat_4,period_number_feat_1,period_number_feat_2,...,inversion_centre,polar_axis,reduced_volume,density,anisotropy,electron_density,volume_per_atom,valence_electron_density,Gilman_electron_density,outer_shell_electron_density
0,123.500000,221.0,13.0,78.0,306.110770,558.258461,26.981539,195.08000,10.500000,15.0,...,1.0,0.0,57.83,17.58,1.000000,0.224799,14.457360,0.899196,0.415014,0.276676
1,45.500000,9.0,25.0,41.0,101.391240,16.969720,54.938050,92.90638,6.500000,3.0,...,1.0,0.0,152.07,8.86,0.771965,0.315737,12.668777,0.947211,0.631474,0.236803
2,44.000000,56.0,8.0,72.0,105.244400,146.491200,15.999400,178.49000,5.000000,2.0,...,1.0,0.0,141.13,9.91,0.976440,0.283521,11.756914,0.850563,0.680451,0.680451
3,82.500000,9.0,29.0,78.0,192.859000,4.442000,63.546000,195.08000,9.000000,6.0,...,1.0,0.0,48.27,13.27,1.000000,0.435072,12.066970,1.740288,0.414354,0.165742
4,57.000000,42.0,12.0,78.0,133.997500,122.165000,24.305000,195.08000,7.500000,3.0,...,0.0,1.0,450.98,5.92,0.966857,0.159631,18.793352,0.638524,0.266052,0.159631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569,145.000000,378.0,12.0,78.0,358.208863,951.095000,24.305000,195.08000,12.333333,27.0,...,1.0,0.0,105.44,16.93,0.711953,0.161283,15.057811,1.128982,0.664107,0.531286
2570,26.666667,33.0,6.0,39.0,60.273617,76.894850,12.011000,88.90585,3.666667,3.0,...,1.0,0.0,127.63,4.71,0.540494,0.219604,21.250326,0.658813,0.564697,0.611755
2571,16.333333,6.0,8.0,22.0,34.289523,15.881200,15.999400,47.88000,3.666667,0.0,...,1.0,0.0,43.73,3.91,0.458680,0.251784,10.922076,1.007135,0.824019,0.824019
2572,27.333333,15.0,19.0,34.0,60.534767,39.861700,39.098300,78.96000,4.000000,0.0,...,1.0,0.0,144.35,4.18,0.625404,0.249496,24.048492,0.748488,0.374244,0.332661


In [204]:
import pandas as pd
from pymatgen.core.composition import Composition

# Assuming 'df' is your DataFrame and 'composition' is the class you want to encode

# Extract unique elements from the 'composition' class
unique_elements = set()
for comp in df['composition']:
    composition = Composition(comp)  # Convert the composition string to a Composition object
    elements = composition.elements
    unique_elements.update(elements)

unique_elements = sorted(list(unique_elements))

# Create a new DataFrame with one-hot encoding
encoded_df = m_df.copy()
for element in unique_elements:
    encoded_df[element] = df['composition'].apply(lambda x: element in Composition(x).elements).astype(int)

In [205]:
encoded_df.columns#
encoded_df

Unnamed: 0,atomic_number_feat_1,atomic_number_feat_2,atomic_number_feat_3,atomic_number_feat_4,atomic_weight_feat_1,atomic_weight_feat_2,atomic_weight_feat_3,atomic_weight_feat_4,period_number_feat_1,period_number_feat_2,...,Au,C,Se,S,I,Br,N,Cl,O,F
0,123.500000,221.0,13.0,78.0,306.110770,558.258461,26.981539,195.08000,10.500000,15.0,...,0,0,0,0,0,0,0,0,0,0
1,45.500000,9.0,25.0,41.0,101.391240,16.969720,54.938050,92.90638,6.500000,3.0,...,0,0,0,0,0,0,0,0,0,0
2,44.000000,56.0,8.0,72.0,105.244400,146.491200,15.999400,178.49000,5.000000,2.0,...,0,0,0,0,0,0,0,0,1,0
3,82.500000,9.0,29.0,78.0,192.859000,4.442000,63.546000,195.08000,9.000000,6.0,...,0,0,0,0,0,0,0,0,0,0
4,57.000000,42.0,12.0,78.0,133.997500,122.165000,24.305000,195.08000,7.500000,3.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2569,145.000000,378.0,12.0,78.0,358.208863,951.095000,24.305000,195.08000,12.333333,27.0,...,0,0,0,0,0,0,0,0,0,0
2570,26.666667,33.0,6.0,39.0,60.273617,76.894850,12.011000,88.90585,3.666667,3.0,...,0,1,0,0,0,1,0,0,0,0
2571,16.333333,6.0,8.0,22.0,34.289523,15.881200,15.999400,47.88000,3.666667,0.0,...,0,0,0,0,0,0,0,0,1,0
2572,27.333333,15.0,19.0,34.0,60.534767,39.861700,39.098300,78.96000,4.000000,0.0,...,0,0,1,0,0,0,0,0,0,0


In [206]:
encoded_df.to_csv("matminer_bsd_X.csv", index=None)

In [207]:
df.to_csv("brgoch_superhard_training.csv")

Decoding objects from /opt/netapps/anaconda/2019.10/lib/python3.7/site-packages/matminer/datasets/brgoch_superhard_training.json.gz: 100%|##########| 5114/5114 [00:17<00:00, 3231.74it/s]