In [205]:
import json
import re
from urllib.request import urlopen

import numpy as np
import pandas as pd
import sklearn as sk
from matplotlib import pyplot

### Getting training data

In [242]:
# Directory holding one JSON export per compound listed below.
MP_json_dir = "../data/training/materialsproject_json"

# Each name corresponds to a "<name>.json" file inside MP_json_dir.
training_compounds = ["Si", "Ge", "SiGe"]

In [243]:
# Merge the per-compound JSON files into a single dict keyed by entry ID.
training_data = dict()
for training_compound in training_compounds:
    # JSON text is UTF-8; passing the encoding explicitly keeps the read
    # independent of the platform's locale default.
    with open(f"{MP_json_dir}/{training_compound}.json", encoding="utf-8") as fname:
        training_compound_results = dict(json.load(fname))
    # Entries loaded later replace earlier ones that share the same ID.
    training_data.update(training_compound_results)

In [244]:
# Keys of training_data in insertion order, kept as a list so entries can be picked by position.
training_data_IDs = [*training_data]

In [245]:
# Display the last entry of training_data to inspect which fields a record carries.
training_data[training_data_IDs[-1]]

{'formula': 'Si3Ge',
 'spacegroup': 'Fm3m',
 'formation_energy__eV': 0.525,
 'E_above_hull__eV': 0.525,
 'band_gap__eV': 0.0,
 'has_bandstructure': True,
 'volume': 63.538999999999994,
 'Nsites': 4,
 'theoretical': True,
 'count': nan,
 'density__gm_per_cc': 4.1,
 'crystal_system': 'cubic'}

### Extract stoichiometry for each result

In [251]:
def get_stoichiomertry(formula):
    """Parse a flat chemical formula into per-element atom counts.

    The formula is read as a sequence of element symbols (one uppercase
    letter optionally followed by one lowercase letter), each optionally
    followed by an integer count. A missing count means 1, and the counts of
    a symbol that occurs more than once are added together.

    Parameters
    ----------
    formula : str
        Formula such as ``"Si3Ge"``, ``"SiO2"`` or ``"H2O"``.

    Returns
    -------
    dict
        Maps each element symbol to its integer atom count, in order of first
        appearance. An empty formula gives an empty dict.

    Raises
    ------
    ValueError
        If the formula contains anything other than element symbols and
        integer counts (e.g. parentheses, fractional counts, leading digits).
    """
    # Validate the whole string first: findall alone would silently skip
    # characters it cannot match and return a wrong composition.
    if re.fullmatch(r"(?:[A-Z][a-z]?\d*)*", formula) is None:
        raise ValueError(f"Cannot parse chemical formula: {formula!r}")

    stoichiometry = dict()
    for symbol, count in re.findall(r"([A-Z][a-z]?)(\d*)", formula):
        # An absent count (empty string) stands for a single atom.
        atoms = int(count) if count else 1
        stoichiometry[symbol] = stoichiometry.get(symbol, 0) + atoms

    return stoichiometry

def get_norm_stoichiomertry(formula):
    """Return the composition of ``formula`` as atomic fractions.

    The atom counts produced by ``get_stoichiomertry`` are divided by their
    sum, so the returned values add up to 1 for a non-empty composition.

    Parameters
    ----------
    formula : str
        Chemical formula, passed unchanged to ``get_stoichiomertry``.

    Returns
    -------
    dict
        Maps each element symbol to its count divided by the total count.
    """
    counts = get_stoichiomertry(formula)
    total = sum(counts.values())
    fractions = {}
    for symbol, count in counts.items():
        fractions[symbol] = count / total
    return fractions

In [252]:
# Attach the normalised composition to every record under the "stoichiometry" key.
for ID, result in training_data.items():
    norm_stoichiometry = get_norm_stoichiomertry(result["formula"])
    # `result` is the very dict stored at training_data[ID], so this updates the record in place.
    result["stoichiometry"] = norm_stoichiometry

In [254]:
# Display the last entry again; it should now carry the "stoichiometry" field set by the loop above.
training_data[training_data_IDs[-1]]

{'formula': 'Si3Ge',
 'spacegroup': 'Fm3m',
 'formation_energy__eV': 0.525,
 'E_above_hull__eV': 0.525,
 'band_gap__eV': 0.0,
 'has_bandstructure': True,
 'volume': 63.538999999999994,
 'Nsites': 4,
 'theoretical': True,
 'count': nan,
 'density__gm_per_cc': 4.1,
 'crystal_system': 'cubic',
 'stoichiometry': {'Si': 0.75, 'Ge': 0.25}}

In [255]:
# Use the periodic table published in the Exabyte.io periodic-table.js GitHub repository.
# The context manager closes the HTTP response as soon as the payload has been read.
with urlopen("https://raw.githubusercontent.com/Exabyte-io/periodic-table.js/master/periodic-table.json") as periodic_table_url:
    periodic_table = json.loads(periodic_table_url.read())

# Reverse lookup: chemical symbol (e.g. "Si") -> the key of that element in periodic_table.
symbol_to_element_map = { details["symbol"]:element for (element, details) in periodic_table.items()}

In [264]:
# Take the last training entry and translate its element symbols into periodic_table keys.
sample = training_data[training_data_IDs[-1]]
sample_atoms = [*sample["stoichiometry"]]
print(sample_atoms)
sample_atoms_elements = []
for symbol in sample_atoms:
    sample_atoms_elements.append(symbol_to_element_map[symbol])
print(sample_atoms_elements)

['Si', 'Ge']
['Silicon', 'Germanium']


In [267]:
# Display the periodic_table record of the first element found in the sample entry.
periodic_table[sample_atoms_elements[0]]

{'symbol': 'Si',
 'atomic_number': 14,
 'atomic_weight': 28.0855,
 'density_g_per_cm3': 2.33,
 'melting_point_K': 1683,
 'boiling_point_K': 2628,
 'atomic_radius_pm': 132,
 'covalent_radius_pm': 111,
 'ionic_radius_pm': '',
 'atomic_volume_cm3_per_mol': 12.1,
 'specific_heat_J_g_mol': 0.703,
 'fusion_heat_kJ_mol': 50.6,
 'evaporation_heat_kJ_mol': 383,
 'thermal_conductivity_25C_W_m_K': 149,
 'pauling_negativity': 1.9,
 'first_ionizing_kJ_mol': 786,
 'oxidation_states': '4, -4',
 'electronic_configuration': '[Ne]3s²3p²',
 'lattice_structure': 'DIA',
 'lattice_constant_ang': 5.43}

In [34]:
# Encode each crystal system name as an integer ID: its position in the list below.
crystal_systems = [
    "triclinic", "monoclinic", "orthorhombic", "tetragonal",
    "hexagonal", "trigonal", "cubic",
]
crystal_system_mapping = {
    system_name: system_id for system_id, system_name in enumerate(crystal_systems)
}