# Combine and process data
1. Import combined data
2. apply processing functions

In [1]:
import os
import re
import pandas as pd
import numpy as np
import pymatgen as mg

def categorize_phases(row):
    
    if row['PROPERTY: Type of phases'] == 'BCC':
        val = 'BCC'
    elif row['PROPERTY: Type of phases'] == 'FCC':
        val = 'FCC'
    else:
        val = 'other'
    return val


def remove_uncertainty(cell):
    
    val = str(cell)
    
    if '$\pm$' in str(cell):
        val = cell.split('$\pm$')[0]
    if '>' in val:
        val = val.replace('>', '')
    if '<' in val:
        val = val.replace('<', '')
    if 'nan' in val:
        val = ''
        
    return val


def average_range(cell):
    
    val = str(cell).replace(" ", "")
    
    if re.findall(r"\b\d+-\d+\b", val):
        data = [float(x) for x in val.split('-')]
        val = np.average(data)
        
    return val


def neg_elongation(row):
    if row['PROPERTY: Type of test'] == "C" and row['PROPERTY: Elongation (%)'] != "":
        val = -float(row['PROPERTY: Elongation (%)'])
    else:
        val = row['PROPERTY: Elongation (%)']
    return val


def normalize_and_alphabetize_formula(formula):
    if formula:
        try:
            comp = mg.Composition(formula)
            weights = [comp.get_atomic_fraction(ele) for ele in comp.elements]
            normalized_weights = [round(w/max(weights), 3) for w in weights]
            normalized_comp = "".join([str(x)+str(y) for x,y in zip(comp.elements, normalized_weights)])
            
            return mg.Composition(normalized_comp).alphabetical_formula
        except:
            print("INVALID: ", formula)
            return None
    else:
        return None
    
def standardize_synthesis_method(synth_method):
    return synth_method.replace(" ", "")


def compute_density(formula):
    
    comp = mg.Composition(formula)
    weights = [comp.get_atomic_fraction(e)for e in comp.elements]
    values = [e.density_of_solid*10**-3 for e in comp.elements]
    avg = np.average(values, weights=weights)
    
    return round(avg, 1)


def compute_youngs_modulus(formula):
    
    comp = mg.Composition(formula)
    weights = [comp.get_atomic_fraction(e)for e in comp.elements]
    values = [e.youngs_modulus for e in comp.elements]
    if None in values:
        return ''
    avg = np.average(values, weights=weights)
    return int(round(avg, 0))


    
def classify_processing_method(process_method):
    
    if process_method == 'AC' or process_method == 'DC':
        return 'CAST'
    
    if 'SPD' in process_method or 'AM' in process_method or 'HIP' in process_method:
        return 'OTHER'
    
    if 'GA' in process_method or 'MA' in process_method or 'SPS' in process_method or 'VHP' in process_method or process_method == 'S':
        return 'POWDER'
    
    if 'CR' in process_method or 'HR' in process_method or 'HF' in process_method:
        return 'WROUGHT'
    
    if process_method == 'A' or process_method == 'H' or '+A' in process_method or '+H' in process_method or 'Aged' in process_method:
        return 'ANNEAL'


def classify_microstructure(phases):
    
    valid_phases = ['FCC', 'BCC', 'HCP', 'L12', 'B2', 'Laves', 'Laves (C14)', 'Laves (C15)']
    
    if phases == '':
        return ''
    
    phase_list = phases.split('+')
    
    if len(phase_list) == 1 and phase_list[0] in valid_phases:
        return phases
    elif len(phase_list) == 1 and phase_list[0] not in valid_phases:
        return 'Other'
    
    if len(phase_list) > 1 and len(set(phase_list)) == 1 and phase_list[0] in valid_phases:
        return phases
    elif len(phase_list) > 1 and len(set(phase_list)) == 1 and phase_list[0] not in valid_phases:
        return 'Other'
    
    if len(phase_list) > 1 and len(set(phase_list)) > 1 and phase_list[0] in valid_phases:
        
        # if all phases are a subset of valid phases, return all phases
        if set(phase_list).issubset(set(valid_phases)):
            return phases
        else:
            sub_valid = []
            for phase in phase_list:
                if phase in valid_phases:
                    sub_valid.append(phase)
            
            # return valid phases + invalid labeled as "Sec."
            if len(sub_valid) > 0:    
                return '+'.join(sub_valid)+"+Sec."
           
            # no valid phases
            else:
                return 'Other'

In [2]:
df = pd.read_csv('combined_data.csv')
df_refs = pd.read_csv('references/references.csv')
df = df.merge(df_refs, on='IDENTIFIER: Reference ID', how='left')
df['REFERENCE: doi'] = df['REFERENCE: doi_y']

In [3]:
# remove uncertainties
df = df.applymap(remove_uncertainty)
df['PROPERTY: grain size ($\mu$m)'] = df['PROPERTY: grain size ($\mu$m)'].apply(average_range)
df['PROPERTY: Elongation (%)'] = df['PROPERTY: Elongation (%)'].apply(average_range)

In [4]:
# process columns to ensure data types are accurate
numeric_props = ['PROPERTY: grain size ($\mu$m)', 'PROPERTY: ROM Density (g/cm$^3$)', 'PROPERTY: HV', 'PROPERTY: Test temperature ($^\circ$C)',
                'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)', 'PROPERTY: Elongation (%)', 'PROPERTY: Exp. Young modulus (GPa)']

df[numeric_props] = df[numeric_props].apply(pd.to_numeric)

In [5]:
# compute props
df['PROPERTY: Computed Density (g/cm$^3$)'] = df['FORMULA'].apply(compute_density)
df['PROPERTY: Computed Young modulus (GPa)'] = df['FORMULA'].apply(compute_youngs_modulus)

In [6]:
# ensure formulas are valid
df['FORMULA'] = df['FORMULA'].apply(normalize_and_alphabetize_formula)

# add classifiers
df['PROPERTY: BCC/FCC/other'] = df.apply(categorize_phases, axis=1)
df['PROPERTY: Processing method'] = df['PROPERTY: synthesis method'].apply(standardize_synthesis_method)
df['PROPERTY: Processing method'] = df['PROPERTY: Processing method'].apply(classify_processing_method)
df['PROPERTY: Microstructure'] = df['PROPERTY: Type of phases'].apply(classify_microstructure)
df.to_csv(os.path.abspath('stats_and_tables/MPEA_dataset_for_stats.csv'), index=False)
df

Unnamed: 0.1,Unnamed: 0,REFERENCE: tag,REFERENCE: doi_x,FORMULA,PROPERTY: Type of phases,PROPERTY: Single/Multiphase,PROPERTY: synthesis method,PROPERTY: grain size ($\mu$m),PROPERTY: ROM Density (g/cm$^3$),PROPERTY: HV,...,REFERENCE: title,REFERENCE: year,REFERENCE: doi_y,REFERENCE: url,REFERENCE: doi,PROPERTY: Computed Density (g/cm$^3$),PROPERTY: Computed Young modulus (GPa),PROPERTY: BCC/FCC/other,PROPERTY: Processing method,PROPERTY: Microstructure
0,0,4,10.1016/j.jmmm.2014.07.023,Al0.25 Co1 Fe1 Ni1,FCC,S,AC,,7.9,138.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,8.1,196,FCC,CAST,FCC
1,1,4,10.1016/j.jmmm.2014.07.023,Al0.5 Co1 Fe1 Ni1,FCC+BCC,M,AC,,7.4,212.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,7.7,187,other,CAST,FCC+BCC
2,2,4,10.1016/j.jmmm.2014.07.023,Al0.75 Co1 Fe1 Ni1,FCC+BCC,M,AC,,7.0,385.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,7.4,179,other,CAST,FCC+BCC
3,3,4,10.1016/j.jmmm.2014.07.023,Al1 Co1 Fe1 Ni1,BCC,S,AC,,6.6,456.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,7.1,172,BCC,CAST,BCC
4,4,4,10.1016/j.jmmm.2014.07.023,Co1 Fe1 Ni1,FCC,S,AC,,8.5,125.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,8.6,207,FCC,CAST,FCC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1604,955,J135,10.1080/02670836.2018.1446267,Al1 Nb1 Ti1 Zr1,B2 + Zr5Al3,M,A,,,418.0,...,Microstructure and mechanical properties of a ...,2018,10.1080/02670836.2018.1446267,http://dx.doi.org/10.1080/02670836.2018.1446267,10.1080/02670836.2018.1446267,5.6,90,other,ANNEAL,
1605,956,J135-18,10.1016/j.matlet.2015.08.099,Al1 Nb1 Ti1 V1 Zr0.5,,,,,,,...,An AlNbTiVZr0.5 high-entropy alloy combining h...,2015,10.1016/j.matlet.2015.08.099,http://dx.doi.org/10.1016/j.matlet.2015.08.099,10.1016/j.matlet.2015.08.099,5.6,101,other,,
1606,957,J135-19,10.1179/1743284715Y.0000000032,Al1 Nb1 Ti1 Zr1,,,,,,539.6,...,Effect of Al on structure and mechanical prope...,2015,10.1179/1743284715Y.0000000032,http://dx.doi.org/10.1179/1743284715y.0000000032,10.1179/1743284715Y.0000000032,5.6,90,other,,
1607,958,J135-24,10.1016/j.actamat.2012.11.032,Nb1 Ti1 V1 Zr1,,,,,,335.0,...,"Low-density, refractory multi-principal elemen...",2013,10.1016/j.actamat.2012.11.032,http://dx.doi.org/10.1016/j.actamat.2012.11.032,10.1016/j.actamat.2012.11.032,6.4,104,other,,


In [7]:
cols = ['IDENTIFIER: Reference ID', 'FORMULA', 'PROPERTY: Microstructure', 'PROPERTY: Processing method', 'PROPERTY: BCC/FCC/other', 'PROPERTY: grain size ($\\mu$m)',
        'PROPERTY: Exp. Density (g/cm$^3$)', 'PROPERTY: Computed Density (g/cm$^3$)', 'PROPERTY: HV', 'PROPERTY: Type of test', 'PROPERTY: Test temperature ($^\\circ$C)', 'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)', 
        'PROPERTY: Elongation (%)', 'PROPERTY: Elongation plastic (%)', 'PROPERTY: Exp. Young modulus (GPa)', 'PROPERTY: Computed Young modulus (GPa)', 'PROPERTY: O content (wppm)', 'PROPERTY: N content (wppm)', 
        'PROPERTY: C content (wppm)', 'REFERENCE: doi', 'REFERENCE: year', 'REFERENCE: title']

df_output = df[cols]
df_output.to_csv(os.path.abspath('MPEA_dataset.csv'), index=False)