# This notebook takes the combined data and the references produced by get_references and post-processes the recorded data. Post-processing includes:
1. Cleans numerical value strings. 
2. Normalizes recorded compositions.
3. Generates classifiers used for visualizations.
4. Calculates density and Young modulus.

In [1]:
import os
import re
import pandas as pd
import numpy as np
import pymatgen as mg
from utils import *

In [2]:
# Load the combined measurements and the reference table from disk.
combined_raw = pd.read_csv('combined_data.csv')
df_refs = pd.read_csv('references/references.csv')

# Left-join reference metadata onto every record by reference ID, so records
# without a matching reference row are kept.
df = pd.merge(combined_raw, df_refs, how='left', on='IDENTIFIER: Reference ID')

# After the merge the doi column appears twice (suffixed _x / _y); expose the
# right-hand (references table) value under the unsuffixed name.
df['REFERENCE: doi'] = df['REFERENCE: doi_y']
df.head()

Unnamed: 0.1,Unnamed: 0,REFERENCE: tag,REFERENCE: doi_x,FORMULA,PROPERTY: Type of phases,PROPERTY: Single/Multiphase,PROPERTY: synthesis method,PROPERTY: grain size ($\mu$m),PROPERTY: ROM Density (g/cm$^3$),PROPERTY: HV,...,Internal Reference #,Original DOI,PROPERTY: Exp. Density (g/cm$^3$),Unnamed: 22,IDENTIFIER: Reference ID,REFERENCE: title,REFERENCE: year,REFERENCE: doi_y,REFERENCE: url,REFERENCE: doi
0,0,4,10.1016/j.jmmm.2014.07.023,Al0.25CoFeNi,FCC,S,AC,,7.9,138.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
1,1,4,10.1016/j.jmmm.2014.07.023,Al0.5CoFeNi,FCC+BCC,M,AC,,7.4,212.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
2,2,4,10.1016/j.jmmm.2014.07.023,Al0.75CoFeNi,FCC+BCC,M,AC,,7.0,385.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
3,3,4,10.1016/j.jmmm.2014.07.023,AlCoFeNi,BCC,S,AC,,6.6,456.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
4,4,4,10.1016/j.jmmm.2014.07.023,CoFeNi,FCC,S,AC,,8.5,125.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023


In [3]:
# remove uncertainties / clean numerical values
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and applymap is
# deprecated; use whichever the installed pandas provides so the cell runs
# without warnings on both old and new versions.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = elementwise(remove_uncertainty)

# Columns passed through average_range (helper from utils; presumably collapses
# a recorded range into a single value -- confirm against utils).
# NOTE: the column labels contain LaTeX markup. Backslashes are escaped
# explicitly because '\m' and '\c' are invalid escape sequences in a normal
# string literal; the resulting label text is unchanged.
for range_col in ('PROPERTY: grain size ($\\mu$m)', 'PROPERTY: Elongation (%)'):
    df[range_col] = df[range_col].apply(average_range)

# process columns to ensure data types are accurate
numeric_props = [
    'PROPERTY: grain size ($\\mu$m)',
    'PROPERTY: ROM Density (g/cm$^3$)',
    'PROPERTY: HV',
    'PROPERTY: Test temperature ($^\\circ$C)',
    'PROPERTY: YS (MPa)',
    'PROPERTY: UTS (MPa)',
    'PROPERTY: Elongation (%)',
    'PROPERTY: Exp. Young modulus (GPa)',
]

# pd.to_numeric is applied column by column; with its default error handling a
# value that cannot be parsed raises, which surfaces uncleaned entries early.
df[numeric_props] = df[numeric_props].apply(pd.to_numeric)

df.head()

Unnamed: 0.1,Unnamed: 0,REFERENCE: tag,REFERENCE: doi_x,FORMULA,PROPERTY: Type of phases,PROPERTY: Single/Multiphase,PROPERTY: synthesis method,PROPERTY: grain size ($\mu$m),PROPERTY: ROM Density (g/cm$^3$),PROPERTY: HV,...,Internal Reference #,Original DOI,PROPERTY: Exp. Density (g/cm$^3$),Unnamed: 22,IDENTIFIER: Reference ID,REFERENCE: title,REFERENCE: year,REFERENCE: doi_y,REFERENCE: url,REFERENCE: doi
0,0,4,10.1016/j.jmmm.2014.07.023,Al0.25CoFeNi,FCC,S,AC,,7.9,138.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
1,1,4,10.1016/j.jmmm.2014.07.023,Al0.5CoFeNi,FCC+BCC,M,AC,,7.4,212.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
2,2,4,10.1016/j.jmmm.2014.07.023,Al0.75CoFeNi,FCC+BCC,M,AC,,7.0,385.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
3,3,4,10.1016/j.jmmm.2014.07.023,AlCoFeNi,BCC,S,AC,,6.6,456.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023
4,4,4,10.1016/j.jmmm.2014.07.023,CoFeNi,FCC,S,AC,,8.5,125.0,...,,,,,27,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023


In [4]:
# ensure formulas are valid
df['FORMULA'] = df['FORMULA'].apply(normalize_and_alphabetize_formula)

# add classifiers
# categorize_phases receives the whole row (axis=1); the other classifiers work
# on a single column value.
df['PROPERTY: BCC/FCC/other'] = df.apply(categorize_phases, axis=1)
# The processing method is derived in two passes: first standardize the raw
# synthesis-method string, then classify the standardized value.
df['PROPERTY: Processing method'] = df['PROPERTY: synthesis method'].apply(standardize_synthesis_method)
df['PROPERTY: Processing method'] = df['PROPERTY: Processing method'].apply(classify_processing_method)
df['PROPERTY: Microstructure'] = df['PROPERTY: Type of phases'].apply(classify_microstructure)

# calculate props
df['PROPERTY: Calculated Density (g/cm$^3$)'] = df['FORMULA'].apply(calculate_density)
# Young modulus is only calculated for rows whose 'PROPERTY: BCC/FCC/other'
# label is not 'other'. Skipped rows get NaN rather than an empty string so the
# column stays numeric instead of mixing strings and numbers; NaN is still
# written to CSV as an empty field by default.
df['PROPERTY: Calculated Young modulus (GPa)'] = df.apply(
    lambda row: calculate_youngs_modulus(row['FORMULA'])
    if row['PROPERTY: BCC/FCC/other'] != 'other'
    else np.nan,
    axis=1,
)

# Create the output directory if needed so the export does not fail on a fresh
# checkout where 'stats_and_tables' does not exist yet.
stats_path = os.path.abspath('stats_and_tables/MPEA_dataset_for_stats.csv')
os.makedirs(os.path.dirname(stats_path), exist_ok=True)
df.to_csv(stats_path, index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,REFERENCE: tag,REFERENCE: doi_x,FORMULA,PROPERTY: Type of phases,PROPERTY: Single/Multiphase,PROPERTY: synthesis method,PROPERTY: grain size ($\mu$m),PROPERTY: ROM Density (g/cm$^3$),PROPERTY: HV,...,REFERENCE: title,REFERENCE: year,REFERENCE: doi_y,REFERENCE: url,REFERENCE: doi,PROPERTY: BCC/FCC/other,PROPERTY: Processing method,PROPERTY: Microstructure,PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: Calculated Young modulus (GPa)
0,0,4,10.1016/j.jmmm.2014.07.023,Al0.25 Co1 Fe1 Ni1,FCC,S,AC,,7.9,138.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,FCC,CAST,FCC,7.9,192.0
1,1,4,10.1016/j.jmmm.2014.07.023,Al0.5 Co1 Fe1 Ni1,FCC+BCC,M,AC,,7.4,212.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,other,CAST,FCC+BCC,7.4,
2,2,4,10.1016/j.jmmm.2014.07.023,Al0.75 Co1 Fe1 Ni1,FCC+BCC,M,AC,,7.0,385.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,other,CAST,FCC+BCC,7.0,
3,3,4,10.1016/j.jmmm.2014.07.023,Al1 Co1 Fe1 Ni1,BCC,S,AC,,6.6,456.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,BCC,CAST,BCC,6.6,162.0
4,4,4,10.1016/j.jmmm.2014.07.023,Co1 Fe1 Ni1,FCC,S,AC,,8.5,125.0,...,Effects of Al and Si addition on the structure...,2014,10.1016/j.jmmm.2014.07.023,http://dx.doi.org/10.1016/j.jmmm.2014.07.023,10.1016/j.jmmm.2014.07.023,FCC,CAST,FCC,8.5,207.0


In [5]:
# Columns kept in the exported dataset, listed in output order.
cols = [
    'IDENTIFIER: Reference ID',
    'FORMULA',
    'PROPERTY: Microstructure',
    'PROPERTY: Processing method',
    'PROPERTY: BCC/FCC/other',
    'PROPERTY: grain size ($\\mu$m)',
    'PROPERTY: Exp. Density (g/cm$^3$)',
    'PROPERTY: Calculated Density (g/cm$^3$)',
    'PROPERTY: HV',
    'PROPERTY: Type of test',
    'PROPERTY: Test temperature ($^\\circ$C)',
    'PROPERTY: YS (MPa)',
    'PROPERTY: UTS (MPa)',
    'PROPERTY: Elongation (%)',
    'PROPERTY: Elongation plastic (%)',
    'PROPERTY: Exp. Young modulus (GPa)',
    'PROPERTY: Calculated Young modulus (GPa)',
    'PROPERTY: O content (wppm)',
    'PROPERTY: N content (wppm)',
    'PROPERTY: C content (wppm)',
    'REFERENCE: doi',
    'REFERENCE: year',
    'REFERENCE: title',
]

# Restrict the frame to those columns, write it to MPEA_dataset.csv (resolved
# against the current working directory, without the index), and display it.
df_output = df.loc[:, cols]
df_output.to_csv(os.path.abspath('MPEA_dataset.csv'), index=False)
df_output

Unnamed: 0,IDENTIFIER: Reference ID,FORMULA,PROPERTY: Microstructure,PROPERTY: Processing method,PROPERTY: BCC/FCC/other,PROPERTY: grain size ($\mu$m),PROPERTY: Exp. Density (g/cm$^3$),PROPERTY: Calculated Density (g/cm$^3$),PROPERTY: HV,PROPERTY: Type of test,...,PROPERTY: Elongation (%),PROPERTY: Elongation plastic (%),PROPERTY: Exp. Young modulus (GPa),PROPERTY: Calculated Young modulus (GPa),PROPERTY: O content (wppm),PROPERTY: N content (wppm),PROPERTY: C content (wppm),REFERENCE: doi,REFERENCE: year,REFERENCE: title
0,27,Al0.25 Co1 Fe1 Ni1,FCC,CAST,FCC,,,7.9,138.0,C,...,,,,192,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
1,27,Al0.5 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.4,212.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
2,27,Al0.75 Co1 Fe1 Ni1,FCC+BCC,CAST,other,,,7.0,385.0,C,...,,,,,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
3,27,Al1 Co1 Fe1 Ni1,BCC,CAST,BCC,,,6.6,456.0,C,...,,,,162,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
4,27,Co1 Fe1 Ni1,FCC,CAST,FCC,,,8.5,125.0,C,...,,,,207,,,,10.1016/j.jmmm.2014.07.023,2014,Effects of Al and Si addition on the structure...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,266,Hf1 Nb1 Ta1 Ti1,BCC,CAST,BCC,,,10.9,270.0,,...,,,,119,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1546,266,Mo1 Nb1 Ta1 Ti1 W1,BCC,ANNEAL,BCC,,,11.8,482.0,,...,,,,222,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1547,266,Mo1 Nb1 Ta1 Ti1 W1,BCC,CAST,BCC,,,11.8,446.0,,...,,,,222,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
1548,266,Mo1 Nb1 Ta1 Ti1,BCC,ANNEAL,BCC,,,10.0,407.0,,...,,,,179,,,,10.1016/j.matdes.2018.06.003,2018,"Phase equilibria, mechanical properties and de..."
