# Combine and process data
1. Import 2018 and 2019 data
2. Combine and process (see processing functions)

In [1]:
import functools
import os
import re

import numpy as np
import pandas as pd
from pymatgen import Composition


def categorize_phases(row):
    """Bucket a row's phase label into ``'BCC'``, ``'FCC'`` or ``'other'``.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            ``'PROPERTY: Type of phases'`` entry.

    Returns:
        ``'BCC'`` or ``'FCC'`` when the entry equals that label exactly,
        otherwise ``'other'``.
    """
    phase = row['PROPERTY: Type of phases']

    # Only an exact single-phase label counts; everything else is "other".
    for label in ('BCC', 'FCC'):
        if phase == label:
            return label
    return 'other'


def remove_uncertainty(cell):
    """Return *cell* as text with uncertainty and bound markers removed.

    The value is converted with ``str`` and then:

    * everything from the first LaTeX plus-minus marker onwards is dropped,
    * every ``>`` and ``<`` character is removed,
    * a value that is exactly ``nan`` (the text form of a missing float)
      becomes the empty string.

    Args:
        cell: Any cell value; it is stringified before processing.

    Returns:
        str: The cleaned text, or ``''`` for a missing value.
    """
    val = str(cell)

    # Keep only the nominal value in front of the "$\pm$" marker. Working on
    # the stringified value (not the raw cell) keeps non-string cells safe.
    val = val.partition('$\\pm$')[0]

    # Bounds such as ">50" or "<0.1" are reduced to the bare number.
    val = val.replace('>', '').replace('<', '')

    # Match the whole value: a substring test would also wipe legitimate text
    # that merely contains "nan" (e.g. "nanocrystalline" or some DOIs).
    if val.strip() == 'nan':
        return ''

    return val


def average_range(cell):
    """Replace a dash-separated numeric range such as ``"10-20"`` by its mean.

    The cell is stringified and its space characters are removed. If the
    result contains an integer range (digits, dash, digits), the whole text
    is split on ``-`` and the pieces are averaged.

    Args:
        cell: Any cell value; it is stringified before processing.

    Returns:
        The NumPy average of the dash-separated numbers when a range is
        present and every piece parses as a float; otherwise the space-free
        string form of *cell*.
    """
    val = str(cell).replace(" ", "")

    if re.search(r"\b\d+-\d+\b", val):
        try:
            data = [float(x) for x in val.split('-')]
        except ValueError:
            # The range was found inside text that is not purely numeric
            # (e.g. "~10-20" or a leading sign as in "-5-10"); leave the
            # value untouched instead of aborting the whole column.
            return val
        return np.average(data)

    return val


def neg_elongation(row):
    """Return the row's elongation, negated when the test type is ``"C"``.

    Args:
        row: Mapping-like record with ``'PROPERTY: Type of test'`` and
            ``'PROPERTY: Elongation (%)'`` entries.

    Returns:
        ``-float(elongation)`` when the test type equals ``"C"`` and the
        elongation is not the empty string; otherwise the elongation entry
        unchanged.
    """
    test_type = row['PROPERTY: Type of test']
    elongation = row['PROPERTY: Elongation (%)']

    # Anything that is not a "C" test, or has no recorded value, passes through.
    if test_type != "C" or elongation == "":
        return elongation

    return -float(elongation)


def normalize_and_alphabetize_formula(formula):
    """Normalize a chemical formula and return it in alphabetical order.

    The formula is parsed with ``Composition``. Each element's atomic
    fraction is divided by the largest fraction and rounded to three
    decimals, the element/amount pairs are joined back into a formula string,
    and that string's ``alphabetical_formula`` is returned.

    Args:
        formula: Formula text. Falsy values (``None``, ``''``) are treated as
            missing.

    Returns:
        The normalized alphabetical formula, or ``None`` when *formula* is
        falsy or could not be processed (in which case it is printed with an
        ``INVALID:`` prefix).
    """
    if not formula:
        return None

    try:
        comp = Composition(formula)
        weights = [comp.get_atomic_fraction(ele) for ele in comp.elements]
        largest = max(weights)
        normalized_weights = [round(w / largest, 3) for w in weights]
        normalized_comp = "".join(
            [str(x) + str(y) for x, y in zip(comp.elements, normalized_weights)]
        )

        return Composition(normalized_comp).alphabetical_formula
    except Exception:
        # Best effort: report the bad entry and keep going. Catching Exception
        # rather than using a bare except lets Ctrl-C and SystemExit through.
        print("INVALID: ", formula)
        return None
    
def standardize_synthesis_method(synth_method):
    """Return *synth_method* with every space character removed.

    Only the plain space character is dropped; other whitespace is kept.
    """
    return "".join(synth_method.split(" "))


def calc_density_from_ROM(formula):
    """Estimate a density for *formula* from its elements' solid densities.

    Each element's ``density_of_solid`` is weighted by the element's atomic
    fraction in the composition; the weighted values are summed and the sum
    is multiplied by ``10**-3``.

    Args:
        formula: Formula accepted by ``Composition``.

    Returns:
        The scaled, atomic-fraction-weighted sum of elemental densities.
        NOTE(review): the 10**-3 factor presumably converts kg/m^3 to
        g/cm^3 -- confirm against the units of ``density_of_solid``.
    """
    comp = Composition(formula)
    elements = comp.elements

    p = np.array([ele.density_of_solid for ele in elements])
    fractions = [comp.get_atomic_fraction(ele) for ele in elements]

    p_sum = (p * fractions).sum() * 10**-3

    # Echo the inputs and the result for each formula processed.
    print(formula, p, p_sum)

    return p_sum

_REFERENCES_CSV = '~/projects/Schmidt-MPEA/data/database_processing/references.csv'


@functools.lru_cache(maxsize=None)
def _load_references(path):
    """Read the references table at *path* once and reuse it on later calls."""
    return pd.read_csv(path)


def get_ref_year(doi, refs_path=_REFERENCES_CSV):
    """Look up the publication year recorded for *doi*.

    The references table is read from *refs_path* the first time it is
    needed and cached per path afterwards, so applying this function to a
    whole column does not re-read the CSV for every row. Call
    ``_load_references.cache_clear()`` to force a re-read after the file has
    changed.

    Args:
        doi: Value to match against the ``'REFERENCE: doi'`` column.
        refs_path: CSV file with ``'REFERENCE: doi'`` and
            ``'REFERENCE: year'`` columns. Defaults to the project's
            references file.

    Returns:
        The ``'REFERENCE: year'`` value of the first row whose DOI matches.

    Raises:
        IndexError: If no row has the given DOI.
    """
    refs = _load_references(refs_path)
    years = refs.loc[refs['REFERENCE: doi'] == doi, 'REFERENCE: year'].to_list()

    if not years:
        # Same exception type as indexing the empty list, but with context.
        raise IndexError('no reference entry found for doi {!r}'.format(doi))

    return years[0]


def classify_processing_method(process_method):
    """Map a processing-route code string onto a coarse processing class.

    The rules are checked in order and the first match wins:

    1. exactly ``AC`` or ``DC``                              -> ``'CAST'``
    2. contains ``SPD``, ``AM`` or ``HIP``                   -> ``'OTHER'``
    3. contains ``GA``, ``MA``, ``SPS`` or ``VHP``, or is
       exactly ``S``                                         -> ``'POWDER'``
    4. contains ``CR``, ``HR`` or ``HF``                     -> ``'WROUGHT'``
    5. exactly ``A`` or ``H``, or contains ``+A``, ``+H``
       or ``Aged``                                           -> ``'ANNEAL'``

    Args:
        process_method: Processing-route text.

    Returns:
        The class label, or ``None`` when no rule matches.
    """
    def contains_any(*tokens):
        return any(token in process_method for token in tokens)

    if process_method in ('AC', 'DC'):
        return 'CAST'

    if contains_any('SPD', 'AM', 'HIP'):
        return 'OTHER'

    if contains_any('GA', 'MA', 'SPS', 'VHP') or process_method == 'S':
        return 'POWDER'

    if contains_any('CR', 'HR', 'HF'):
        return 'WROUGHT'

    if process_method in ('A', 'H') or contains_any('+A', '+H', 'Aged'):
        return 'ANNEAL'

    return None


def classify_microstructure(phases):
    """Reduce a ``+``-separated phase description to a standard label.

    Args:
        phases: Phase text such as ``'FCC'`` or ``'FCC+BCC'``; ``''`` means
            no phase information.

    Returns:
        * ``''`` when *phases* is empty;
        * *phases* unchanged when every listed phase is a recognised one;
        * the recognised phases joined by ``+`` followed by ``'+Sec.'`` when
          recognised and unrecognised phases are mixed, regardless of the
          order in which they are listed;
        * ``'Other'`` when no listed phase is recognised.
    """
    valid_phases = ['FCC', 'BCC', 'HCP', 'L12', 'B2', 'Laves', 'Laves (C14)', 'Laves (C15)']

    if phases == '':
        return ''

    phase_list = phases.split('+')

    # A single phase, or the same phase repeated: keep the text as written if
    # that phase is recognised.
    if len(set(phase_list)) == 1:
        return phases if phase_list[0] in valid_phases else 'Other'

    # Several distinct phases, all recognised.
    if set(phase_list).issubset(valid_phases):
        return phases

    # Mixed case. This must not depend on the first listed phase being
    # recognised, otherwise e.g. "Sigma+FCC" would fall through unlabelled.
    sub_valid = [phase for phase in phase_list if phase in valid_phases]
    if sub_valid:
        # return valid phases + invalid labeled as "Sec."
        return '+'.join(sub_valid) + "+Sec."

    # no valid phases
    return 'Other'

In [2]:
# import as dataframe
df_2018 = pd.read_csv(os.path.abspath('2018_data/2018_data.csv'))
df_2019 = pd.read_csv(os.path.abspath('2019_data/2019_data.csv'))

# combine; ignore_index gives every row its own label instead of repeating
# the row labels of each input file
df = pd.concat([df_2018, df_2019], ignore_index=True)

# reindex references: number the distinct reference tags (category codes
# shifted by one)
df['IDENTIFIER: Reference ID'] = df['REFERENCE: tag'].astype('category').cat.codes.astype('int64') + 1

Index(['REFERENCE: tag', 'REFERENCE: doi', 'FORMULA',
       'PROPERTY: Type of phases', 'PROPERTY: Single/Multiphase',
       'PROPERTY: synthesis method', 'PROPERTY: grain size ($\mu$m)',
       'PROPERTY: ROM Density (g/cm$^3$)', 'PROPERTY: HV',
       'PROPERTY: Type of test', 'PROPERTY: Test temperature ($^\circ$C)',
       'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)', 'PROPERTY: Elongation (%)',
       'PROPERTY: Elongation plastic (%)', 'PROPERTY: Young modulus (GPa)',
       'PROPERTY: O content (wppm)', 'PROPERTY: N content (wppm)',
       'PROPERTY: C content (wppm)', 'REFERENCE: comment',
       'Internal Reference # (J4, J90)', 'J4, J90 Original DOI',
       'PROPERTY: Exp. Density (g/cm$^3$)', 'Unnamed: 22',
       'IDENTIFIER: Reference ID'],
      dtype='object')


In [3]:
# remove uncertainties (every cell becomes a cleaned string)
df = df.applymap(remove_uncertainty)

# collapse "low-high" ranges to their mean in the columns that report ranges;
# the backslash in the column name is escaped so the literal is a valid
# string (same value as before, no invalid-escape warning)
for range_col in ('PROPERTY: grain size ($\\mu$m)', 'PROPERTY: Elongation (%)'):
    df[range_col] = df[range_col].apply(average_range)

In [4]:
# process columns to ensure data types are accurate
# (backslashes in the column names are escaped so the literals are valid
# strings; the resulting names are unchanged)
numeric_props = [
    'PROPERTY: grain size ($\\mu$m)',
    'PROPERTY: ROM Density (g/cm$^3$)',
    'PROPERTY: HV',
    'PROPERTY: Test temperature ($^\\circ$C)',
    'PROPERTY: YS (MPa)',
    'PROPERTY: UTS (MPa)',
    'PROPERTY: Elongation (%)',
    'PROPERTY: Young modulus (GPa)',
]

# convert each listed column from text to numbers
df[numeric_props] = df[numeric_props].apply(pd.to_numeric)

In [5]:
# ensure formulas are valid (unparseable formulas become None)
df['FORMULA'] = df['FORMULA'].apply(normalize_and_alphabetize_formula)

# add classifiers
df['PROPERTY: BCC/FCC/other'] = df.apply(categorize_phases, axis=1)

# processing class: strip spaces from the synthesis route, then classify it
df['PROPERTY: Processing method'] = (
    df['PROPERTY: synthesis method']
    .apply(standardize_synthesis_method)
    .apply(classify_processing_method)
)

df['PROPERTY: Microstructure'] = df['PROPERTY: Type of phases'].apply(classify_microstructure)

# publication year looked up from the references table by DOI
df['REFERENCE: year'] = df['REFERENCE: doi'].apply(get_ref_year)

# full table, including the helper columns, for the statistics step
df.to_csv(os.path.abspath('MPEA_dataset_for_stats.csv'), index=False)

In [6]:
# columns (and their order) written to the published dataset
cols = [
    'IDENTIFIER: Reference ID',
    'FORMULA',
    'PROPERTY: Microstructure',
    'PROPERTY: Processing method',
    'PROPERTY: grain size ($\\mu$m)',
    'PROPERTY: ROM Density (g/cm$^3$)',
    'PROPERTY: Exp. Density (g/cm$^3$)',
    'PROPERTY: HV',
    'PROPERTY: Type of test',
    'PROPERTY: Test temperature ($^\\circ$C)',
    'PROPERTY: YS (MPa)',
    'PROPERTY: UTS (MPa)',
    'PROPERTY: Elongation (%)',
    'PROPERTY: Elongation plastic (%)',
    'PROPERTY: Young modulus (GPa)',
    'PROPERTY: O content (wppm)',
    'PROPERTY: N content (wppm)',
    'PROPERTY: C content (wppm)',
    'REFERENCE: doi',
    'REFERENCE: year',
]

df_output = df[cols]

# write without the row index
output_path = os.path.abspath('MPEA_dataset.csv')
df_output.to_csv(output_path, index=False)

In [7]:
# output_refs: one row per reference ID with its DOI
cols = ['IDENTIFIER: Reference ID', 'REFERENCE: doi']
ref_df = df[cols]

unique_refs = ref_df.drop_duplicates('IDENTIFIER: Reference ID')

# The IDs are text by this point (the uncertainty-removal pass stringifies
# every cell), so sorting the column directly would order them "1", "10",
# "2", ... Order the rows by the numeric value of the ID instead; positional
# indexing is used so repeated row labels cannot duplicate rows.
numeric_order = pd.to_numeric(unique_refs['IDENTIFIER: Reference ID']).to_numpy().argsort()
unique_refs.iloc[numeric_order].to_csv('references.csv', index=False)