# This notebook generates stats and tables referenced in the article.

In [1]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv(os.path.abspath('stats_and_tables/MPEA_dataset_for_stats.csv'))

df_2019 = df[df['REFERENCE: tag'].str.contains('J')]
df_new = df[df['REFERENCE: tag'].str.contains('new_ref')]

print('TOTAL records = {}'.format(len(df)))
print('Total records (2018) = {}'.format(len(df) - len(df_2019)-len(df_new)))
print('Total records (2019) = {}'.format(len(df_2019)))
print('Total records new (pre-2019) = {}'.format(len(df_new)))
print('Total records new records = {}'.format(len(df_new)+len(df_2019)))

print('\n')
print('TOTAL unique references: {}'.format(len(df['REFERENCE: doi'].unique())))
print('\n')
print('Total unique compositions = {}'.format(len(df['FORMULA'].unique())))
print('Total unique compositions (2018) = {}'.format(len(df['FORMULA'].unique())-len(df_2019['FORMULA'].unique())-len(df_new['FORMULA'].unique())))
print('Total unique compositions (2019) = {}'.format(len(df_2019['FORMULA'].unique())))
print('Total unique compositions (new pre-2019) = {}'.format(len(df_new['FORMULA'].unique())))
print('Total unique compositions (all new records) = {}'.format(len(df_new['FORMULA'].unique())+len(df_2019['FORMULA'].unique())))

print('\n')
print('TOTAL YS: {}'.format(len(df['PROPERTY: YS (MPa)'].dropna())))
print('TOTAL UTS: {}'.format(len(df['PROPERTY: UTS (MPa)'].dropna())))
print('TOTAL HV: {}'.format(len(df['PROPERTY: HV'].dropna())))
print('TOTAL Elongation: {}'.format(len(df['PROPERTY: Elongation (%)'].dropna())))

TOTAL records = 1545
Total records (2018) = 614
Total records (2019) = 903
Total records new (pre-2019) = 28
Total records new records = 931


TOTAL unique references: 265


Total unique compositions = 630
Total unique compositions (2018) = 296
Total unique compositions (2019) = 317
Total unique compositions (new pre-2019) = 17
Total unique compositions (all new records) = 334


TOTAL YS: 1067
TOTAL UTS: 539
TOTAL HV: 530
TOTAL Elongation: 619


In [2]:
df_mp = pd.read_csv(os.path.abspath('MPEA_dataset.csv'))
df_mp = df_mp[df_mp.duplicated(keep=False)]
#df_mp = df_mp.groupby(list(df_mp)).apply(lambda x: tuple(x.index))
df_mp.to_csv('duplicates.csv')

In [3]:
# db at-a-glance table
props = ['REFERENCE: doi', 'FORMULA', 'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)', 'PROPERTY: Elongation (%)']

df_table = df[(df['PROPERTY: Test temperature ($^\circ$C)'] >= 20) & (df['PROPERTY: Test temperature ($^\circ$C)'] <= 25)]
df_table = df_table[props].dropna()
df_table = df_table.sort_values('PROPERTY: YS (MPa)', ascending=False)
df_table = df_table.drop_duplicates(subset=['FORMULA'])



# df_table = df_table.sort_values(by='FORMULA')
# df_table['FORMULA'] = df_table['FORMULA'].apply(lambda x: Composition(x).get_integer_formula_and_factor()[0])
df_table = df_table.head(25)

df_table['PROPERTY: YS (MPa)'] = df_table['PROPERTY: YS (MPa)'].round(0).astype(int)
df_table['PROPERTY: UTS (MPa)'] = df_table['PROPERTY: UTS (MPa)'].round(0).astype(int)
df_table['PROPERTY: Elongation (%)'] = df_table['PROPERTY: Elongation (%)'].round(0).astype(int)

# df_table['FORMULA'] = df_table['FORMULA'].apply(lambda x:  '\ce{' + x + '}')
latex_table = df_table.to_latex(index=False, escape=False)
print(latex_table) 
with open('stats_and_tables/prop_table.tex', 'w') as tf:
     tf.write(latex_table)

\begin{tabular}{llrrr}
\toprule
                        REFERENCE: doi &                                      FORMULA &  PROPERTY: YS (MPa) &  PROPERTY: UTS (MPa) &  PROPERTY: Elongation (%) \\
\midrule
                     10.1063/1.2734517 &                    Al1 Co1 Cr1 Fe1 Ni1 Ti0.5 &                2260 &                 3140 &                        24 \\
                     10.1063/1.2734517 &  Al0.667 Co0.667 Cr0.667 Fe0.667 Ni0.667 Ti1 &                2220 &                 2720 &                         7 \\
         10.1016/j.actamat.2016.01.018 &                              Hf1 Nb1 Ta1 Zr1 &                2100 &                 2200 &                         4 \\
             10.1007/s11837-014-1066-0 &   Al0.333 Nb0.667 Ta0.533 Ti1 V0.133 Zr0.667 &                2035 &                 2105 &                         5 \\
          10.1016/j.matdes.2013.04.061 &                      Al0.7 Co0.3 Cr1 Fe1 Ni1 &                2033 &                 2635 &                 

In [4]:
def write_prop_table(df):
    df['Alloy composition'] = df['FORMULA']
    cols = ['Alloy composition', 'PROPERTY: Processing method', 'PROPERTY: Microstructure', 'PROPERTY: grain size ($\mu$m)',
           'PROPERTY: Exp. Density (g/cm$^3$)', 'PROPERTY: Calculated Density (g/cm$^3$)', 'PROPERTY: Test temperature ($^\circ$C)', 
            'PROPERTY: HV', 'PROPERTY: YS (MPa)', 'PROPERTY: UTS (MPa)', 'PROPERTY: Elongation (%)', 'PROPERTY: Elongation plastic (%)',
           'PROPERTY: Exp. Young modulus (GPa)', 'PROPERTY: Calculated Young modulus (GPa)', 'PROPERTY: O content (wppm)', 'PROPERTY: N content (wppm)', 'PROPERTY: C content (wppm)']
    df = df[cols]
    dft = df.describe(include='all').T
    dft['unique'] = [len(df[key].dropna().unique()) for key in df.keys()]
    dft = dft.drop(['25%', '50%', '75%', 'freq', 'top'], axis=1)
    #dft = dft.drop(['REFERENCE: tag', 'REFERENCE: doi', 'PROPERTY: Single/Multiphase', 'PROPERTY: BCC/FCC/other', 'REFERENCE: comment'])

    dft['mean'] = [np.round(v, 1) if not np.isnan(v) else '-' for v in dft['mean'] ]
    dft['std'] = [np.round(v, 1) if not np.isnan(v) else '-' for v in dft['std']]
    dft['min'] = [np.round(v, 3) if not np.isnan(v) else '-' for v in dft['min']]
    dft['max'] = [np.round(v, 2) if not np.isnan(v) else '-' for v in dft['max']]
    dft['count'] = [int(v) if not np.isnan(v) else '-' for v in dft['count']]
    dft['unique'] = [int(v) if not np.isnan(v) else '-' for v in dft['unique']]


    tex_lines = ['\\begin{tabular}{lccccccc}']
    tex_lines.append('\\toprule')
    header = ['Property']
    header.extend([c for c in dft.columns])
    
    tex_lines.append(' & '.join(header) + ' \\\\')
    tex_lines.append('\\midrule')
    
    print(dft)
    
    
    for i in dft.index:
        
        prop_name = i.replace('%', '\%')
        prop_name = prop_name.replace('PROPERTY: ', '')
        cols = [prop_name]
        cols.extend([str(dft[prop][i]) for prop in dft.keys()])
        tex_lines.append(' & '.join(cols) + ' \\\\')
        
    tex_lines.append('\\bottomrule')
    tex_lines.append('\\end{tabular}')

    tex_file = 'stats_and_tables/prop_stats.tex'
    with open(tex_file, 'w') as fw:
        for line in tex_lines:
            fw.write(line)
            fw.write('\n')

write_prop_table(df)

                                          count  unique     mean      std  \
Alloy composition                          1545     630        -        -   
PROPERTY: Processing method                1426       5        -        -   
PROPERTY: Microstructure                   1402      40        -        -   
PROPERTY: grain size ($\mu$m)               237     176     90.2      183   
PROPERTY: Exp. Density (g/cm$^3$)           112      52      7.6      2.5   
PROPERTY: Calculated Density (g/cm$^3$)    1545      82        8      1.8   
PROPERTY: Test temperature ($^\circ$C)     1364      64      228    379.1   
PROPERTY: HV                                530     372    478.3    212.9   
PROPERTY: YS (MPa)                         1067     713      890    570.1   
PROPERTY: UTS (MPa)                         539     441   1180.1    720.9   
PROPERTY: Elongation (%)                    619     245     30.2       22   
PROPERTY: Elongation plastic (%)            149      85     20.3     25.7   