# This notebook helps identify potential duplicate data recorded in the database.

In [1]:
import json
import os

import pandas as pd


# List groups of yield-strength records that share the same formula, DOI,
# synthesis method and test temperature; such groups are candidate duplicates.

# The temperature label contains a LaTeX "\circ". A raw string keeps the
# backslash literal without relying on Python's deprecated handling of
# unknown escape sequences (the runtime value is unchanged).
TEMP_COL = r'PROPERTY: Test temperature ($^\circ$C)'
YS_COL = 'PROPERTY: YS (MPa)'

df = pd.read_csv('../combined_data.csv')
# Rows without a yield-strength value are excluded from this check.
df_ys = df.dropna(subset=[YS_COL])

gb_key = ['FORMULA', 'REFERENCE: doi', 'PROPERTY: synthesis method', TEMP_COL]
# NOTE: groupby uses its default dropna=True, so rows with a missing value in
# any of the key columns do not appear in any group.
samples = df_ys.groupby(gb_key)
for key, group in samples:
    # Only groups holding more than one record are worth reporting.
    if len(group) > 1:
        temps = group[TEMP_COL].tolist()
        ys = group[YS_COL].tolist()
        print(key, len(group), temps, ys)

('(CoNi)80Cr20', '10.1016/j.actamat.2019.04.017', 'CR + A', 25.0) 4 [25.0, 25.0, 25.0, 25.0] ['745', '623', '387', '330']
('Al0.1CoCrFeNi', '10.1016/j.actamat.2018.12.012', 'CR + A', 25.0) 3 [25.0, 25.0, 25.0] ['1403 $\\pm$ 20', '711 $\\pm$ 40', '356 $\\pm$ 13']
('Al0.3CoCrFeNi', '10.1016/j.actamat.2018.12.010', 'CR+A', 25.0) 2 [25.0, 25.0] ['840 $\\pm$ 20', '159 $\\pm$ 22']
('Al0.3CoCrFeNi', '10.1016/j.scriptamat.2018.10.023', 'CR + A', 25.0) 4 [25.0, 25.0, 25.0, 25.0] ['471', '841', '161', '187']
('Al10Co25Cr8Fe15Ni36Ti6', '10.1007/s11837-015-1484-7', 'A', 25.0) 2 [25.0, 25.0] ['568', '596']
('Al10Co25Cr8Fe15Ni36Ti6', '10.1007/s11837-015-1484-7', 'A', 700.0) 2 [700.0, 700.0] ['487', '486']
('Al10Cr12Fe35Mn23Ni20', '10.1016/j.msea.2019.05.056', 'CR', 25.0) 2 [25.0, 25.0] ['320', '1400']
('Al12Co18Cr18Fe35Ni18', '10.1016/j.scriptamat.2016.04.014', 'AC', 25.0) 2 [25.0, 25.0] ['866', '1166']
('Al4Co19Cr19Cu4Fe19Ni37', '10.1016/j.msea.2017.04.111', 'CR + A', 25.0) 2 [25.0, 25.0] ['719', '

In [2]:
# Report (DOI, yield strength) pairs that are recorded under more than one
# reference tag.
df = pd.read_csv('../combined_data.csv')

# Rows with a missing DOI or yield strength are left out by groupby's
# default dropna=True.
samples = df.groupby(['REFERENCE: doi', 'PROPERTY: YS (MPa)'])
for key, group in samples:
    tags = group['REFERENCE: tag']
    # dropna=False counts a missing tag as its own value, matching
    # len(tags.unique()).
    if tags.nunique(dropna=False) > 1:
        print(key, tags.values)


In [3]:
# Report (formula, test temperature, yield strength) triples that are recorded
# under more than one reference tag.
df = pd.read_csv('../combined_data.csv')

# Raw string: the label contains a LaTeX "\circ" and "\c" is not a valid
# Python escape sequence (the runtime value is unchanged).
samples = df.groupby(
    ['FORMULA', r'PROPERTY: Test temperature ($^\circ$C)', 'PROPERTY: YS (MPa)']
)
for key, group in samples:
    tags = group['REFERENCE: tag']
    # dropna=False counts a missing tag as its own value, matching
    # len(tags.unique()).
    if tags.nunique(dropna=False) > 1:
        print(key, tags.values)

('AlNbTiV', 25.0, '1000.0') ['Couzinie_14' 'Couzinie_9']
('AlNbTiV', 600.0, '780.0') ['Couzinie_14' 'Couzinie_9']
('AlNbTiV', 800.0, '560.0') ['Couzinie_14' 'Couzinie_9']
('CoCrCuFeNiTi0.5', 25.0, '700.0') ['25' '35']
('HfMoNbTaTiZr', 25.0, '1512.0') ['62' '63']
('HfMoNbTaTiZr', 1000.0, '814') ['J38' 'J131']
('HfMoNbTaTiZr', 1200.0, '556') ['J38' 'J131']
('HfMoTaTiZr', 1000.0, '855') ['J38' 'J131']
('HfMoTaTiZr', 1200.0, '404') ['J38' 'J131']
('HfNbTaTiZr', 25.0, '940.0') ['Couzinie_34' 'Couzinie_35']


In [4]:
# Report every DOI whose rows carry more than one reference tag.
df = pd.read_csv('../combined_data.csv')

# Group by a scalar key rather than a one-element list: with a list, recent
# pandas versions yield 1-tuples such as ('<doi>',) as group keys, which would
# change what is printed below.
samples = df.groupby('REFERENCE: doi')
for doi, group in samples:
    tags = group['REFERENCE: tag']
    # dropna=False counts a missing tag as its own value, matching
    # len(tags.unique()).
    if tags.nunique(dropna=False) > 1:
        print(doi, tags.values)

In [17]:
# For each property column, compute the standard deviation of the values
# recorded per formula and write the formulas, sorted by decreasing spread,
# to one CSV file per property under outliers/.
outs = [
    'PROPERTY: Exp. Density (g/cm$^3$)',
    'PROPERTY: Calculated Density (g/cm$^3$)',
    'PROPERTY: HV',
    'PROPERTY: YS (MPa)',
    'PROPERTY: UTS (MPa)',
    'PROPERTY: Elongation (%)',
    'PROPERTY: Elongation plastic (%)',
    'PROPERTY: Exp. Young modulus (GPa)',
    'PROPERTY: Calculated Young modulus (GPa)',
]

input_df = pd.read_csv('../MPEA_dataset.csv')

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first write.
os.makedirs('outliers', exist_ok=True)

for out in outs:
    # Keep only rows that actually report this property.
    df = input_df.dropna(subset=[out])
    grouped = df.groupby('FORMULA')

    # One [formula, rounded std] row per formula with at least two values.
    rows = [
        [formula, round(group[out].std())]
        for formula, group in grouped
        if len(group[out]) > 1
    ]

    std_col = '{}_std'.format(out)
    df_std = pd.DataFrame(data=rows, columns=['FORMULA', std_col])
    df_std = df_std.sort_values(std_col, ascending=False)
    # The file name keeps the column label up to (not including) the first
    # '(', so a parenthesised suffix such as "(MPa)" is cut off.
    df_std.to_csv('outliers/{}_std.csv'.format(out.split('(')[0]))