In [2]:
import pandas as pd

# Relative directory prefix for the data files.
base_dir = 'data/'

# Raw data queried from Starrydata without processing.
# NOTE(review): unlike the cache names below, this path already includes the 'data/' prefix.
fp_raw = 'data/rawdata.csv.gz'

# Cache after the first processing step: data and metadata are combined and
# data points are interpolated at room temperature (300 K).
fp_interpolated = 'rawdata_interpolated.csv'

# Cache after adding calculated properties (e.g. sigma E0) and additional
# calculated values for extracted properties (e.g. ZT).
fp_calc = 'rawdata_interpolated_calc.csv'

# Cache after applying physical bounds / filters on the properties of interest
# and adding the composition-based classifier.
fp_final = 'rawdata_interpolated_filtered_with_classifiers.csv'

In [3]:
def get_prop_name(propertyid, properties_fp='data/properties.csv'):
    """Look up the name and unit of a property by its numeric id.

    Parameters
    ----------
    propertyid : int or str
        Property identifier; it is converted with ``int()`` before being
        matched against the ``propertyid`` column.
    properties_fp : str, optional
        Path of the CSV lookup table. It must provide the columns
        ``propertyid``, ``propertyname`` and ``unit``.

    Returns
    -------
    tuple
        ``(propertyname, unit)`` taken from the first matching row.

    Raises
    ------
    ValueError
        If ``propertyid`` cannot be converted to an integer.
    IndexError
        If no row of the table matches ``propertyid``.
    """
    table = pd.read_csv(properties_fp)
    pid = int(propertyid)
    match = table[table['propertyid'] == pid]
    if match.empty:
        # Indexing an empty selection with .values[0] also raises IndexError;
        # raise it explicitly so the message names the id and the file.
        raise IndexError(
            'propertyid {} not found in {}'.format(pid, properties_fp)
        )
    return match['propertyname'].values[0], match['unit'].values[0]

# Property ids (as strings) whose units are looked up in this cell.
# The earlier bare get_prop_name(6) call was dropped: its result was discarded,
# so it only cost an extra read of the lookup CSV.
property_ids = ['2', '3', '4', '5', '6', '8']

# Unit string of each property, in the same order as property_ids.
units = [get_prop_name(pid)[1] for pid in property_ids]

print(units)
# Plain-text unit strings and their LaTeX (math-mode) spelling.
# Raw strings keep backslashes literal, avoiding the invalid '\O' escape.
_UNIT_TO_LATEX = {
    'S*m^(-1)': r'$S/m$',
    'ohm*m': r'$\Omega m$',
    'W*m^(-1)*K^(-1)': r'$W/(mK)$',
    'W*m^(-1)*K^(-2)': r'$W/(mK^2)$',
    'm^2*V^(-1)*s^(-1)': r'$m^2/(Vs)$',
    'm^(-3)': r'$m^{-3}$',
    'm^3*C^(-1)': r'$m^3/C$',
}


def convert_units_to_latex(units):
    """Translate a plain-text unit string into its LaTeX representation.

    Parameters
    ----------
    units : str
        Unit string such as ``'S*m^(-1)'``.

    Returns
    -------
    str
        The LaTeX form when the unit has an entry in the lookup table;
        otherwise ``units`` is returned unchanged (for example ``'V/K'`` and
        ``'-'`` have no entry and pass through as-is).
    """
    if not isinstance(units, str):
        # Non-string input (e.g. a missing value) never matched a branch of
        # the original comparison chain; keep handing it back untouched.
        return units
    return _UNIT_TO_LATEX.get(units, units)
# Show the LaTeX form of every unit collected above.
list(map(convert_units_to_latex, units))

['V/K', 'S*m^(-1)', 'W*m^(-1)*K^(-1)', 'ohm*m', 'W*m^(-1)*K^(-2)', '-']


['V/K', '$S/m$', '$W/(mK)$', '$\\Omega m$', '$W/(mK^2)$', '-']

In [4]:
# JSON file holding the 'min' / 'max' bound of every filtered property.
fp_filters = 'processing_functions/PROPERTY_FILTERS.json'

# Read through a context manager so the file handle is closed again.
with open(fp_filters, 'r') as fh:
    df = pd.read_json(fh)

# Display labels and units. They are paired with the bounds purely by position,
# so they must be listed in the same order as the columns of the JSON table.
# Raw strings keep the LaTeX backslashes literal (no invalid '\s' / '\k' escapes).
props = [
    'Seebeck coefficient (S)',
    r'Electrical conductivity ($\sigma$)',
    r'Thermal conductivity ($\kappa_{total}$)',
    r'Power factor (S$^2\sigma$)',
    'Figure of merit (ZT)',
    'Temperature (K)',
    r'Transport coefficient ($\sigma_{E0}$)',
]
units = ['$V/K$', '$S/m$', '$W/(mK)$', '$W/(mK^2)$', '-', 'K', '$S/m$']

# Bounds as strings, one entry per column of the filter table.
min_vals = [str(df.loc['min', k]) for k in df.keys()]
max_vals = [str(df.loc['max', k]) for k in df.keys()]

# Sanity check: both numbers must agree for the table below to be built.
print(len(units), len(min_vals))

df_filter_table = pd.DataFrame({
    'Property': props,
    'units': units,
    'minimum value': min_vals,
    'maximum value': max_vals,
})
caption = 'Physically-relevant ranges for properties of interest. Property values outside of these ranges were filtered out by the SL pipeline.'

# escape=False so the LaTeX markup in the labels and units is written verbatim.
df_filter_table.to_latex('property_ranges_table.tex', caption=caption, label='table:prop_ranges', index=False, escape=False)
df_filter_table

7 7


  df_filter_table.to_latex('property_ranges_table.tex', caption=caption, label='table:prop_ranges', index=False, escape=False)


Unnamed: 0,Property,units,minimum value,maximum value
0,Seebeck coefficient (S),$V/K$,-0.005,0.005
1,Electrical conductivity ($\sigma$),$S/m$,0.0,10000000.0
2,Thermal conductivity ($\kappa_{total}$),$W/(mK)$,0.0,100.0
3,Power factor (S$^2\sigma$),$W/(mK^2)$,0.0,10.0
4,Figure of merit (ZT),-,0.0,3.0
5,Temperature (K),K,200.0,1200.0
6,Transport coefficient ($\sigma_{E0}$),$S/m$,0.0,10000000.0


In [5]:
def get_number_of_records(property_ids, data_type, base_dir='data/'):
    """Count the non-null records available per property in one pipeline cache.

    Parameters
    ----------
    property_ids : list of str
        Property identifiers. For ``'raw data'`` each id is converted with
        ``int()`` and matched against the ``propertyid_y`` column; for every
        other data type the id is used directly as a column name.
    data_type : str
        One of ``'raw data'``, ``'interpolated'``, ``'calculated'``,
        ``'final'`` or ``'111-type'``. ``'111-type'`` reads the same file as
        ``'final'`` and keeps only rows whose ``'Composition class'`` equals
        ``'111-type'``.
    base_dir : str, optional
        Prefix that is string-concatenated with the cache file name, so it
        must end with a path separator.

    Returns
    -------
    list of int
        One count per entry of ``property_ids``. A property that cannot be
        resolved (missing column, or a non-numeric id for the raw data) counts
        as 0. Any other ``data_type`` (including ``''``) yields all zeros.
    """
    cache_files = {
        'raw data': 'rawdata.csv.gz',
        'interpolated': 'rawdata_interpolated.csv',
        'calculated': 'rawdata_interpolated_calc.csv',
        'final': 'rawdata_interpolated_filtered_with_classifiers.csv',
        '111-type': 'rawdata_interpolated_filtered_with_classifiers.csv',
    }

    if data_type not in cache_files:
        # No file is associated with this data type. Previously this case fell
        # through to a swallowed NameError; the resulting zeros are kept, but
        # produced explicitly.
        return [0 for _ in property_ids]

    fp = base_dir + cache_files[data_type]
    if data_type == 'raw data':
        df = pd.read_csv(fp, compression='gzip')
    else:
        df = pd.read_csv(fp)

    if data_type == '111-type':
        df = df[df['Composition class'] == '111-type']

    n_props = []
    for pid in property_ids:
        try:
            if data_type == 'raw data':
                # dropna() acts on whole rows here: a row with a missing value
                # in any column is not counted.
                n = len(df[df['propertyid_y'] == int(pid)].dropna())
            else:
                n = len(df[pid].dropna())
        except (KeyError, ValueError):
            # KeyError: the column is absent from this cache.
            # ValueError: the id is not numeric (e.g. 'sigma_E_0' in raw data).
            n = 0
        n_props.append(n)

    return n_props


In [7]:
property_ids = ['2', '3', '4', '5', '6', '8', 'sigma_E_0']
calculated_props = ['sigma_E_0', 'weighted_mobility', 'quality_factor']
data_types = ['raw data', 'interpolated', 'calculated', 'final', '111-type']

# Resolve name and unit with a single lookup per property.
# 'sigma_E_0' is not a numeric id, so get_prop_name (which calls int() on the
# id) cannot resolve it; its label and unit are given here instead.
prop_names = []
units = []
for pid in property_ids:
    if pid == 'sigma_E_0':
        name, unit = r'Transport coefficient ($\sigma_{E0}$)', '$S/m$'
    else:
        name, unit = get_prop_name(pid)
    prop_names.append(name)
    units.append(convert_units_to_latex(unit))

data = {'Property ID': property_ids, 'Property': prop_names, 'Units': units}

# One column of record counts per pipeline stage.
for d in data_types:
    data[d] = get_number_of_records(property_ids, d)

df_total = pd.DataFrame(data)

# Report only the records gained by the calculation step (calculated cache
# count minus interpolated cache count).
df_total['calculated'] = df_total['calculated'] - df_total['interpolated']

# Escape the underscores for LaTeX; the raw string keeps the backslashes literal.
df_total['Property ID'] = df_total['Property ID'].replace('sigma_E_0', r'sigma\_E\_0')

# The sentence ending in "pipeline" previously ran into the next one without a period.
caption = (
    'Thermoelectric properties extracted from the Starrydata2 database. '
    'Raw data, interpolated, calculated, and final refer to distinct caches at points in the data ingestion pipeline. '
    '111-type refers to records that are labeled as 111-type by our composition classifier.'
)

# escape=False so the LaTeX markup in the labels and units is written verbatim.
df_total.to_latex('property_count_table.tex', caption=caption, label='table:prop_table', index=False, escape=False)
df_total

  df_total.to_latex('property_count_table.tex', caption=caption, label='table:prop_table', index=False, escape=False)


Unnamed: 0,Property ID,Property,Units,raw data,interpolated,calculated,final,111-type
0,2,Seebeck coefficient,V/K,498527,21508,0,17315,986
1,3,Electrical conductivity,$S/m$,184924,9931,10366,16438,970
2,4,Thermal conductivity,$W/(mK)$,276597,15508,0,12785,789
3,5,Electrical resistivity,$\Omega m$,324818,10399,0,8495,462
4,6,Power factor,$W/(mK^2)$,184900,10570,8529,15437,913
5,8,ZT,-,221091,13730,1713,12794,808
6,sigma\_E\_0,Transport coefficient ($\sigma_{E0}$),$S/m$,0,0,18181,14742,889
