#### Comparing Reference Data ####

Ensure that your working directory is the root of the tardis-refdata repository before running the cells below.

In [125]:
from __future__ import print_function
import pandas as pd
import subprocess
import tempfile
import shlex, os, shutil
import numpy as np

In [74]:
shlex.split('git --work-tree=/Users/wkerzend/tmp/ checkout upstream/pr/8 {cpath}')

['git',
 '--work-tree=/Users/wkerzend/tmp/',
 'checkout',
 'upstream/pr/8',
 '{cpath}']

In [72]:
cd ../

/Users/wkerzend/python/tardis-refdata


In [52]:
def highlight_missing(val):
    """Return the CSS background rule used to colour one table cell.

    Cells whose value compares equal to ``True`` are painted green; every
    other value (``False``, ``None``, ...) is painted red.
    """
    # `== True` (not plain truthiness) is deliberate: only values that
    # compare equal to True count as "present / matching".
    is_set = (val == True)  # noqa: E712
    return 'background-color: green' if is_set else 'background-color: red'

In [171]:
class ReferenceComparer(object):
    """Compare two versions of an HDF5 reference-data file key by key.

    Each of the two versions is either the file as stored at a git ref
    (``ref1_hash`` / ``ref2_hash``) or, when the corresponding hash is
    ``None``, the file currently found at ``compare_path`` relative to the
    working directory. Both versions are copied into a fresh temporary
    directory as ``ref1_<compare_path>`` and ``ref2_<compare_path>``.

    Parameters
    ----------
    ref1_hash : str, optional
        Git ref of the first version; ``None`` uses the current file.
    ref2_hash : str, optional
        Git ref of the second version; ``None`` uses the current file.
    compare_path : str
        Path of the HDF5 file, relative to the current working directory.

    Attributes
    ----------
    tmp_dir : str or None
        Temporary directory holding both copies (``None`` after teardown).
    ref1_fname, ref2_fname : str
        Full paths of the two copies inside ``tmp_dir``.
    """

    def __init__(self, ref1_hash=None, ref2_hash=None, compare_path='unit_test_data.h5'):
        # At least one side has to come from git, otherwise the current file
        # would only be compared with itself.
        assert not ((ref1_hash is None) and (ref2_hash is None)), "One hash can not be None"
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = compare_path
        self.tmp_dir = None
        self.setup()

    def setup(self):
        """Create the temporary directory and place both file versions in it.

        Sets ``tmp_dir``, ``ref1_fname`` and ``ref2_fname``.
        """
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))
        for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash], 1):
            prefix = 'ref{0}_'.format(ref_id)
            target = os.path.join(self.tmp_dir, prefix + self.compare_path)
            if ref_hash is not None:
                self._copy_data_from_hash(ref_hash, prefix)
            else:
                # Copy synchronously and without a shell: the file must exist
                # as soon as setup() returns, and paths containing spaces or
                # shell metacharacters must not be reinterpreted.
                shutil.copy(self.compare_path, target)
            setattr(self, 'ref{0}_fname'.format(ref_id), target)

    def teardown(self):
        """Delete the temporary directory; calling it again is a no-op.

        ``ref1_fname`` / ``ref2_fname`` are left untouched and point at
        removed files afterwards.
        """
        if self.tmp_dir is not None:
            shutil.rmtree(self.tmp_dir)
            self.tmp_dir = None

    def _copy_data_from_hash(self, ref_hash, prefix):
        """Check ``compare_path`` out of ``ref_hash`` into the temporary directory.

        The checked-out file is renamed to ``prefix + compare_path``.

        Raises
        ------
        subprocess.CalledProcessError
            If git exits with a non-zero status (unknown ref, missing path, ...).
        """
        # NOTE(review): `git checkout <ref> <path>` presumably also updates the
        # index of the repository in the working directory -- confirm that
        # side effect is acceptable.
        git_cmd = ['git', '--work-tree={0}'.format(self.tmp_dir),
                   'checkout', ref_hash, self.compare_path]
        # Fail right here when git fails instead of surfacing later as a
        # confusing "file not found" error from shutil.move.
        subprocess.check_call(git_cmd)
        shutil.move(os.path.join(self.tmp_dir, self.compare_path),
                    os.path.join(self.tmp_dir, prefix + self.compare_path))

    def generate_test_table(self):
        """Build a table listing which HDF keys exist in which file.

        Returns
        -------
        pandas.DataFrame
            Indexed by the union of the keys of both files, with boolean
            columns ``exists_1`` and ``exists_2``.
        """
        # Context managers close the stores even when reading the keys fails.
        with pd.HDFStore(self.ref1_fname, mode='r') as rd1_hdfs:
            rd1_keys = rd1_hdfs.keys()
        with pd.HDFStore(self.ref2_fname, mode='r') as rd2_hdfs:
            rd2_keys = rd2_hdfs.keys()
        rd1_df = pd.DataFrame(index=rd1_keys, columns=['exists'])
        rd2_df = pd.DataFrame(index=rd2_keys, columns=['exists'])
        rd1_df['exists'] = True
        rd2_df['exists'] = True
        joined_df = rd1_df.join(rd2_df, how='outer', lsuffix='_1', rsuffix='_2')
        # Keys missing on one side come out of the outer join as NaN; turn
        # them into False and make sure the columns are real booleans.
        return joined_df.fillna(False).astype(bool)

    def compare_refdata(self, test_table):
        """Fill a ``match`` column comparing the data stored under each key.

        Parameters
        ----------
        test_table : pandas.DataFrame
            Table as returned by ``generate_test_table``. It is modified in
            place and also returned.

        Returns
        -------
        pandas.DataFrame
            ``test_table`` with ``match`` set to ``True`` / ``False`` for keys
            present in both files and left as ``None`` for all other keys.

        Raises
        ------
        ValueError
            If the object stored under a key in the first file is neither a
            Series nor a DataFrame.
        """
        test_table['match'] = None
        for row_id, row in test_table.iterrows():
            if not row[['exists_1', 'exists_2']].all():
                # Nothing to compare when the key is missing on either side.
                continue
            ref1_df = pd.read_hdf(self.ref1_fname, row_id)
            ref2_df = pd.read_hdf(self.ref2_fname, row_id)
            test_table.loc[row_id, 'match'] = self._objects_match(ref1_df, ref2_df)
        return test_table

    @staticmethod
    def _objects_match(ref1_obj, ref2_obj):
        """Return True if pandas' testing assertion accepts the two objects as equal."""
        # `pandas.testing` is the public home of the assertion helpers;
        # fall back to the old `pandas.util.testing` location on pandas
        # versions that do not provide it.
        testing = getattr(pd, 'testing', None)
        if testing is None:
            from pandas.util import testing

        if isinstance(ref1_obj, pd.Series):
            assert_equal = testing.assert_series_equal
        elif isinstance(ref1_obj, pd.DataFrame):
            assert_equal = testing.assert_frame_equal
        else:
            raise ValueError('Needs to be a Series or DataFrame but is ' + str(type(ref1_obj)))

        try:
            assert_equal(ref1_obj, ref2_obj)
        except AssertionError:
            return False
        return True
                

In [173]:
comparer = ReferenceComparer(ref2_hash='upstream/pr/8')

Created temporary directory at /var/folders/cn/p885gl514_v90bwv3y7q5twr0000gp/T/tmphMQBpk. Delete after use with .teardown


In [172]:
# NOTE: per the execution counts this cell (In [172]) ran before the comparer
# was re-created in In [173]. When running the notebook top to bottom, call
# teardown last -- it deletes the temporary files the following cells read.
comparer.teardown()

In [174]:
tt = comparer.generate_test_table()

In [176]:
tt = comparer.compare_refdata(tt)

In [177]:
tt.style.applymap(highlight_missing)

Unnamed: 0,exists_1,exists_2,match
/plasma_full/lte/abundance,True,True,True
/plasma_full/lte/atomic_mass,True,True,True
/plasma_full/lte/beta_rad,True,True,True
/plasma_full/lte/beta_sobolev,True,True,True
/plasma_full/lte/density,True,True,True
/plasma_full/lte/electron_densities,True,True,True
/plasma_full/lte/excitation_energy,True,True,True
/plasma_full/lte/f_lu,True,True,True
/plasma_full/lte/g,True,True,True
/plasma_full/lte/g_electron,True,True,True
