In [1]:
# default_exp explain.core

%reload_ext autoreload
%autoreload 2

from nbdev import *

# explain.core

This module contains the core structures for "explanations". I'm using this concept to gather the figures, tables, and results generated by pipelines into one useful place.

An `AbstractExplanation` should:

 - Manage common helper methods and enforce a structure
 - Maybe more later

In [2]:
#export

class AbstractExplanation(object):
    """Base class for "explanations".

    Subclasses customise the class-level templates and ``save_methods`` and
    override ``load`` (and, optionally, ``setup`` / ``teardown``).
    """

    # Templates are rendered with ``str.format(self=self)`` (not f-strings),
    # so they may reference instance attributes as ``{self.attr}``.
    short_description_template = 'Abstract Description'
    id_template = 'AbstractID'

    # Callables invoked by ``save``; each one receives the keyword arguments
    # that were passed to ``save``.
    save_methods = []

    @property
    def short_description(self):
        """Return ``short_description_template`` rendered against this instance."""
        return self.short_description_template.format(self=self)

    @property
    def _id(self):
        """Return ``id_template`` rendered against this instance."""
        return self.id_template.format(self=self)

    def setup(self):
        """Hook called by ``from_prefix`` after loading; no-op by default."""
        pass

    def teardown(self):
        """Clean-up hook; no-op by default."""
        pass

    def save(self, **kwargs):
        """Call every entry of ``save_methods`` with ``**kwargs``."""
        for method in self.save_methods:
            method(**kwargs)

    def load(self, prefix, **kwargs):
        """Populate this instance from ``prefix``; subclasses must override.

        Raises
        ------
        NotImplementedError
            Always, in this base class.
        """
        raise NotImplementedError

    @classmethod
    def from_prefix(cls, prefix, **kwargs):
        """Create an instance, load it from ``prefix`` and run ``setup``.

        The class is instantiated with no arguments. Extra keyword arguments
        are forwarded to ``load`` so that options such as a file format
        reach the loader instead of being silently discarded.

        Returns
        -------
        The loaded instance of ``cls``.
        """
        empty = cls()
        empty.load(prefix, **kwargs)
        empty.setup()
        return empty
        
                          

A common addition on top of an `AbstractExplanation` is a `DataFrameExplanation`.
This abstracts away saving and loading the data.

Any attributes that are `pd.Series` or `pd.DataFrame` will be captured. To change this behaviour you can subclass and change the `saveable_dfs` logic.

In [9]:
# export

import pandas as pd
import numpy as np
from operator import attrgetter
import glob


class DataFrameExplanation(AbstractExplanation):
    """Explanation whose pandas attributes are saved to and loaded from files.

    Every attribute holding a ``pd.DataFrame`` or ``pd.Series`` is written to
    ``<prefix><attribute name>.<fmt>`` by ``save_data``; ``load`` reads the
    matching files back and attaches each one as an attribute.
    """

    # Default path prefix used when a call does not supply one.
    prefix = None
    # Attribute names that ``save_data`` skips.
    ignore_attributes = set()

    @property
    def save_methods(self):
        """Return the list of save callables; here only ``save_data``."""
        return [self.save_data]

    @property
    def saveable_dfs(self):
        """Yield ``(attribute name, value)`` for each DataFrame/Series attribute."""
        for attr in dir(self):
            try:
                field = getattr(self, attr, None)
            except TypeError:
                continue
            # isinstance (rather than an exact type match) also captures
            # DataFrame / Series subclasses.
            if isinstance(field, (pd.DataFrame, pd.Series)):
                yield attr, field

    def save_data(self, prefix=None, fmt = 'csv', index=False,
                  data_save_args = None, **kwargs):
        """Write every saveable frame to ``<prefix><name>.<fmt>``.

        Parameters
        ----------
        prefix : str, optional
            Path prefix; falls back to ``self.prefix`` and then to ``''``.
        fmt : str
            ``'csv'`` or ``'xlsx'``.
        index : bool
            Default for the writer's ``index`` argument. An ``'index'`` entry
            in ``data_save_args`` takes precedence.
        data_save_args : dict, optional
            Extra keyword arguments for ``to_csv`` / ``to_excel``. The dict
            passed in is not modified.
        **kwargs
            Accepted and ignored.

        Raises
        ------
        ValueError
            If ``fmt`` is not supported and there is at least one frame to
            write.
        """
        # Work on a copy so the setdefault below never leaks into the
        # caller's dict.
        data_save_args = {} if data_save_args is None else dict(data_save_args)
        data_save_args.setdefault('index', index)
        if prefix is None:
            prefix = self.prefix

        prefix = '' if prefix is None else prefix

        for name, df in self.saveable_dfs:
            if name in self.ignore_attributes:
                continue
            path = prefix + name
            if fmt == 'csv':
                df.to_csv(path+'.csv', **data_save_args)
            elif fmt == 'xlsx':
                df.to_excel(path+'.xlsx', **data_save_args)
            else:
                raise ValueError('Did not understand format:' + str(fmt))

    def load_data(self, paths = None, prefix=None, fmt = 'csv', 
                  data_read_args = None, **kwargs):
        """Lazily read files and yield ``(name, DataFrame)`` pairs.

        This is a generator: nothing is globbed, printed or read until it is
        iterated.

        Parameters
        ----------
        paths : iterable of str, optional
            Files to read. When omitted, every ``<prefix>*.<fmt>`` file is
            used.
        prefix : str, optional
            Path prefix; falls back to ``self.prefix`` and then to ``''``.
        fmt : str
            ``'csv'`` or ``'xlsx'``.
        data_read_args : dict, optional
            Extra keyword arguments for ``pd.read_csv`` / ``pd.read_excel``.
        **kwargs
            Accepted and ignored.

        Raises
        ------
        ValueError
            If ``fmt`` is not supported and there is at least one path to
            read.
        """
        data_read_args = {} if data_read_args is None else data_read_args

        if prefix is None:
            prefix = self.prefix
        prefix = '' if prefix is None else prefix

        if paths is None:
            # The prefix is a literal path prefix, so escape any glob
            # metacharacters in it; only the file-name part is a wildcard.
            paths = glob.glob(glob.escape(prefix) + f'*.{fmt}')

        print(paths)

        for path in paths:

            # The name is recovered purely by length: the first len(prefix)
            # characters and the trailing '.<fmt>' are cut off. Explicitly
            # supplied paths are therefore assumed to start with the prefix.
            name = path[len(prefix):-(len(fmt)+1)]
            print(name)
            if fmt == 'csv':
                df = pd.read_csv(path, **data_read_args)
            elif fmt == 'xlsx':
                df = pd.read_excel(path, **data_read_args)
            else:
                raise ValueError('Did not understand format:' + str(fmt))

            yield name, df

    def load(self, prefix, **kwargs):
        """Read the files found for ``prefix`` and attach each as an attribute.

        Keyword arguments are forwarded to ``load_data``.
        """
        for key, val in self.load_data(prefix=prefix, **kwargs):
            setattr(self, key, val)
        
        
    

`pd.DataFrame` objects can be attached as ordinary attributes. They'll get saved transparently along with the rest.

In [4]:
from tempfile import TemporaryDirectory

# Random 5x5 integer frame (values 0-4) used as sample data.
ndata = pd.DataFrame(np.random.randint(5, size = (5,5)))

# Attach two frames as plain attributes; `saveable_dfs` picks both up.
dfe = DataFrameExplanation()
dfe.data = ndata
dfe.other_data = pd.DataFrame(np.random.randint(5, size = (5,5)))

# Kept as an object rather than a `with` block because later cells still read
# from the directory; it is removed explicitly via `temp_dir.cleanup()`.
temp_dir = TemporaryDirectory()

# Writes data.csv and other_data.csv into the temp dir, without an index column.
dfe.save(prefix = temp_dir.name + '/', data_save_args = {'index': False})

In [5]:
!cat '{temp_dir.name}/data.csv'

0,1,2,3,4
3,1,1,3,3
2,0,3,1,3
0,2,4,4,2
0,1,0,1,1
2,3,1,2,4


Then, given a `prefix`, it can transparently load all of those files back into the matching attributes.

In [8]:
# Build a fresh instance and load the files found under the prefix back onto it.
ndfe = DataFrameExplanation.from_prefix(temp_dir.name + '/', fmt = 'csv')
# The frames are in memory now, so the temporary directory can be deleted.
temp_dir.cleanup()

['/tmp/tmpu44ktwc7/data.csv', '/tmp/tmpu44ktwc7/other_data.csv']
data
other_data


In [7]:
# Round-trip check: the reloaded frames hold the same values as the originals.
np.testing.assert_array_equal(dfe.data.values, ndfe.data.values)
np.testing.assert_array_equal(dfe.other_data.values, ndfe.other_data.values)