In [625]:
from typing import Callable, Sequence, Optional
from functools import reduce
import pandas as pd
import numpy as np
import altair as alt
from datetime import datetime
PATH = 'rates-sparse.csv'
BASE_DF = pd.read_csv(PATH)

class DataFramePipeFailure(Exception): 
    pass

def pipe(
    functions: Sequence[Callable[[pd.DataFrame], Optional[pd.DataFrame]]], 
    dataframe: pd.DataFrame
) -> pd.DataFrame: 
    def f_after_g(
        f: Callable[[pd.DataFrame], pd.DataFrame], 
        g: Callable[[pd.DataFrame], pd.DataFrame]) -> Callable[[pd.DataFrame], pd.DataFrame]: 
        def maybe_df(df: Optional[pd.DataFrame]) -> Optional[pd.DataFrame]: 
            df = g(df)
            if df is not None:
                df = f(df)
                return df
            return None
            
        return maybe_df
    
    return reduce(f_after_g, functions)(dataframe)

class SafePiper: 
    func: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
    def __init__(self, kind: str = 'print'): 
        if kind not in ('print', 'raise'): 
            raise ValueError(f'please supply print or raise to {self.__class__}')
        self.kind = kind

    def __call__(self, *arguments: Optional[pd.DataFrame]) -> pd.DataFrame: 
        if not self.func: 
            self.func = arguments[0]
            return self
        try: 
            return self.func(*arguments)
        except Exception as exc: 
            failure = DataFramePipeFailure(
                    f'Pipe failed at function {self.func.__name__} because {exc.__repr__()}: {exc}'
                )
            if self.kind == 'print': 
                print(failure)
                return None
            if self.kind == 'raise': 
                raise failure


In [629]:
@SafePiper('print')
def drop_rows(dataframe: pd.DataFrame) -> pd.DataFrame: 
    return dataframe.iloc[0:10]

@SafePiper('print')
def percentage_parse(dataframe: pd.DataFrame) -> pd.DataFrame: 
    def maybe_float(x: str) -> float: 
        try: 
            y = float(x.strip()[:-1])
        except AttributeError: 
            y = x
        finally:
            return y
        
    
    dataframe.loc[:,'2000'] = dataframe[['2000']].applymap(lambda x: maybe_float(x)/100)
    
    return dataframe.assign(
        **{year: dataframe[[year]].applymap(lambda x: maybe_float(x)/100) 
           for year 
           in (f'20{k:02d}' for k in range(7,21))
          }
    )

@SafePiper('print')
def set_index(dataframe: pd.DataFrame) -> pd.DataFrame: 
    dataframe = dataframe.rename(columns={'Unnamed: 0': 'figure', 'Unnamed: 1': 'bound'})
    for idx, val in dataframe.figure.iteritems():
        if isinstance(val, str): 
            continue
        elif np.isnan(val): 
            dataframe.loc[idx, 'figure'] = dataframe.loc[idx-1, 'figure']
            
    return (dataframe
            .set_index(['figure', 'bound'])
           ).T

@SafePiper('print')
def column_name_format(dataframe: pd.DataFrame) -> pd.DataFrame: 
    return dataframe.rename(columns = lambda x: x.lower().replace(' ', '_'))

@SafePiper('print')
def deconfidence_interpollate_newprescriptionperamerican(dataframe: pd.DataFrame) -> pd.DataFrame: 
    data = dataframe[['rate_of_new_prescription_per_american']]
    for year,lower_bound in data[[('rate_of_new_prescription_per_american', 'lowerbound')]].T.iteritems(): 
        if year == '2000': 
            continue
        if np.isnan(lower_bound.values[0]): 
            dataframe.loc[year, ('rate_of_new_prescription_per_american', 'lowerbound')] = \
                dataframe.loc[str(int(year) - 1), ('rate_of_new_prescription_per_american', 'lowerbound')] - 0.01
            
    for year,upper_bound in data[[('rate_of_new_prescription_per_american', 'upperbound')]].T.iteritems(): 
        if year == '2000': 
            continue
        if np.isnan(upper_bound.values[0]): 
            dataframe.loc[year, ('rate_of_new_prescription_per_american', 'upperbound')] = \
                dataframe.loc[str(int(year) - 1), ('rate_of_new_prescription_per_american', 'upperbound')] + 0.01
            
    return dataframe

@SafePiper('print')
def deconfidence_interpollate_addictionperprescription(dataframe: pd.DataFrame) -> pd.DataFrame: 
    pivot = '2015' # year at which we have hard data
    data = dataframe[['rate_of_addiction_per_prescription']]
    known = data.loc[pivot, 'rate_of_addiction_per_prescription']
    for year, lower_bound in data[[('rate_of_addiction_per_prescription', 'lowerbound')]].T.iteritems(): 
        if year in ('2000', pivot): 
            continue
        dataframe.loc[year, ('rate_of_addiction_per_prescription', 'lowerbound')] = \
            known.lowerbound - 0.01 * abs(int(year) - int(pivot))
        
    for year, upper_bound in data[[('rate_of_addiction_per_prescription', 'upperbound')]].T.iteritems(): 
        if year in ('2000', pivot): 
            continue
        dataframe.loc[year, ('rate_of_addiction_per_prescription', 'upperbound')] = \
            known.upperbound + 0.01 * abs(int(year) - int(pivot))
        
    return dataframe

@SafePiper('print')
def deconfidence_interpollate_overdoseperaddiction(dataframe: pd.DataFrame) -> pd.DataFrame: 
    start = '2000'
    data = dataframe[['rate_of_overdose_per_addiction']]
    known = data.loc[start, 'rate_of_overdose_per_addiction']
    for year, _ in data[[('rate_of_overdose_per_addiction', 'lowerbound')]].T.iteritems(): 
        if year == start: 
            continue
        dataframe.loc[year, ('rate_of_overdose_per_addiction', 'lowerbound')] = \
            known.lowerbound - 0.01 * abs(int(year) - int(start))
    
    for year, _ in data[[('rate_of_overdose_per_addiction', 'upperbound')]].T.iteritems(): 
        if year == start: 
            continue
        dataframe.loc[year, ('rate_of_overdose_per_addiction', 'upperbound')] = \
            known.upperbound + 0.01 * abs(int(year) - int(start))
    
    return dataframe

@SafePiper('print')
def deconfidence_interpollate_deathperoverdose(dataframe: pd.DataFrame) -> pd.DataFrame: 
    start = '2013'
    end = '2018'
    variance = dataframe.rate_of_death_per_overdose.lowerbound.var()
    data = dataframe.rate_of_death_per_overdose
    known_start = data.loc[start]
    known_end = data.loc[end]

    for year, _ in data.lowerbound.T.iteritems():
        if int(start) <= int(year) <= int(end): 
            continue
        if int(year) < int(start): 
            val = known_start.lowerbound - abs(int(year) - int(start)) * variance
        elif int(year) > int(end): 
            val = known_end.lowerbound - abs(int(year) - int(end)) * variance
        dataframe.loc[year, ('rate_of_death_per_overdose', 'lowerbound')] = val
    
    for year, _ in data.upperbound.T.iteritems():
        if int(start) <= int(year) <= int(end): 
            continue
        if int(year) < int(start): 
            val = known_start.upperbound + abs(int(year) - int(start)) * variance
        elif int(year) > int(end): 
            val = known_end.upperbound + abs(int(year) - int(end)) * variance
        dataframe.loc[year, ('rate_of_death_per_overdose', 'upperbound')] = val
    
    return dataframe

@SafePiper('print')
def drop_2000(dataframe: pd.DataFrame) -> pd.DataFrame: 
    return dataframe.drop('2000')

@SafePiper('print')
def bound_to_int(dataframe: pd.DataFrame) -> pd.DataFrame: 
    dataframe.index = map(lambda x: datetime(year=int(x), month=1, day=1), dataframe.index)
    return dataframe

In [630]:
functions = (
    bound_to_int,
    drop_2000,
    deconfidence_interpollate_deathperoverdose,
    deconfidence_interpollate_overdoseperaddiction,
    deconfidence_interpollate_addictionperprescription,
    deconfidence_interpollate_newprescriptionperamerican, 
    column_name_format,
    set_index, 
    percentage_parse, 
    drop_rows
)

df = pipe(functions, BASE_DF)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


figure,rate_of_new_prescription_per_american,rate_of_new_prescription_per_american,rate_of_addiction_per_prescription,rate_of_addiction_per_prescription,rate_of_overdose_per_addiction,rate_of_overdose_per_addiction,rate_of_death_per_overdose,rate_of_death_per_overdose,economic_cost_per_overdose_death,economic_cost_per_overdose_death
bound,lowerbound,upperbound,lowerbound,upperbound,lowerbound,upperbound,lowerbound,upperbound,lowerbound,upperbound
2007-01-01,0.759,0.759,0.0,0.2,0.09,0.46,0.085679,0.087521,,
2008-01-01,0.782,0.782,0.01,0.19,0.08,0.47,0.085832,0.087368,,
2009-01-01,0.795,0.795,0.02,0.18,0.07,0.48,0.085986,0.087214,,
2010-01-01,0.812,0.812,0.03,0.17,0.06,0.49,0.086139,0.087061,,
2011-01-01,0.809,0.809,0.04,0.16,0.05,0.5,0.086293,0.086907,,
2012-01-01,0.813,0.813,0.05,0.15,0.04,0.51,0.086446,0.086754,,
2013-01-01,0.781,0.781,0.06,0.14,0.03,0.52,0.0866,0.0866,,
2014-01-01,0.756,0.756,0.07,0.13,0.02,0.53,0.1202,0.1202,,
2015-01-01,0.706,0.706,0.08,0.12,0.01,0.54,0.0982,0.0982,,
2016-01-01,0.665,0.665,0.07,0.13,0.0,0.55,0.1136,0.1136,,


In [628]:
cols = df.columns.get_level_values('figure')[:-2]
colors = {column: color for column,color in zip(cols, ['red', 'blue', 'green', 'yellow', 'purple', 'orange'])}

dfs = {key: df[key].reset_index() for key in cols}
charts_ = {key: alt.Chart(val, title=key).mark_area(opacity=0.5, color=colors[key]).encode(x='index:T') for key,val in dfs.items()}
charts = {key: val.encode(y='lowerbound:Q', y2='upperbound:Q') for key,val in charts_.items()}

reduce(lambda A,B: A + B, charts.values())

KeyError: 'rate_of_death_per_overdose'

In [None]:
dfs['rate_of_addiction_per_prescription']

In [587]:
df.to_csv('rates_with_uncertainty.csv')