In [1]:
import os
import cProfile
import pstats
import pandas as pd
import numpy as np
import random

from cities.queries.causal_insight import CausalInsight
from cities.utils.data_grabber import (DataGrabber, list_interventions,
                                       list_outcomes)


smoke_test = "CI" in os.environ
num_samples = 10 if smoke_test else 1000

In [2]:
#sort and save sorted interventions
from cities.utils.cleaning_utils import find_repo_root
from cities.utils.data_grabber import DataGrabber
import matplotlib.pyplot as plt
import numpy as np
import dill as dill
import os

root = find_repo_root()

interventions = list_interventions()
print(interventions)
dg = DataGrabber()

dg.get_features_std_wide(interventions)

interventions_sorted = {}
for intervention in interventions:
    intervention_frame = dg.std_wide[intervention].copy().iloc[:, 2:]
    intervention_frame = intervention_frame.apply(lambda col: col.sort_values().values)
    assert (all(np.diff(intervention_frame[col]) >= 0) for col in intervention_frame.columns), "A column is not increasing."
    interventions_sorted[intervention] = intervention_frame

    
    
with open(os.path.join(root, "data/sorted_interventions", 'interventions_sorted.pkl'), 'wb') as f:
    dill.dump(interventions_sorted, f)
    

['spending_HHS', 'spending_commerce', 'spending_transportation']


In [3]:
def transformed_intervention_from_percentile(intervention, year, percentile):
    
    root = find_repo_root()

    with open(os.path.join(root, "data/sorted_interventions", 'interventions_sorted.pkl'), 'rb') as f:
        interventions_sorted = dill.load(f)
    intervention_frame = interventions_sorted[intervention]

    
    if str(year) not in intervention_frame.columns:
        raise ValueError("Year not in intervention frame.")

    sorted_var = intervention_frame[str(year)]
    n = len(sorted_var)
    index = percentile * (n - 1) / 100
    
    lower_index = int(index)
    upper_index = lower_index + 1
   
    if lower_index == n - 1:
        return sorted_var[lower_index]

    interpolation_factor = index - lower_index
    interpolated_value = ((1 - interpolation_factor) * sorted_var[lower_index] + 
    interpolation_factor * sorted_var[upper_index])

    return interpolated_value



    #return intervention_frame.apply(lambda col: np.percentile(col, percentile)).values

transformed_intervention_from_percentile("spending_commerce", 2015, 50)

-0.0106136555030327

In [4]:

profiler_presorted = cProfile.Profile()

profiler_presorted.enable()
transformed_intervention_from_percentile("spending_commerce", 2015, 50)
profiler_presorted.disable()


profiler_presorted.print_stats(sort='cumulative')

         811 function calls (797 primitive calls) in 0.002 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.002    0.001 interactiveshell.py:3514(run_code)
        2    0.000    0.000    0.002    0.001 {built-in method builtins.exec}
        1    0.000    0.000    0.002    0.002 2713893556.py:1(<module>)
        1    0.000    0.000    0.002    0.002 2725122157.py:1(transformed_intervention_from_percentile)
        1    0.000    0.000    0.001    0.001 _dill.py:281(load)
        1    0.000    0.000    0.001    0.001 _dill.py:441(load)
        1    0.000    0.000    0.001    0.001 {function Unpickler.load at 0x7fe313afacb0}
        6    0.000    0.000    0.000    0.000 base.py:280(_new_Index)
        3    0.000    0.000    0.000    0.000 base.py:478(__new__)
        1    0.000    0.000    0.000    0.000 frame.py:3853(__getitem__)
        1    0.000    0.000    0.000    0.000 frame.py:4402(_ge

In [5]:
def np_run(intervention, year, percentile):
    dg = DataGrabber()
    dg.get_features_std_wide([intervention])
    intervention_frame = dg.std_wide[intervention].copy().iloc[:, 2:]
    intervention_vector = intervention_frame[str(year)]
    value = np.percentile(intervention_vector, percentile)
    return value


profiler_np = cProfile.Profile()

profiler_np.enable()
np_run("spending_commerce", 2015, 50)
profiler_np.disable()


profiler_np.print_stats(sort='cumulative')
    

         2732 function calls (2694 primitive calls) in 0.012 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.012    0.006 interactiveshell.py:3514(run_code)
        2    0.000    0.000    0.012    0.006 {built-in method builtins.exec}
        1    0.000    0.000    0.012    0.012 138065458.py:1(np_run)
        1    0.000    0.000    0.010    0.010 data_grabber.py:27(get_features_std_wide)
        1    0.000    0.000    0.010    0.010 readers.py:848(read_csv)
        1    0.000    0.000    0.010    0.010 readers.py:574(_read)
        1    0.000    0.000    0.008    0.008 readers.py:1732(read)
        1    0.000    0.000    0.007    0.007 c_parser_wrapper.py:222(read)
        1    0.007    0.007    0.007    0.007 {method 'read_low_memory' of 'pandas._libs.parsers.TextReader' objects}
        1    0.000    0.000    0.002    0.002 readers.py:1403(__init__)
        1    0.000    0.000    0.002 

In [6]:
def pandas_run(intervention, year, percentile):
    dg = DataGrabber()
    dg.get_features_std_wide([intervention])
    intervention_frame = dg.std_wide[intervention].copy().iloc[:, 2:]
    intervention_vector = intervention_frame[str(year)]
    value = intervention_vector.quantile(percentile/100)
    return value


profiler_pd = cProfile.Profile()

profiler_pd.enable()
pandas_run("spending_commerce", 2015, 50)
profiler_pd.disable()


profiler_pd.print_stats(sort='cumulative')
    

         3274 function calls (3228 primitive calls) in 0.014 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.014    0.007 interactiveshell.py:3514(run_code)
        2    0.000    0.000    0.014    0.007 {built-in method builtins.exec}
        1    0.000    0.000    0.014    0.014 2809337713.py:1(pandas_run)
        1    0.000    0.000    0.012    0.012 data_grabber.py:27(get_features_std_wide)
        1    0.000    0.000    0.012    0.012 readers.py:848(read_csv)
        1    0.000    0.000    0.011    0.011 readers.py:574(_read)
        1    0.000    0.000    0.009    0.009 readers.py:1732(read)
        1    0.000    0.000    0.007    0.007 c_parser_wrapper.py:222(read)
        1    0.007    0.007    0.007    0.007 {method 'read_low_memory' of 'pandas._libs.parsers.TextReader' objects}
        1    0.000    0.000    0.002    0.002 frame.py:665(__init__)
        1    0.000    0.000    0.00

In [7]:
tt_presorted = pstats.Stats(profiler_presorted).total_tt
tt_np = pstats.Stats(profiler_np).total_tt
tt_pd = pstats.Stats(profiler_pd).total_tt

print(f"The presorted method run in {tt_presorted} seconds." 
      f"This is around {int(tt_np/tt_presorted)} times faster than the numpy method"
      f" and around {int(tt_pd/tt_presorted)} times faster than the pandas method.")

The presorted method run in 0.002001876999999999 seconds.This is around 5 times faster than the numpy method and around 6 times faster than the pandas method.


In [8]:
# now test accuracy 


interventions = list_interventions()
years = [2010, 2015, 2017]

for intervention in interventions:
    for year in years:
        for percentile in [0, 25, 50, 75, 100]:
            assert np.allclose(transformed_intervention_from_percentile(intervention, year, percentile), np_run(intervention, year, percentile), rtol = 0.01)
            assert np.allclose(transformed_intervention_from_percentile(intervention, year, percentile), pandas_run(intervention, year, percentile), rtol = 0.01)

In [9]:
# now let's use the API

data = DataGrabber()
data.get_features_wide(["gdp"])
gdp = data.wide["gdp"]
values = [round(i * 0.1, 1) for i in range(1, 10)]
fips = random.choice(gdp["GeoFIPS"])


outcome = "unemployment_rate"
intervention = "spending_commerce"

intervened_value = random.randint(0, 100)
if_perecentile_value = transformed_intervention_from_percentile(intervention, 2015, intervened_value)

print(f"the intervened_value of {intervened_value}, understood as a percentile is {if_perecentile_value}")


#the object instantiation doesn't change
ci = CausalInsight(
    outcome_dataset=outcome,
    intervention_dataset=intervention,
    num_samples=num_samples,
)

ci.get_tau_samples()
ci.get_fips_predictions(intervened_value=intervened_value, fips=fips, 
                        intervention_percentile=True)  # note this!
ci.plot_predictions(range_multiplier=1)




the intervened_value of 96.81525376981043, understood as a percentile is -0.0021472620670566988


ValueError: Percentile must be an integer between 0 and 100.