# Preprocess c file

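TCI percentiles per city-country. This notebook turns those percentiles into the `c` coefficients file: each coefficient is the distance between a tail percentile (1st, 5th, 95th, 99th) and the nearest quartile, divided by the interquartile range (q75 − q25).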

`Last update`: 2021-04-15

In [89]:
import subprocess
import pandas as pd

def _create_cs_file(path_c):
    """Build the table of tail-to-IQR ratios ("c" values) for each region.

    Reads the quantile CSV at ``path_c``, keeps the first row found for each
    ``region_slug``, discards rows containing any missing value, and then
    measures how far the 1st/5th and 95th/99th percentiles sit from the
    quartiles in units of the interquartile range (``q75 - q25``).

    Parameters
    ----------
    path_c : str or path-like
        CSV with a ``region_slug`` column plus numeric columns, among them
        ``q01``, ``q05``, ``q25``, ``q75``, ``q95`` and ``q99``.

    Returns
    -------
    pandas.DataFrame
        Columns ``region_slug``, ``c_low_p01``, ``c_upp_p99``, ``c_low_p05``,
        ``c_upp_p95`` and ``c_min`` (row-wise minimum of the four ratios).
    """
    ratio_names = ['c_low_p01', 'c_upp_p99', 'c_low_p05', 'c_upp_p95']

    quantiles = (
        pd.read_csv(path_c)
        .drop_duplicates('region_slug')
        .dropna()
    )

    # The slug is forced to text; every other column must parse as a number.
    for name in quantiles.columns:
        if name == 'region_slug':
            quantiles[name] = quantiles[name].astype(str)
        else:
            quantiles[name] = pd.to_numeric(quantiles[name])

    # Interquartile range: the common denominator of all four ratios.
    iqr = quantiles['q75'] - quantiles['q25']

    df_c = pd.DataFrame({
        'region_slug': quantiles['region_slug'],
        'c_low_p01': (quantiles['q25'] - quantiles['q01']) / iqr,
        'c_upp_p99': (quantiles['q99'] - quantiles['q75']) / iqr,
        'c_low_p05': (quantiles['q25'] - quantiles['q05']) / iqr,
        'c_upp_p95': (quantiles['q95'] - quantiles['q75']) / iqr,
    })
    df_c['c_min'] = df_c[ratio_names].min(axis=1)

    return df_c

def _save_cs_file(
    path_in='~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr.csv',
    path_out='~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_c_iqr.csv',
):
    """Compute the c ratios from a quantile CSV and write them to disk.

    Both locations used to be hard-coded in the body; they are now keyword
    parameters whose defaults are the original paths, so a bare
    ``_save_cs_file()`` call behaves exactly as before while the function can
    also be pointed at other files.

    Parameters
    ----------
    path_in : str or path-like
        Quantile CSV passed to ``_create_cs_file``.
    path_out : str or path-like
        Destination CSV; written without the index column.

    Returns
    -------
    None
        The shape and summary statistics of the result are printed.
    """
    df_c = _create_cs_file(path_c=path_in)
    print(df_c.shape)
    print(df_c.describe())
    df_c.to_csv(path_out, index=False)
    

## Copy percentiles

In [128]:
# Copy the region quantiles file from waze_tci/region_quantiles into this project's raw data folder.
!cp ~/shared/spd-sdv-omitnik-waze/waze_tci/region_quantiles/quantiles/regions_quantiles.csv ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/raw/regions_quantiles.csv

# Strip square brackets and single/double quotes from the raw file; the result goes to a temporary staging file.
!sed "s/\[//g;s/\]//g;s/['\"]//g" ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/raw/regions_quantiles.csv > ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr_00.csv

In [129]:
%%bash 

# Prepend a header row to the staged quantiles file and write the result as cities_iqr.csv.
# NOTE(review): the header reads "var_p, var_s", so a CSV reader with default settings
# will name that column " var_s" (leading space) - confirm whether this is intended.
echo -e "region_slug,mean,var_p, var_s,q10,q20,q30,q40,q50,q60,q70,q80,q90,q01,q99,q25,q75,q05,q95" | cat - ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr_00.csv > ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr.csv

In [130]:
# Remove the intermediate file; its contents now live in cities_iqr.csv with the header row.
!rm ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr_00.csv

In [131]:
# Sanity check: show the header row and the first data row.
!head -2 ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr.csv

region_slug,mean,var_p, var_s,q10,q20,q30,q40,q50,q60,q70,q80,q90,q01,q99,q25,q75,q05,q95
aguascalientes,8039136.868743886,8862843413565.455,8887326406420.607,3829261.1993683483,5605996.229991711,6600013.058713919,6932325.881132038,7473441.530126513,8626943.901942156,9718372.086622398,11136396.36923769,11136396.36923769,2151065.466800198,14393234.680959623,6215087.576892198,10248917.983242217,3349729.7486610203,12964231.367048915


In [132]:
# Sanity check: count the lines in the file (header row included).
!wc -l ~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr.csv

81 /home/soniame/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_iqr.csv


## Save c's file

In [133]:
# Compute the c ratios from cities_iqr.csv and write them to cities_c_iqr.csv;
# the call prints the shape and summary statistics of the result.
_save_cs_file()

(80, 6)
       c_low_p01  c_upp_p99  c_low_p05  c_upp_p95      c_min
count  80.000000  80.000000  80.000000  80.000000  80.000000
mean    0.830407   1.428594   0.624415   0.723806   0.512360
std     0.389538   0.837872   0.321490   0.316299   0.207626
min     0.213163   0.506459   0.165875   0.308937   0.165875
25%     0.524155   0.904405   0.392484   0.508035   0.367157
50%     0.681661   1.227612   0.514479   0.646442   0.489973
75%     1.098601   1.641614   0.778167   0.837884   0.620721
max     1.766687   5.851183   1.580424   2.021423   1.482831


In [134]:
# Read the saved c-ratio file back in to check what was written.
df = pd.read_csv('~/shared/spd-sdv-omitnik-waze/corona/cleaning/data/staging/cities_c_iqr.csv')

In [135]:
# Preview the saved table.
df

Unnamed: 0,region_slug,c_low_p01,c_upp_p99,c_low_p05,c_upp_p95,c_min
0,aguascalientes,1.007485,1.027390,0.710332,0.673135,0.673135
1,arequipa,1.370780,1.318878,0.967781,0.620649,0.620649
2,asuncion,0.615222,1.589508,0.503198,0.604833,0.503198
3,barcelona,0.564193,1.984999,0.447251,1.037351,0.447251
4,barquisimeto,0.590701,1.705025,0.494875,0.761201,0.494875
...,...,...,...,...,...,...
75,tijuana,0.902337,1.229780,0.901946,0.596387,0.596387
76,toluca,1.073920,1.020487,0.755596,0.557825,0.557825
77,torreon,0.560243,0.784957,0.507468,0.552098,0.507468
78,valencia,0.315810,2.233693,0.259911,0.989889,0.259911


In [136]:
# Number of regions in the saved c-ratio file.
print(len(df.region_slug))

80


### Regions missing

In [137]:
# %load first_cell.py
# Reload edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
#from pathlib import Path
#home = str(Path.home())

import sys
# Put the parent directory first on the import path (presumably where the `src` package lives).
sys.path.insert(0, '../')

from src import utils

In [138]:
# Open the Athena connection using the settings in ../configs/athena.yaml.
# NOTE(review): the cell output echoes `yaml.load(open(path, "r"))` from the helper, which
# looks like a PyYAML warning about calling yaml.load without a Loader - confirm in src/utils.
conn = utils.connect_athena(path='../configs/athena.yaml')

  athena_config = yaml.load(open(path, "r"))


In [139]:
# Distinct region slugs found in prod_daily_daily_index, ordered by slug.
qry = """
    select distinct region_slug
    from spd_sdv_waze_corona.prod_daily_daily_index
"""
dash_regions = (
    pd.read_sql_query(qry, conn)
    .sort_values('region_slug')
    ['region_slug']
    .unique()
)

In [140]:
# Region slugs that have c ratios in the saved file.
cpar_regions = df.region_slug.to_list()

In [141]:
# a: regions returned by the Athena query; b: regions with c ratios.
a = set(dash_regions)
b = set(cpar_regions)
# Number of regions present in both sets.
len(a & b)

56

In [142]:
# Regions returned by the Athena query that have no c ratios.
a - b

{'bogota',
 'br_states_acre',
 'br_states_alagoas',
 'br_states_amapa',
 'br_states_amazonas',
 'br_states_bahia',
 'br_states_ceara',
 'br_states_distrito_federal',
 'br_states_espirito_santo',
 'br_states_goias',
 'br_states_maranhao',
 'br_states_mato_grosso',
 'br_states_mato_grosso_do_sul',
 'br_states_minas_gerais',
 'br_states_para',
 'br_states_paraiba',
 'br_states_parana',
 'br_states_pernambuco',
 'br_states_piaui',
 'br_states_rio_de_janeiro',
 'br_states_rio_grande_do_norte',
 'br_states_rio_grande_do_sul',
 'br_states_rondonia',
 'br_states_roraima',
 'br_states_santa_catarina',
 'br_states_sao_paulo',
 'br_states_sergipe',
 'br_states_tocantins',
 'country_argentina',
 'country_barbados',
 'country_brazil',
 'country_chile',
 'country_colombia',
 'country_costa_rica',
 'country_dominican_republic',
 'country_ecuador',
 'country_el_salvador',
 'country_guatemala',
 'country_honduras',
 'country_jamaica',
 'country_mexico',
 'country_nicaragua',
 'country_panama',
 'countr