In [1]:
# %load first_cell.py
%reload_ext autoreload
%autoreload 2
from pathlib import Path
home = str(Path.home())

import sys
# Make the `norm_env` conda environment libraries and the project's ../src folder importable.
# Entries that are already on sys.path are skipped, so re-running this cell
# does not keep appending duplicates.
sys.path = sys.path + [
    entry for entry in [f'{home}/.conda/envs/norm_env/lib/python37.zip',
                        f'{home}/.conda/envs/norm_env/lib/python3.7',
                        f'{home}/.conda/envs/norm_env/lib/python3.7/lib-dynload',
                        f'{home}/.conda/envs/norm_env/lib/python3.7/site-packages',
                        '../src']
    if entry not in sys.path
]
# Build the prefix from `home` like the entries above, instead of one user's absolute path.
sys.prefix = f'{home}/.conda/envs/norm_env'

# Project directory constants provided by the local `paths` module
# (presumably resolved through the '../src' entry on sys.path — confirm).
from paths import RAW_PATH, TREAT_PATH, OUTPUT_PATH, FIGURES_PATH

from copy import deepcopy
import numpy as np
import pandas as pd
# Raise the pandas column display limit to 999 columns.
pd.options.display.max_columns = 999
import yaml
import matplotlib.pyplot as plt 
import datetime

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plotting
import plotly
import plotly.graph_objs as go
import cufflinks as cf
# Initialise plotly's notebook mode so figures can render inside the notebook.
plotly.offline.init_notebook_mode(connected=True)

def iplottitle(title, width=40):
    """Wrap a plot title to `width` characters per line, joining the lines with HTML `<br>` tags."""
    wrapped_lines = textwrap.wrap(title, width)
    line_break = '<br>'
    return line_break.join(wrapped_lines)

# Setting cufflinks
import textwrap
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
import yaml
from jinja2 import Template
# Register the theme stored in cufflinks_template.yaml under the name 'custom'.
# The file is opened in a `with` block so the handle is always closed, and parsed with
# safe_load: calling yaml.load without an explicit Loader is deprecated/rejected by newer
# PyYAML releases, and a theme file only needs plain YAML types.
with open('cufflinks_template.yaml', 'r') as theme_file:
    cf.themes.THEMES['custom'] = yaml.safe_load(theme_file)

# Imports

In [2]:
# Put the parent directory on the import path so that `src` can be imported as a package.
sys.path.append('../')
from src import utils
from shapely import wkt
# Connection object passed to the pd.read_sql_query calls in this notebook
# (presumably an Athena connection configured from this YAML file — see src/utils to confirm).
conn = utils.connect_athena(path='../configs/athena.yaml')
import json

In [4]:
# Load every row of the hourly index table through the open connection.
hourly_index_query = f"""
select 
    *
from spd_sdv_waze_corona.dev_daily_hourly_index
"""
df = pd.read_sql_query(hourly_index_query, conn)

In [7]:
def _observation_timestamp(row):
    """Build the hourly timestamp of one row from its month/day/hour fields, with the year fixed to 2019."""
    return pd.to_datetime(f'2019-{row["month"]}-{row["day"]} {row["hour"]}:00:00')


df['timestamp'] = df.apply(_observation_timestamp, axis=1)

In [36]:
def _baseline_timestamp(row):
    """Place one row in March 2019 using `dow + 1` as the day of month, keeping the row's hour."""
    return pd.to_datetime(f'2019-3-{row["dow"] + 1} {row["hour"]}:00:00')


df['pre-timestamp'] = df.apply(_baseline_timestamp, axis=1)

In [44]:
import pytz

In [218]:
def to_timezone(row, ts):
    """Convert the timestamp stored in ``row[ts]`` from UTC to the row's own time zone.

    Parameters
    ----------
    row : mapping-like (for example a DataFrame row)
        Must provide the ``ts`` field and a ``'timezone'`` field.
    ts : str
        Name of the field holding the timestamp to convert.

    Returns
    -------
    The timestamp expressed in ``row['timezone']``, or ``None`` when the
    conversion fails.
    """
    # Attach UTC to the stored value without shifting its clock time.
    ts = row[ts].replace(tzinfo=pytz.utc)

    try:
        return ts.astimezone(row['timezone'])
    except Exception:
        # Best effort: a row whose time zone cannot be used yields None.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long row-wise apply can still be interrupted.
        return None
    
# Localised versions of both timestamp columns, computed row by row with to_timezone.
df['timestamp_tz'] = df.apply(to_timezone, axis=1, args=('timestamp',))
df['pre-timestamp_tz'] = df.apply(to_timezone, axis=1, args=('pre-timestamp',))

In [67]:
# Hours of the day left out of the filtered charts: 21h through 23h, then 0h through 6h.
filter_hours = [*range(21, 24), *range(0, 7)]

In [80]:
# Keep the rows whose local hour is not in `filter_hours`; rows without a converted
# timestamp (None) are dropped. The None check is explicit rather than substituting a
# placeholder value for the hour, so the outcome does not depend on which hours are listed.
filtered = df[df['timestamp_tz'].apply(lambda x: x is not None and x.hour not in filter_hours)]

In [87]:
# One chart per region name: the `tcp` series of the hour-filtered rows, ordered by local time.
for slug in df['region_name'].unique():

    region_tcp = (
        filtered
        .query(f'region_name == "{slug}"')
        .set_index('timestamp_tz')
        .sort_index()
        ['tcp']
    )

    region_tcp.iplot(
        theme='custom',
        title=f'TCI Perc. Change {slug} except 21h to 6h',
        yTitle='TCI (%)',
        asImage=True,
    )

In [208]:
def get_key(dct, value, default='rest'):
    """Return the first key of `dct` whose collection contains `value`, or `default` if none does."""
    matches = [name for name, members in dct.items() if value in members]
    return matches[0] if matches else default

In [328]:
def time_chunks(_df, cuts=[5, 10, 16, 22]):
    """Sum `observed` per region, local date and time-of-day chunk.

    Parameters
    ----------
    _df : pandas.DataFrame
        Needs the columns `timestamp_tz`, `region_name` and `observed`.
        The frame passed in is not modified.
    cuts : sequence of int
        Hour boundaries; each consecutive pair (c1, c2) defines the chunk
        labelled 'from {c1}h to {c2}h', covering hours c1 up to c2 - 1.
        Hours outside every chunk are labelled 'rest'.

    Returns
    -------
    pandas.DataFrame
        Indexed by (region_name, date_tz), with one column per chunk label
        found in the data.
    """
    time_group = {f'from {c1}h to {c2}h': list(range(c1, c2)) for c1, c2 in zip(cuts[:-1], cuts[1:])}

    # Work on a copy so the caller's frame (often a slice of a larger one) is left untouched.
    # Rows without a timestamp cannot be assigned to an hour or a date, so they are left out.
    _df = _df[_df['timestamp_tz'].notna()].copy()

    _df['tod'] = _df['timestamp_tz'].apply(lambda x: get_key(time_group, x.hour))
    _df['date_tz'] = _df['timestamp_tz'].apply(lambda x: x.date())

    return (
        _df.groupby(['date_tz', 'tod', 'region_name'])['observed'].sum()
        .to_frame().reset_index()
        .pivot_table(columns='tod', values='observed', index=['region_name', 'date_tz'])
    )

In [329]:
# Region slugs covered by the time-chunk analysis.
regions = [
    'saopaulo',
    'lima',
    'bogota',
    'buenosaires',
]

In [330]:
# Rows of the selected regions, summed per region, local date and time chunk.
current = df.loc[df['region_slug'].isin(regions)].pipe(time_chunks)

In [331]:
# Baseline series: de-duplicated `expected_2020` values on the `pre-timestamp_tz` timeline,
# renamed to the column names time_chunks expects, limited to the selected regions.
baseline = (
    df
    .drop_duplicates(subset=['region_name','pre-timestamp_tz', 'expected_2020'])
    .loc[:, ['region_name', 'region_slug', 'pre-timestamp_tz', 'expected_2020']]
    .rename(columns={'pre-timestamp_tz': 'timestamp_tz', 'expected_2020': 'observed'})
    .loc[lambda frame: frame['region_slug'].isin(regions)]
    .pipe(time_chunks)
)

In [332]:
def add_dow(_df):
    """Return `_df` with its index reset and a `dow_tz` column holding the ISO weekday of `date_tz`."""
    def _iso_weekday(day):
        return day.isoweekday()

    flat = _df.reset_index()
    return flat.assign(dow_tz=flat['date_tz'].apply(_iso_weekday))

In [333]:
# Baseline totals per region and ISO weekday (`dow_tz`), indexed by (region_name, dow_tz).
baseline = baseline.pipe(add_dow).groupby(['region_name', 'dow_tz']).sum().sort_index()

# Current values keep one row per date but are indexed by the same
# (region_name, dow_tz) pair so they can be matched against the baseline.
current = current.pipe(add_dow).set_index(['region_name', 'dow_tz']).sort_index()

# Keep the dates aside; they are written back after the division below.
date = current['date_tz']

# Current divided by baseline for each time-chunk column.
# NOTE(review): this is a plain ratio, while the chart titles call it "Perc. Change" — confirm the intended scale.
tci_pc = current.divide(baseline)

# Restore the dates (relies on `tci_pc` having the same index as `current` — TODO confirm).
tci_pc['date_tz'] = date

# Replace the weekday level by the date so the final index is (region_name, date_tz).
tci_pc.index = tci_pc.index.droplevel('dow_tz')

tci_pc = tci_pc.set_index('date_tz', append=True)

tci_pc = tci_pc.sort_index()

In [342]:
# One chart per region with every time-chunk column except the 'rest' bucket.
for rn in tci_pc.index.unique('region_name'):
    chunk_columns = [c for c in tci_pc.columns if 'rest' not in c]
    region_ratios = tci_pc.loc[rn][chunk_columns]
    region_ratios.iplot(
        theme='custom',
        title=f'TCI Perc. Change per Time Chunks for {rn}',
        yTitle='TCI Perc. Change',
        asImage=True,
    )

In [192]:
# NOTE(review): `tod` is not assigned in any cell visible in this notebook, and this cell's
# execution count (192) is lower than its neighbours' — it looks like leftover kernel state
# and will presumably raise NameError on a fresh "Restart & Run All". Confirm where `tod`
# should come from, or remove the cell.
# Plots all columns of `tod` for each value of its 'region_name' index level.
for rn in tod.index.unique('region_name'):
    tod.loc[rn].iplot(
        theme='custom',
        title=f'TCI per Time Chunks for {rn}',
        yTitle='TCI', 
        asImage=True
    )

In [43]:
# One chart per region name: the observed series together with the de-duplicated
# `expected_2020` values placed on the `pre-timestamp` timeline.
for slug in df['region_name'].unique():

    region_rows = df.query(f'region_name == "{slug}"')

    temp = (
        region_rows
        .drop_duplicates(subset=['pre-timestamp', 'expected_2020'])
        [['pre-timestamp', 'expected_2020']]
        .rename(columns={'pre-timestamp': 'timestamp', 'expected_2020': 'observed'})
        .set_index('timestamp')
    )
    region_observed = region_rows.set_index('timestamp')[['observed']]

    combined = pd.concat([region_observed, temp]).sort_index()
    combined.iplot(
        theme='custom',
        title=f'TCI {slug}',
        yTitle='TCI (%)',
        asImage=True,
    )

In [376]:
regions

['saopaulo', 'lima', 'bogota', 'buenosaires']

In [382]:
# Load every row of the grouped-hours index table through the open connection.
grouphours_index_query = f"""
    select 
        *
    from spd_sdv_waze_corona.dev_daily_grouphours_index
"""
index = pd.read_sql_query(grouphours_index_query, conn)

In [383]:
def _index_date(row):
    """Build the date of one row from its month/day fields, with the year fixed to 2020."""
    return pd.to_datetime(f'2020-{row["month"]}-{row["day"]}', infer_datetime_format=True)


index['date'] = index.apply(_index_date, axis=1)

In [384]:
index.head(1)

Unnamed: 0,last_updated_utc,region_slug,region_name,country_name,country_iso_code,country_idb_code,region_type,population,timezone,month,dow,day,hour_chunk,expected_2019,expected_2020,ratio_19,ratio_20,observed,tci,date
0,2020-05-07 15:48:52.517,lima,Lima,Peru,PE,PE,city,9609692,America/Lima,3,2,10,rest,3411032.0,4417970.0,1.202243,0.928229,4100889.0,-7.177075,2020-03-10


In [385]:
# `tci` of the selected regions, pivoted to one column per hour chunk
# and indexed by (region_slug, date).
selected_index_rows = index[index['region_slug'].isin(regions)]
s = selected_index_rows.pivot_table(
    columns='hour_chunk',
    index=['region_slug', 'date'],
    values='tci',
).sort_index()

# One interactive chart per region, leaving out the 'rest' chunk.
for r in regions:

    chunk_columns = [c for c in s.columns if 'rest' not in c]
    s.loc[r][chunk_columns].iplot(
        theme='custom',
        title=f'TCI Perc. Change per Time Chunks for {r}',
        yTitle='TCI Perc. Change',
#         asImage=True
    )

In [227]:
from IPython.display import Javascript
from nbconvert import HTMLExporter

def save_notebook():
    """Display a JavaScript snippet that calls `IPython.notebook.save_notebook()` in the front end."""
    save_command = Javascript("IPython.notebook.save_notebook()")
    display(save_command, include=['application/javascript'])

def output_HTML(read_file, output_file):
    """Export a notebook file to HTML.

    Parameters
    ----------
    read_file : str
        Path of the '.ipynb' notebook to read (parsed as nbformat version 4).
    output_file : str
        Path of the '.html' file to write, encoded as UTF-8.
    """
    import codecs
    import nbformat
    exporter = HTMLExporter()
    # read_file is '.ipynb', output_file is '.html'
    output_notebook = nbformat.read(read_file, as_version=4)
    # The exporter also returns a resources value, which is not needed here.
    output, _resources = exporter.from_notebook_node(output_notebook)
    # Write inside a `with` block so the file is flushed and closed even if the write fails.
    with codecs.open(output_file, 'w', encoding='utf-8') as html_file:
        html_file.write(output)
    
def save_now(current_file='CV and Road Length.ipynb', output_file=None, wait_seconds=3):
    """Save the running notebook, then export it to HTML.

    Parameters
    ----------
    current_file : str
        Path of the notebook file to export.
        NOTE(review): confirm the default matches this notebook's actual file name.
    output_file : str or None
        Destination HTML path. When None, 'CV+TCI+ROADLENGTH.html' inside
        OUTPUT_PATH is used.
    wait_seconds : int or float
        Pause between requesting the save and reading the file, giving the
        front end time to write the notebook to disk.
    """
    import time

    if output_file is None:
        output_file = str(OUTPUT_PATH / 'CV+TCI+ROADLENGTH.html')

    save_notebook()
    # The save is requested through the browser, so wait before reading the file back.
    time.sleep(wait_seconds)
    output_HTML(current_file, output_file)