# Cleaning anomalies in time series per region slug



## Load libraries

In [3]:
# %load first_cell.py
%reload_ext autoreload
%autoreload 2
from pathlib import Path
home = str(Path.home())

import sys
sys.path = sys.path + [f'{home}/.conda/envs/norm_env/lib/python37.zip', 
                       f'{home}/.conda/envs/norm_env/lib/python3.7', 
                       f'{home}/.conda/envs/norm_env/lib/python3.7/lib-dynload', 
                       f'{home}/.conda/envs/norm_env/lib/python3.7/site-packages']
sys.prefix = '/home/soniame/.conda/envs/norm_env'

from paths import RAW_PATH, TREAT_PATH, OUTPUT_PATH, FIGURES_PATH

In [4]:
import pandas as pd

#### Run single

In [None]:
from src.runners.clean import _run_single 

In [None]:
# Run the cleaning pipeline for one region with reports and plots turned off,
# unpacking the two returned tables into `daily` and `weekly`.
# The keyword spelling `minimun` is the one `_run_single` is called with here.
single_run_options = dict(
    region_slug='aguascalientes',
    anomaly_vote_minimun_s1=1,
    anomaly_vote_minimun_s2=1,
    c_metric='max',
    print_report=False,
    print_plot=False,
)
daily, weekly = _run_single(**single_run_options)

In [None]:
#daily[daily.date > '2020-05-19']

In [None]:
import plotnine as p9
# Make `theme_linedraw` the default theme for every plot drawn afterwards.
default_theme = p9.theme_linedraw()
p9.theme_set(default_theme)

In [None]:
daily.head(3)  # not the last expression of the cell, so it is not rendered
# Observed `tcp` (default colour) against `tcp_clean` (red) for dates after 2021-01-01.
recent_daily = daily[daily.date > '2021-01-01']
compact_theme = p9.theme(figure_size=(6, 3),
                         axis_text_x=p9.element_text(size = 7))
observed_line = p9.geom_line(size= 1, alpha = .7)
cleaned_line = p9.geom_line(p9.aes( y='tcp_clean'), color = 'red', size= 1, alpha = .7)
gg = (p9.ggplot(recent_daily, p9.aes(x = 'date', y='tcp'))
      + observed_line
      + cleaned_line
      + compact_theme)
gg 

In [None]:
daily.head(3)  # not the last expression of the cell, so it is not rendered
# Observed `tcp` over the whole daily series.
compact_theme = p9.theme(figure_size=(6, 3),
                         axis_text_x=p9.element_text(size = 7))
base_plot = p9.ggplot(daily, p9.aes(x = 'date', y='tcp'))
gg = base_plot + p9.geom_line(size= 1, alpha = .7) + compact_theme
gg 

In [None]:
weekly.head(3)  # not the last expression of the cell, so it is not rendered
# Weekly observed `tcp` against `tcp_clean` (red) for weeks starting after 2021-01-01.
recent_weekly = weekly[weekly.date_min > '2021-01-01']
compact_theme = p9.theme(figure_size=(6, 3),
                         axis_text_x=p9.element_text(size = 7))
gg = (p9.ggplot(recent_weekly, p9.aes(x = 'date_min', y='tcp'))
      + p9.geom_line()
      + p9.geom_line(p9.aes( y='tcp_clean'), color = 'red')
      + compact_theme)
gg

In [None]:
# Print the first and last daily date, then the largest weekly `week` value.
first_day = daily.date.min()
print(first_day)
last_day = daily.date.max()
print(last_day)
last_week = weekly.week.max()
print(last_week)

#### Run by batch

In [None]:
from src.runners.clean import _run_batch 

In [None]:
# Launch the batch cleaning run against the shared corona directory.
batch_options = {
    "athena_path": "/home/soniame/shared/spd-sdv-omitnik-waze/corona",
    "c_metric": 'max',
    "f_metric": 20,
}
_run_batch(**batch_options)

## Results Batch

In [None]:
import pandas as pd
from siuba import *

### Daily

#### Experiments of parameters

Experiments with different parameter settings to test the handling of level shifts and outliers.

In [None]:
from functools import reduce

# One (file suffix, column suffix) pair per experiment output file.
experiments = [('ls1', ''), ('ls4', '_ls_4X'), ('ls10', '_ls_10X'),
               ('ls15', '_ls_15X'), ('ls20', '_ls_20X'),
               ('ls100', '_ls_OFF'), ('lsmix', '_ls_MIX')]
# Columns shared by every experiment file; used both to subset and to join.
key_columns = ['date', 'region_slug', 'tci_observed', 'tcp_observed']

df_list = []
for file, metric in experiments:
    print(file)
    df = pd.read_csv(f"/home/soniame/shared/spd-sdv-omitnik-waze/corona/cleaning/daily/daily_daily_index_max_{file}.csv")
    # Tag the cleaned columns with the experiment suffix so they survive the merge.
    renamed = {'tci_clean':f'tci_cleaned{metric}',
               'tcp_clean':f'tcp_cleaned{metric}'}
    df = df[key_columns + ['tci_clean', 'tcp_clean']].rename(columns = renamed)
    print(df.shape)
    df_list.append(df)

# Inner-join all experiment tables on the shared key columns.
df_union = reduce(lambda x, y: pd.merge(x, y, on = key_columns), df_list)
print(df_union.shape)
print(df_union.date.max())
df_union.head(2)    

#### Export to csv

In [None]:
# Summary statistics of the merged daily experiment table (rendered as the cell output).
df_union.describe()

In [None]:
# Persist the merged daily experiment table without the pandas index.
daily_union_path = "/home/soniame/private/daily_index_index.csv"
df_union.to_csv(daily_union_path, index=False)

In [None]:
#print(df_union.date.max())
#df_union[df_union.region_slug == 'lima'].tail()

In [None]:
import plotnine as p9
# Observed TCI for one region (default colour) overlaid with the cleaned series
# from several experiments, one colour per experiment.
mexico = df_union[df_union.region_slug == 'country_mexico']
gg = (p9.ggplot(mexico,
                    p9.aes(x = 'date', y='tci_observed', group = 'region_slug')) 
 + p9.geom_line() 
 # `tci_cleaned` is drawn once, in gray. The original added a second blue layer
 # for the same column, which hid this line and reused the colour of `ls_4X`.
 + p9.geom_line(p9.aes(y = 'tci_cleaned'), color = 'gray')
 + p9.geom_line(p9.aes(y = 'tci_cleaned_ls_20X'), color = 'red')
 + p9.geom_line(p9.aes(y = 'tci_cleaned_ls_4X'), color = 'blue')
 + p9.geom_line(p9.aes(y = 'tci_cleaned_ls_OFF'), color = 'green')
 + p9.geom_line(p9.aes(y = 'tci_cleaned_ls_MIX'), color = 'orange'))
#gg

#### Athena historic

In [7]:
import boto3
import awswrangler as wr
from src import utils

# Keep the observed series and the `ls_20X` cleaned series from the saved table,
# renaming the latter to `*_smoothed`.
historic_columns = ['date', 'region_slug',
                    'tci_observed', 'tcp_observed',
                    'tci_cleaned_ls_20X', 'tcp_cleaned_ls_20X']
smoothed_names = {'tci_cleaned_ls_20X':'tci_smoothed',
                  'tcp_cleaned_ls_20X':'tcp_smoothed'}
daily_history = pd.read_csv("/home/soniame/private/daily_index_index-202105.csv", index_col=0)
df_union_daily = daily_history[historic_columns].rename(columns = smoothed_names)

# Connection reused by later cells to query Athena.
conn = utils.connect_athena(path='../configs/athena.yaml')

# Write the table as a parquet dataset; mode="overwrite" replaces what is there.
s3_path = "s3://iadbprod-public-stata-as-a-service/spd-sdv-omitnik-waze/corona"
aws_session = boto3.Session(region_name="us-east-1")
res = wr.s3.to_parquet(
        df=df_union_daily,
        path=f"{s3_path}/athena/prod/smooth/prod_daily_daily_smooth_historical",
        dataset=True,
        database='spd_sdv_waze_corona',
        table='prod_daily_daily_smooth_historical',
        mode="overwrite",
        boto3_session=aws_session,
    )

#### Export update 


In [None]:
# Previously published history, read back from Athena.
# NOTE(review): the "Athena historic" cell in this notebook writes this table
# with columns `tci_smoothed`/`tcp_smoothed`, while this query asks for
# `tci_smooth`/`tcp_smooth` -- confirm which names the live table has.
df_prev = pd.read_sql_query(f"""
    select 
        date, region_slug, tci_observed, tcp_observed,  tci_smooth, tcp_smooth
    from spd_sdv_waze_corona.prod_daily_daily_smooth_historical""", conn)

# Latest ls20 run. Its cleaned columns are named `*_clean` (as selected from the
# same file in the experiments cell), so select those and rename them to the
# published `*_smooth` names. The original selected `*_smooth` directly, which
# does not match those column names.
df_new = pd.read_csv("/home/soniame/shared/spd-sdv-omitnik-waze/corona/cleaning/daily/daily_daily_index_max_ls20.csv")
is_new = df_new.date > df_prev.date.max()
df_new = df_new.loc[is_new, ['date', 'region_slug', 
                             'tci_observed', 'tcp_observed', 
                             'tcp_clean', 'tci_clean']] \
    .rename(columns = {'tcp_clean':'tcp_smooth', 'tci_clean':'tci_smooth'})

# Append only the rows newer than the published history and save the result.
df_daily_update = pd.concat([df_prev, df_new])
df_daily_update.to_csv("/home/soniame/private/daily_index_index_update.csv", index=False)

print(df_daily_update.date.max())
print(df_prev.date.max())
print(df_new.date.max())

In [None]:
import plotnine as p9
# Observed TCI against the smoothed TCI (red) for one region of the updated table.
buenosaires = df_daily_update[df_daily_update.region_slug == 'buenosaires']
observed_aes = p9.aes(x = 'date', y='tci_observed', group = 'region_slug')
gg = (p9.ggplot(buenosaires, observed_aes)
      + p9.geom_line()
      + p9.geom_line(p9.aes(y = 'tci_smooth'), color = 'red'))
#gg

#### Long format

In [None]:
from siuba import *
# Wide -> long: one row per (row of df_daily_update, measure) with its value.
df_update_long = (df_daily_update 
 >> gather('measure', 'value', _["tci_observed":"tcp_smooth"])
)
# Label each measure as TCI or TCP. The gathered measure names are the column
# names of df_daily_update, i.e. `tci_smooth`; the original only tested for
# `tci_smoothed`, which left the smoothed TCI series labelled 'TCP'.
# Both spellings are accepted.
tci_exist = df_update_long['measure'].isin(['tci_observed', 'tci_smooth', 'tci_smoothed'])
df_update_long['type'] = 'TCP'
# .loc instead of chained `df['type'][mask] = ...`, which may assign to a copy.
df_update_long.loc[tci_exist, 'type'] = 'TCI'

# Earlier long table, minus the measures listed below.
df_prev_long = pd.read_csv("/home/soniame/private/daily_daily_index_long-202105.csv", index_col=0)
dropped_measures = ['tci_observed',  'tcp_observed', 'tci_cleaned', 'tcp_cleaned']
df_prev_long = df_prev_long[~df_prev_long.measure.isin(dropped_measures)]

print(df_prev_long.measure.unique())

df_daily_long = pd.concat([df_prev_long, df_update_long])
df_daily_long.measure.unique()

In [None]:
# Save the combined long table (the pandas index is written as the first column).
daily_long_path = "/home/soniame/private/daily_daily_index_long.csv"
df_daily_long.to_csv(daily_long_path)

In [None]:
# One line per measure for a single region of the daily long table.
argentina_daily = df_daily_long[df_daily_long.region_slug == 'country_argentina']
per_measure = p9.aes(x = 'date', y='value',
                     colour = 'measure', group = 'measure')
gg = p9.ggplot(argentina_daily, per_measure) + p9.geom_line()
#gg

In [None]:
# Preview the first rows of the combined daily long table.
df_daily_long.head()

In [None]:
# Observed vs. smoothed TCI for one region after 2021-02-01.
# The smoothed measure is accepted under both spellings: the update table built
# in this notebook uses `tci_smooth` (as the weekly counterpart of this plot
# does), while the original filter only listed `tci_smoothed`.
peru_tci = df_daily_long[(df_daily_long.region_slug == 'country_peru') & 
                         (df_daily_long.type == 'TCI') & 
                         (df_daily_long.date > '2021-02-01') &
                         (df_daily_long.measure.isin(['tci_observed', 'tci_smooth', 'tci_smoothed']))]
gg = (p9.ggplot(peru_tci,
                    p9.aes(x = 'date', y='value', 
                           colour = 'measure', group = 'measure')) 
 + p9.geom_line()
 + p9.facet_wrap('region_slug')
 + p9.theme(axis_text_x=p9.element_text(angle=90),
                figure_size=(10, 5) )
)
gg

### Weekly

#### Experiments of parameters

In [None]:
from functools import reduce

# One (file suffix, column suffix) pair per experiment output file.
experiments = [('ls1', ''), ('ls4', '_ls_4X'), ('ls10', '_ls_10X'),
               ('ls15', '_ls_15X'), ('ls20', '_ls_20X'),
               ('ls100', '_ls_OFF'), ('lsmix', '_ls_MIX')]
# Columns kept from every file, in the original selection order.
kept_columns = ['week', 'days_num', 'date_min', 'region_slug',
                'tci_observed', 'tcp_observed', 'tci_clean', 'tcp_clean']
# Join keys shared by every experiment table.
join_keys = ['week', 'days_num', 'region_slug', 'date_min', 'tci_observed', 'tcp_observed']

df_list = []
for file, metric in experiments:
    print(file)
    df = pd.read_csv(f"/home/soniame/shared/spd-sdv-omitnik-waze/corona/cleaning/weekly/weekly_weekly_index_max_{file}.csv")
    # Tag the cleaned columns with the experiment suffix so they survive the merge.
    renamed = {'tci_clean':f'tci_cleaned{metric}',
               'tcp_clean':f'tcp_cleaned{metric}'}
    df = df[kept_columns].rename(columns = renamed)
    print(df.shape)
    df_list.append(df)

# Inner-join all experiment tables on the shared keys.
df_union = reduce(lambda x, y: pd.merge(x, y, on = join_keys), df_list)
print(df_union.shape)
df_union.head(2)    

##### Long format experiments

In [None]:
# Load the saved weekly experiment table and preview two rows.
weekly_index_path = "/home/soniame/private/weekly_weekly_index.csv"
df_weekly = pd.read_csv(weekly_index_path)
df_weekly.head(2)

In [None]:
# Wide -> long for the weekly experiments: keep the identifier columns, then
# gather every TCI/TCP series into (measure, value) rows.
df_weekly_long = (df_weekly 
 >> select('week', 'days_num', 'date_min', 'region_slug', 
           'tci_observed',  'tci_cleaned', 
           'tci_cleaned_ls_4X', 'tci_cleaned_ls_20X',
           'tci_cleaned_ls_10X', 'tci_cleaned_ls_15X',
           'tci_cleaned_ls_OFF', 'tci_cleaned_ls_MIX',
           'tcp_observed', 'tcp_cleaned', 
           'tcp_cleaned_ls_4X', 'tcp_cleaned_ls_20X', 
           # Fixed copy-paste slip: the TCP group repeated the *tci* 10X/15X
           # names, so the tcp 10X/15X series were never selected.
           'tcp_cleaned_ls_10X', 'tcp_cleaned_ls_15X',
           'tcp_cleaned_ls_OFF', 'tcp_cleaned_ls_MIX')
 >> gather('measure', 'value', _["tci_observed":"tcp_cleaned_ls_MIX"])
 >> mutate(type = 'TCP')
)
# Every row starts as 'TCP'; relabel the TCI measures.
tci_exist = df_weekly_long['measure'].isin(['tci_observed',  'tci_cleaned', 
           'tci_cleaned_ls_4X', 'tci_cleaned_ls_20X', 
           'tci_cleaned_ls_10X', 'tci_cleaned_ls_15X',
           'tci_cleaned_ls_OFF', 'tci_cleaned_ls_MIX'])
# .loc instead of chained `df['type'][mask] = ...`, which may assign to a copy.
df_weekly_long.loc[tci_exist, 'type'] = 'TCI'

print(df_weekly_long.type.unique())
print(df_weekly_long.measure.unique())
head(df_weekly_long)

In [None]:
#df_weekly_long.to_csv("/home/soniame/private/weekly_weekly_index_long-202105.csv")

#### Export to csv

In [None]:
# Summary statistics of the merged weekly experiment table (rendered as the cell output).
df_union.describe()

In [None]:
# Print the largest `week` and `date_min` in the merged table, then preview it.
latest_week = df_union.week.max()
print(latest_week)
latest_week_start = df_union.date_min.max()
print(latest_week_start)
df_union.head()

In [None]:
# Persist the merged weekly experiment table without the pandas index.
weekly_union_path = "/home/soniame/private/weekly_weekly_index.csv"
df_union.to_csv(weekly_union_path, index=False)

In [None]:
# Observed weekly TCI against the default cleaned series (blue) for one region.
kingston = df_union[df_union.region_slug == 'kingston']
observed_aes = p9.aes(x = 'week', y='tci_observed', group = 'region_slug')
gg = (p9.ggplot(kingston, observed_aes)
      + p9.geom_line()
      + p9.geom_line(p9.aes( y='tci_cleaned'), color = 'blue'))
#gg

#### Athena historic

In [8]:
import boto3
import awswrangler as wr
from src import utils

# Keep the observed series and the `ls_20X` cleaned series from the saved table,
# renaming the latter to `*_smoothed`.
historic_columns = ['date_min', 'region_slug',
                    'tci_observed', 'tcp_observed',
                    'tci_cleaned_ls_20X', 'tcp_cleaned_ls_20X']
smoothed_names = {'tci_cleaned_ls_20X':'tci_smoothed',
                  'tcp_cleaned_ls_20X':'tcp_smoothed'}
weekly_history = pd.read_csv("/home/soniame/private/weekly_weekly_index-202105.csv", index_col=0)
df_union_weekly = weekly_history[historic_columns].rename(columns = smoothed_names)

# Connection reused by later cells to query Athena.
conn = utils.connect_athena(path='../configs/athena.yaml')

# Write the table as a parquet dataset; mode="overwrite" replaces what is there.
s3_path = "s3://iadbprod-public-stata-as-a-service/spd-sdv-omitnik-waze/corona"
aws_session = boto3.Session(region_name="us-east-1")
res = wr.s3.to_parquet(
        df=df_union_weekly,
        path=f"{s3_path}/athena/prod/smooth/prod_daily_weekly_smooth_historical",
        dataset=True,
        database='spd_sdv_waze_corona',
        table='prod_daily_weekly_smooth_historical',
        mode="overwrite",
        boto3_session=aws_session,
    )

#### Export update

In [None]:
# Previously published weekly history, read back from Athena.
# NOTE(review): the "Athena historic" cell in this notebook writes this table
# with columns `tci_smoothed`/`tcp_smoothed`, while this query asks for
# `tci_smooth`/`tcp_smooth` -- confirm which names the live table has.
df_prev = pd.read_sql_query(f"""
    select 
        date_min, region_slug, tci_observed, tcp_observed,  tci_smooth, tcp_smooth
    from spd_sdv_waze_corona.prod_daily_weekly_smooth_historical""", conn)

# Cut-off: only weeks starting after this value are appended.
latest_published = df_prev.date_min.max()
df_prev.head(2)

# Latest ls20 run, restricted to the new weeks, with `*_clean` renamed to `*_smooth`.
update_columns = ['date_min', 'region_slug',
                  'tci_observed', 'tcp_observed',
                  'tcp_clean', 'tci_clean']
smooth_names = {'tcp_clean':'tcp_smooth', 'tci_clean':'tci_smooth'}
weekly_ls20 = pd.read_csv(f"/home/soniame/shared/spd-sdv-omitnik-waze/corona/cleaning/weekly/weekly_weekly_index_max_ls20.csv")
is_new = weekly_ls20.date_min > latest_published
df_new = weekly_ls20[is_new][update_columns].rename(columns = smooth_names)
df_new.head(2)

# History plus new weeks, saved without the pandas index.
df_weekly_update = pd.concat([df_prev, df_new])
df_weekly_update.to_csv("/home/soniame/private/weekly_index_index_update.csv", index=False)
df_weekly_update.head(2)

print(df_weekly_update.date_min.max())
print(latest_published)
print(df_new.date_min.max())

In [None]:
import plotnine as p9
# Observed TCI against the smoothed TCI (red) for one region of the updated weekly table.
argentina_weekly = df_weekly_update[df_weekly_update.region_slug == 'country_argentina']
observed_aes = p9.aes(x = 'date_min', y='tci_observed', group = 'region_slug')
gg = (p9.ggplot(argentina_weekly, observed_aes)
      + p9.geom_line()
      + p9.geom_line(p9.aes(y = 'tci_smooth'), color = 'red'))
#gg

#### Long format 

In [None]:
from siuba import *
# Wide -> long: one row per (row of df_weekly_update, measure) with its value.
df_update_long = (df_weekly_update 
 >> gather('measure', 'value', _["tci_observed":"tcp_smooth"])
)

# Label each measure as TCI or TCP.
tci_exist = df_update_long['measure'].isin(['tci_observed',  'tci_smooth'])
df_update_long['type'] = 'TCP'
# .loc instead of chained `df['type'][mask] = ...`, which may assign to a copy.
df_update_long.loc[tci_exist, 'type'] = 'TCI'

# Earlier long table, minus the measures listed below.
df_prev_long = pd.read_csv("/home/soniame/private/weekly_weekly_index_long-202105.csv", index_col=0)
dropped_measures = ['tci_observed',  'tcp_observed', 'tci_cleaned', 'tcp_cleaned']
df_prev_long = df_prev_long[~df_prev_long.measure.isin(dropped_measures)]
print(df_prev_long.measure.unique())

df_weekly_long = pd.concat([df_prev_long, df_update_long])
df_weekly_long.measure.unique()

In [None]:
# Save the combined weekly long table (the pandas index is written as the first column).
weekly_long_path = "/home/soniame/private/weekly_weekly_index_long.csv"
df_weekly_long.to_csv(weekly_long_path)

In [None]:
# One line per measure for a single region of the weekly long table.
argentina_long = df_weekly_long[df_weekly_long.region_slug == 'country_argentina']
per_measure = p9.aes(x = 'date_min', y='value',
                     colour = 'measure', group = 'measure')
gg = p9.ggplot(argentina_long, per_measure) + p9.geom_line()
#gg

In [None]:
# Observed vs. smoothed weekly TCI for one region, weeks starting after 2021-02-01.
keep_rows = ((df_weekly_long.region_slug == 'country_argentina') &
             (df_weekly_long.type == 'TCI') &
             (df_weekly_long.date_min > '2021-02-01') &
             (df_weekly_long.measure.isin(['tci_observed', 'tci_smooth'])))
per_measure = p9.aes(x = 'date_min', y='value',
                     colour = 'measure', group = 'measure')
rotated_labels = p9.theme(axis_text_x=p9.element_text(angle=90),
                          figure_size=(8, 6) )
gg = (p9.ggplot(df_weekly_long[keep_rows], per_measure)
      + p9.geom_line()
      + rotated_labels)
gg