In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import importlib

import itertools as it
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import statsmodels.api as sm

from itertools import product
from scipy.stats import gmean

from criteriaetl.utils.expansion_func import (get_value_counts_with_expansion_factor,
    get_percentage_table_with_expansion_factor)
from criteriaetl.utils.display_func import cdisplay, percentage_count_plot
from criteriaetl.utils.common_func import (get_weighted_complete_randomization_series_on_subset, 
    proportional_cut, weighted_qcut, get_partition_bool_columns_dict)
from criteriaetl.transformers.columns_base import (NameTransformer, 
    ReplaceTransformer, SelectTransformer, AssignTransformer)
from criteriaetl.transformers.rows_base import AggregateTransformer
from criteriaetl.transformers.fusion_base import MergeTransformer

from projectetl.utils.dataload import (load_survey_data_do, save_survey_with_pickle,
                                       load_survey_from_pickle)
from projectetl.utils.config import DATA_DIR

# Load databases

In [None]:
# CPI group that makes up the food basket.
canasta_alimentaria = ['alimentos y bebidas no alcohólicas']

# CPI groups that make up the non-food basket.
canasta_no_alimentaria = [
    'bebidas alcohólicas y tabaco',
    'prendas de vestir y calzado',
    'vivienda',
    'muebles',
    'salud',
    'transporte',
    'comunicaciones',
    'recreación y cultura',
    'educación',
    'restaurantes y hoteles',
    'bienes y servicios diversos',
]

# Each group's index column is named "<group> indice".
canasta_alimentaria_cols = [group + ' indice' for group in canasta_alimentaria]
canasta_no_alimentaria_cols = [group + ' indice' for group in canasta_no_alimentaria]

# All group index columns, food first.
canasta_cols = [*canasta_alimentaria_cols, *canasta_no_alimentaria_cols]

## Load IPC base 2010 = 100

In [None]:
# In the 2010-base file the month column is spelled with an accent.
time_cols = ['anno', 'período']

# Read the CSV, keep only the time and group-index columns, and rename the
# month column to the unaccented 'periodo'.
inflation_2010_raw = (
    load_survey_data_do(
        DATA_DIR / 'inflation' / 'ipc_grupos_base_2010.csv',
        load_func=lambda path: pd.read_csv(str(path)),
    )[time_cols + canasta_cols]
    .rename(columns={'período': 'periodo'})
)
inflation_2010_raw.head()

## Load annual database IPC base 2019 - 2020 = 100

In [None]:
# In the 2019-2020-base file the month column is already named 'periodo'.
time_cols = ['anno', 'periodo']

# Read the CSV and keep only the time and group-index columns.
inflation_19_20_raw = load_survey_data_do(
    DATA_DIR / 'inflation' / 'ipc_grupos_base_19_20.csv',
    load_func=lambda path: pd.read_csv(str(path)),
)[time_cols + canasta_cols]
inflation_19_20_raw.head()

#

# Processing databases

## Replace Transformer

* The `periodo` column is converted from Spanish month names to the corresponding zero-padded month numbers.
* A `date` column is created using `pd.to_datetime`
* The `date` column is set as the index so that a given date can be selected easily with `.loc`.

In [None]:
# Spanish month names in calendar order; the position gives the month number.
months = [
    'Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio',
    'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre',
]

# Replacement map for the 'periodo' column: month name -> '01' .. '12'.
replace_months_map = {
    'periodo': {
        month_name: f'{month_number:02d}'
        for month_number, month_name in enumerate(months, start=1)
    }
}

# Apply the same replacement to both raw series.
replace_months_transformer = ReplaceTransformer(replace_months_map)
inflation_2010_replaced = replace_months_transformer.transform(inflation_2010_raw)
inflation_19_20_replaced = replace_months_transformer.transform(inflation_19_20_raw)

In [None]:
def _index_by_month_start(ipc_df, value_cols):
    """Return ``ipc_df[value_cols]`` indexed by the first day of each row's month.

    Parameters
    ----------
    ipc_df : pd.DataFrame
        Frame with an ``anno`` column (year) and a ``periodo`` column holding
        the month as a two-digit string, so that year + month + '01' parses
        with the ``%Y%m%d`` format.
    value_cols : list of str
        Columns to keep in the result.

    Returns
    -------
    pd.DataFrame
        A new frame indexed by a ``date`` DatetimeIndex. ``ipc_df`` itself is
        left untouched.
    """
    month_start = pd.to_datetime(
        ipc_df['anno'].apply(str) + ipc_df['periodo'] + '01',
        format='%Y%m%d',
    )
    # assign() returns a copy, so the frame handed in is not mutated.
    return ipc_df.assign(date=month_start).set_index('date')[value_cols]


# The names are rebound on purpose: later cells read the date-indexed frames
# under these same names.
inflation_2010_replaced = _index_by_month_start(inflation_2010_replaced, canasta_cols)
inflation_19_20_replaced = _index_by_month_start(inflation_19_20_replaced, canasta_cols)

## Rebase IPC 2010 database to September 2016

In [None]:
# Rebase the 2010-base series: divide every row, column by column, by the
# September 2016 row so that month becomes 1.0.
inflation_2016_10_replaced = inflation_2010_replaced.astype(float).div(
    inflation_2010_replaced.loc['2016-09-01'].astype(float),
    axis='columns',
)

In [None]:
# Preview the first ten rows of the series rebased to September 2016.
inflation_2016_10_replaced.head(10)

In [None]:
# Preview the last ten rows of the series rebased to September 2016.
inflation_2016_10_replaced.tail(10)

## Rebase IPC 2019 - 2020 database to September 2016

In [None]:
# Chain the 2019-2020-base series onto the September-2016 base, linking at
# September 2020: normalise by its own September 2020 row, then scale by the
# rebased 2010 series' value for that same month.
inflation_2016_19_20_replaced = (
    inflation_19_20_replaced.astype(float)
    .div(inflation_19_20_replaced.loc['2020-09-01'].astype(float), axis='columns')
    .mul(inflation_2016_10_replaced.loc['2020-09-01'], axis='columns')
)

In [None]:
# Preview the last ten rows of the 2019-2020-base series after rebasing.
inflation_2016_19_20_replaced.tail(10)

## Append databases

In [None]:
# Extend the rebased 2010 series with the rebased 2019-2020 rows from
# 2020-10-01 through 2021-01-01 (both ends inclusive with .loc label slicing).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# NOTE(review): assumes the 2010-base series has no rows after 2020-09-01;
# otherwise the result would contain duplicated dates — confirm.
inflation_2016_appended = pd.concat([
    inflation_2016_10_replaced,
    inflation_2016_19_20_replaced.loc['2020-10-01':'2021-01-01'],
])
inflation_2016_appended.tail(10)

## Calculate inflation for basic basket

In [None]:
# New column name -> callable that derives it from the group indices:
#   alimentaria    - the food group index column itself
#   no_alimentaria - row-wise geometric mean of the non-food group indices
#   ampliada       - row-wise geometric mean of all group indices
# The column lists are bound as default arguments of each lambda.
inflation_basket_2016_assign_map = dict(
    alimentaria=lambda df, cols=canasta_alimentaria_cols: df[cols],
    no_alimentaria=lambda df, cols=canasta_no_alimentaria_cols: gmean(df[cols], axis=1),
    ampliada=lambda df, cols=canasta_cols: gmean(df[cols], axis=1),
)

inflation_basket_2016_assign_transformer = AssignTransformer(
    inflation_basket_2016_assign_map
)
inflation_basket_2016_assigned = (
    inflation_basket_2016_assign_transformer.transform(inflation_2016_appended)
)

In [None]:
# Keep only the three basket-level indices, dropping the per-group columns.
basket_cols = ['alimentaria', 'no_alimentaria', 'ampliada']
inflation_basket_2016_final = inflation_basket_2016_assigned.loc[:, basket_cols]

In [None]:
# Preview the last five rows (tail's default) of the basket indices.
inflation_basket_2016_final.tail()

# Save IPC base 2016

In [None]:
# Persist the basket indices as a pickle under DATA_DIR/inflation/object.
inflation_basket_2016_pickle_path = (
    DATA_DIR / 'inflation' / 'object' / 'inflation_basket_2016.pkl'
)
save_survey_with_pickle(inflation_basket_2016_final, inflation_basket_2016_pickle_path)

In [None]:
# Month at which the basket indices are evaluated.
reference_date = pd.Timestamp('2020-10-01')

# NOTE(review): presumably the urban and rural basket values at the index base
# period (September 2016) — confirm the source and units of these figures.
urban_base_value = 2167.16
rural_base_value = 2076.49

# Select the row through .loc. DataFrame[...] with a date string is a column
# lookup; the legacy fallback to row selection was removed in pandas 2.0, so
# the bare-bracket form raises KeyError there. The list selector keeps the
# result a one-row DataFrame.
inflation = inflation_basket_2016_final.loc[[reference_date]]
urbana = inflation * urban_base_value
rural = inflation * rural_base_value
print(f'urbana - {urbana}\nrural - {rural}')