# Data Processing - Intermediate to Processed (not Model Ready)

## Preparation

In [1]:
from functools import reduce

import numpy
import pandas

import data_processing

In [2]:
# Country whose intermediate data set this notebook processes.
country = 'canada'

# Input files live in a per-country folder; the output folder has no
# country component (the country only appears in the output file name).
source_path = '../../intermediate/' + country + '/'
sink_path = '../../processed/'

## Load Intermediate Data

In [3]:
# Every frame below is read from a CSV file located under `source_path`.

# --- target ---
df_unemployment_rate = pandas.read_csv(f'{source_path}unemployment_rate.csv')

# --- energy ---
df_electricity = pandas.read_csv(f'{source_path}monthly_electricity_statistics.csv')

# --- market activity ---
df_production = pandas.read_csv(f'{source_path}imf_production.csv')
df_prices = pandas.read_csv(f'{source_path}imf_prices.csv')
df_labour = pandas.read_csv(f'{source_path}imf_labour.csv')

# --- macro-economics: figures ---
df_gdp_current_unadjusted = pandas.read_csv(f'{source_path}gdp_current_unadjusted.csv')
df_gdp_current_adjusted = pandas.read_csv(f'{source_path}gdp_current_adjusted.csv')
df_gdp_constant_adjusted = pandas.read_csv(f'{source_path}gdp_constant_adjusted.csv')

# --- macro-economics: indicators ---
df_consumer_confidence = pandas.read_csv(f'{source_path}consumer_confidence_index.csv')
df_business_confidence = pandas.read_csv(f'{source_path}business_confidence_index.csv')

# --- central bank and government intervention ---
df_interest_rates = pandas.read_csv(f'{source_path}central_banking_interest_rates.csv')

## Transform

In [4]:
# Pass each loaded frame through its transform helper from the project-local
# `data_processing` module and bind the result back to the same name.
#
# NOTE(review): because every name is rebound in place, re-running this cell
# without re-running the load cell feeds already-transformed frames back into
# the helpers. Whether the helpers tolerate that is not visible here, so
# re-run the load cell first.

# target
df_unemployment_rate = data_processing.t_unemployment_rate(df_unemployment_rate)

# energy
df_electricity = data_processing.t_electricity(df_electricity)

# market activity
df_production = data_processing.t_production(df_production)
df_prices = data_processing.t_prices(df_prices)
df_labour = data_processing.t_labour(df_labour)

# GDP figures: both current-price frames (unadjusted and adjusted) go through
# the same helper, t_gdp_current; the constant-price frame has its own.
df_gdp_current_unadjusted = data_processing.t_gdp_current(df_gdp_current_unadjusted)
df_gdp_current_adjusted = data_processing.t_gdp_current(df_gdp_current_adjusted)
df_gdp_constant_adjusted = data_processing.t_gdp_constant(df_gdp_constant_adjusted)

# confidence indicators
df_consumer_confidence = data_processing.t_consumer_confidence(df_consumer_confidence)
df_business_confidence = data_processing.t_business_confidence(df_business_confidence)

# central bank and government intervention
df_interest_rates = data_processing.t_interest_rates(df_interest_rates)

## Combine

In [5]:
# Frames to be combined, listed in the order in which they are merged.
dfs: list[pandas.DataFrame] = [
    # target
    df_unemployment_rate,
    # energy
    df_electricity,
    # market activity
    df_production,
    df_prices,
    df_labour,
    # GDP figures
    df_gdp_current_unadjusted,
    df_gdp_current_adjusted,
    df_gdp_constant_adjusted,
    # confidence indicators
    df_consumer_confidence,
    df_business_confidence,
    # interest rates
    df_interest_rates,
]

In [6]:
def _outer_join_on_time(left, right):
    """Outer-join two frames on their shared 'Time' column."""
    return pandas.merge(left, right, on = ['Time'], how = 'outer')


# Fold the list pairwise into one wide frame, then order the rows by 'Time'.
df_all = reduce(_outer_join_on_time, dfs).sort_values(by = 'Time')

## Clean and Test Index

In [7]:
# Turn the literal '...' cells into real missing values.
# NOTE(review): '...' is presumably the source's "not available" marker - confirm.
# `numpy.nan` is used instead of `numpy.NaN`: the capitalised alias was removed
# in NumPy 2.0 and raises AttributeError there, while `numpy.nan` exists in
# every NumPy version.
df_all = df_all.replace(to_replace = '...', value = numpy.nan)

In [8]:
# Index check only: the re-indexed copy is displayed but not assigned, so
# `df_all` keeps 'Time' as a regular column. `verify_integrity = True` makes
# pandas raise if 'Time' contains duplicate values.
df_all.set_index('Time', verify_integrity = True)

Unnamed: 0_level_0,Unemployment_Rate_MEN,Unemployment_Rate_TOT,Unemployment_Rate_WOMEN,Electricity_Distribution_Losses,Electricity_Final_Consumption_(Calculated),Economic_Activity|Industrial_Production|Index|Base_Year-2010=100,Economic_Activity|Industrial_Production|Manufacturing|Index|Base_Year-2010=100,Economic_Activity|Oil_Production|Crude|Index|Base_Year-2010=100,Industrial_Production|Seasonally_adjusted|Index|Base_Year-2010=100,Financial_Market_Prices|Equities|End_of_Period|Index|Base_Year-None,...,Statistical_Discrepancy_in_GDP|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0,Consumer_Confidence,Business_Confidence,Central_Bank_Policy_Rate,Deposit_Rate,Government_Bonds,Lending_Rate,Money_Market_Rate,Savings_Rate,Treasury_Bill_Rate
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1955-01-01,,4.8,,,,,,,,,...,,,,,,,,,,
1955-02-01,,4.7,,,,,,,,,...,,,,,,,,,,
1955-03-01,,4.9,,,,,,,,,...,,,,,,,,,,
1955-04-01,,4.5,,,,,,,,,...,,,,,,,,,,
1955-05-01,,4.5,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-01,5.3,5.1,4.9,2892.6603,47497.8413,,,,,,...,270.266667,96.640571,,,,,,,,
2022-12-01,5.2,5.0,4.8,3293.5636,53997.4223,,,,,,...,254.000000,96.707715,,,,,,,,
2023-01-01,5.1,5.0,4.8,3206.3869,54407.0291,,,,,,...,,96.967366,,,,,,,,
2023-02-01,5.2,5.0,4.9,,,,,,,,...,,97.113943,,,,,,,,


## Save

In [9]:
# Write the combined frame to '<sink_path><country>.xlsx'. The row index is
# left out of the file; 'Time' is still written because it is a column.
output_file = f'{sink_path}{country}.xlsx'
df_all.to_excel(excel_writer = output_file, index = False)

## Check Constituent Datasets

In [10]:
# Show the column labels of the transformed target frame (unemployment rate).
df_unemployment_rate.columns

Index(['Time', 'Unemployment_Rate_MEN', 'Unemployment_Rate_TOT',
       'Unemployment_Rate_WOMEN'],
      dtype='object', name='SUBJECT')

In [11]:
# Show the column labels of the transformed electricity frame.
df_electricity.columns

Index(['Time', 'Electricity_Distribution_Losses',
       'Electricity_Final_Consumption_(Calculated)'],
      dtype='object', name='Product_Balance')

In [12]:
# Show the column labels of the transformed production frame.
df_production.columns

Index(['Time',
       'Economic_Activity|Industrial_Production|Index|Base_Year-2010=100',
       'Economic_Activity|Industrial_Production|Manufacturing|Index|Base_Year-2010=100',
       'Economic_Activity|Oil_Production|Crude|Index|Base_Year-2010=100',
       'Industrial_Production|Seasonally_adjusted|Index|Base_Year-2010=100'],
      dtype='object', name='Indicator')

In [13]:
# Show the column labels of the transformed prices frame.
# NOTE(review): the two equity labels end in 'Base_Year-None' - presumably the
# base year was missing when the label was built; confirm in data_processing.
df_prices.columns

Index(['Time',
       'Financial_Market_Prices|Equities|End_of_Period|Index|Base_Year-None',
       'Financial_Market_Prices|Equities|Index|Base_Year-None',
       'Prices|Consumer_Price_Index|All_items|Index|Base_Year-2010=100',
       'Prices|Producer_Price_Index|All_Commodities|Index|Base_Year-2010=100'],
      dtype='object', name='Indicator')

In [14]:
# Show the column labels of the transformed labour frame.
# NOTE(review): this frame carries 'Labor_Markets|Unemployment_Rate|Percent',
# which may overlap with the target's unemployment-rate columns - confirm
# before using it as a model feature.
df_labour.columns

Index(['Time', 'Employment|Persons|Number_of', 'Labor_Force|Persons|Number_of',
       'Labor_Markets|Unemployment_Rate|Percent',
       'Unemployment|Persons|Number_of'],
      dtype='object', name='Indicator')

In [15]:
# Show the column labels of the transformed current-price, unadjusted GDP frame.
# NOTE(review): the displayed labels spell the adjustment as 'Undjusted';
# presumably this comes from the upstream label construction - confirm before
# matching on these names.
df_gdp_current_unadjusted.columns

Index(['Time', 'Change_in_Inventories|Nominal|Undjusted|Domestic_Currency',
       'Exports_of_Goods_and_Services|Nominal|Undjusted|Domestic_Currency',
       'Government_Consumption_Expenditure|Nominal|Undjusted|Domestic_Currency',
       'Gross_Domestic_Product|Nominal|Undjusted|Domestic_Currency',
       'Gross_Fixed_Capital_Formation|Nominal|Undjusted|Domestic_Currency',
       'Household_Consumption_Expenditure|incl._NPISHs|Nominal|Undjusted|Domestic_Currency',
       'Imports_of_Goods_and_Services|Nominal|Undjusted|Domestic_Currency',
       'Statistical_Discrepancy_in_GDP|Nominal|Undjusted|Domestic_Currency'],
      dtype='object', name='Indicator')

In [16]:
# Show the column labels of the transformed current-price, seasonally adjusted GDP frame.
# NOTE(review): the government-consumption label uses 'Seasonally_adjusted'
# (lower-case 'a') while the other labels use 'Seasonally_Adjusted'.
df_gdp_current_adjusted.columns

Index(['Time',
       'Change_in_Inventories|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Exports_of_Goods_and_Services|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Government_Final_Consumption_Expenditure|Nominal|Seasonally_adjusted|Domestic_Currency',
       'Gross_Domestic_Product|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Gross_Fixed_Capital_Formation|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Household_Consumption_Expenditure|incl._NPISHs|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Imports_of_Goods_and_Services|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Statistical_Discrepancy_in_GDP|Nominal|Seasonally_Adjusted|Domestic_Currency'],
      dtype='object', name='Indicator')

In [17]:
# Show the column labels of the transformed constant-price, seasonally adjusted GDP frame.
# NOTE(review): the deflator label shows 'Base_Year-2010.0' while the other
# labels show 'Base_Year-2012.0' - confirm this difference is expected.
df_gdp_constant_adjusted.columns

Index(['Time',
       'Change_in_Inventories|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Exports_of_Goods_and_Services|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Government_Final_Consumption_Expenditure|Real|Seasonally_adjusted|Domestic_Currency|Base_Year-2012.0',
       'Gross_Domestic_Product|Deflator|Seasonally_Adjusted|Base_Year-2010.0',
       'Gross_Domestic_Product|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Gross_Fixed_Capital_Formation|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Household_Consumption_Expenditure|incl._NPISHs|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Imports_of_Goods_and_Services|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0',
       'Statistical_Discrepancy_in_GDP|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2012.0'],
      dtype='object', name='Indicator')

In [18]:
# Show the column labels of the transformed consumer-confidence frame.
df_consumer_confidence.columns

Index(['Time', 'Consumer_Confidence'], dtype='object')

In [19]:
# Show the column labels of the transformed business-confidence frame.
df_business_confidence.columns

Index(['Time', 'Business_Confidence'], dtype='object')

In [20]:
# Show the column labels of the transformed interest-rates frame.
df_interest_rates.columns

Index(['Time', 'Central_Bank_Policy_Rate', 'Deposit_Rate', 'Government_Bonds',
       'Lending_Rate', 'Money_Market_Rate', 'Savings_Rate',
       'Treasury_Bill_Rate'],
      dtype='object', name='Indicator')