# Data Processing - Intermediate to Processed (not Model Ready)

## Preparation

In [1]:
from functools import reduce

import numpy
import pandas

import data_processing

In [2]:
# Country whose intermediate datasets this notebook processes; it selects the
# input sub-folder and names the output workbook.
country = 'italy'

# Output folder (not country-specific) and per-country input folder.
sink_path = '../../processed/'
source_path = f'../../intermediate/{country}/'

## Load Intermediate Data

In [3]:
# Every intermediate CSV sits directly under `source_path` and is read with
# pandas' default parsing options.

# --- target ---
df_unemployment_rate = pandas.read_csv(f'{source_path}unemployment_rate.csv')

# --- energy ---
df_electricity = pandas.read_csv(f'{source_path}monthly_electricity_statistics.csv')
df_gas_trade_balance = pandas.read_csv(f'{source_path}gas_trade_balance.csv')

# --- market activity ---
df_production = pandas.read_csv(f'{source_path}imf_production.csv')
df_prices = pandas.read_csv(f'{source_path}imf_prices.csv')
df_labour = pandas.read_csv(f'{source_path}imf_labour.csv')

# --- macro-economics: figures ---
df_gdp_current_unadjusted = pandas.read_csv(f'{source_path}gdp_current_unadjusted.csv')
df_gdp_current_adjusted = pandas.read_csv(f'{source_path}gdp_current_adjusted.csv')
df_gdp_constant_unadjusted = pandas.read_csv(f'{source_path}gdp_constant_unadjusted.csv')
df_gdp_constant_adjusted = pandas.read_csv(f'{source_path}gdp_constant_adjusted.csv')

# --- macro-economics: indicators ---
df_consumer_confidence = pandas.read_csv(f'{source_path}consumer_confidence_index.csv')
df_business_confidence = pandas.read_csv(f'{source_path}business_confidence_index.csv')

# --- central bank and government intervention ---
df_interest_rates = pandas.read_csv(f'{source_path}central_banking_interest_rates.csv')

## Transform

In [4]:
# Pass each loaded frame through its matching `data_processing.t_*` transform and
# rebind the same name to the result.
# NOTE(review): because every name is overwritten with the transform's output,
# re-running this cell without first re-running the load cell feeds already
# transformed frames back into the transforms — presumably not intended; confirm
# the transforms tolerate that or always re-run from the load cell.

# target
df_unemployment_rate = data_processing.t_unemployment_rate(df_unemployment_rate)

# energy
df_electricity = data_processing.t_electricity(df_electricity)
df_gas_trade_balance = data_processing.t_gas_trade_balance(df_gas_trade_balance)

# market activity
df_production = data_processing.t_production(df_production)
df_prices = data_processing.t_prices(df_prices)
df_labour = data_processing.t_labour(df_labour)

# macro-economics, figures: both current-price frames share `t_gdp_current`,
# both constant-price frames share `t_gdp_constant`
df_gdp_current_unadjusted = data_processing.t_gdp_current(df_gdp_current_unadjusted)
df_gdp_current_adjusted = data_processing.t_gdp_current(df_gdp_current_adjusted)
df_gdp_constant_unadjusted = data_processing.t_gdp_constant(df_gdp_constant_unadjusted)
df_gdp_constant_adjusted = data_processing.t_gdp_constant(df_gdp_constant_adjusted)

# macro-economics, indicators
df_consumer_confidence = data_processing.t_consumer_confidence(df_consumer_confidence)
df_business_confidence = data_processing.t_business_confidence(df_business_confidence)

# central bank and government intervention
df_interest_rates = data_processing.t_interest_rates(df_interest_rates)

## Combine

In [5]:
# Frames to be merged, in merge order: the target first, then the feature groups.
dfs: list[pandas.DataFrame] = [
    # target
    df_unemployment_rate,
    # energy
    df_electricity,
    df_gas_trade_balance,
    # market activity
    df_production,
    df_prices,
    df_labour,
    # macro-economics: figures
    df_gdp_current_unadjusted,
    df_gdp_current_adjusted,
    df_gdp_constant_unadjusted,
    df_gdp_constant_adjusted,
    # macro-economics: indicators
    df_consumer_confidence,
    df_business_confidence,
    # central bank and government intervention
    df_interest_rates,
]

In [6]:
# Fold the frames left-to-right into one wide table: each step outer-joins the
# next frame on 'Time', so a 'Time' value present in any frame is kept. The
# merged table is then ordered by 'Time'.
df_all = reduce(
    lambda merged, frame: merged.merge(frame, how='outer', on=['Time']),
    dfs,
).sort_values(by='Time')

## Clean and Test Index

In [7]:
# Replace every cell equal to the string '...' with NaN.
# `numpy.nan` is used rather than the `numpy.NaN` alias: the alias was removed in
# NumPy 2.0 (AttributeError there), while `numpy.nan` exists in every version.
# NOTE(review): this only swaps values — presumably columns that held '...'
# remain object-typed afterwards; confirm whether a numeric cast is wanted here.
df_all = df_all.replace(to_replace='...', value=numpy.nan)

In [8]:
# Integrity check only: with verify_integrity=True pandas raises if 'Time'
# contains duplicate values. The indexed frame is displayed but deliberately not
# assigned, so `df_all` keeps 'Time' as an ordinary column.
df_all.set_index('Time', verify_integrity=True)

Unnamed: 0_level_0,Unemployment_Rate_MEN,Unemployment_Rate_TOT,Unemployment_Rate_WOMEN,Electricity_Distribution_Losses,Electricity_Final_Consumption_(Calculated),Natural_Gas_Trade_Balance,Economic_Activity|Industrial_Production|Index|Base_Year-2010=100,Economic_Activity|Industrial_Production|Manufacturing|Index|Base_Year-2010=100,Industrial_Production|Seasonally_adjusted|Index|Base_Year-2010=100,Financial_Market_Prices|Equities|Index|Base_Year-None,...,Harmonized_Euro_Area_Rates|Loans|Non-Financial_Corporations|Up_to_1_Year,Harmonized_Euro_Area_Rates|New_Business|Deposits|Households|Agreed_Maturity|Up_to_1_Year,Harmonized_Euro_Area_Rates|New_Business|Deposits|Non-financial_Corporations|Agreed_Maturity|Up_to_1_Year,Harmonized_Euro_Area_Rates|New_Business|Loans|Households|Consumption|Floating_Rate_and_up_to_1_Year,Harmonized_Euro_Area_Rates|New_Business|Loans|Households|House_Purchase|Over_5_Years,Harmonized_Euro_Area_Rates|Outstanding_Amounts|Deposits|Households|Agreed_Maturity|Up_to_2_Years,Harmonized_Euro_Area_Rates|Outstanding_Amounts|Deposits|Non-Financial_Corporations|Agreed_Maturity|Up_to_2_Years,Lending_Rate,Money_Market_Rate,Treasury_Bill_Rate
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1962-02-01,,,,,,,,,,,...,,,,,,,,,,
1962-03-01,,,,,,,,,,,...,,,,,,,,,,
1962-04-01,,,,,,,,,,,...,,,,,,,,,,
1962-05-01,,,,,,,,,,,...,,,,,,,,,,
1962-06-01,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-01,6.8,7.9,9.3,1484.7093,23653.7756,4541.700,99.930923,98.294791,95.825603,138.055337540356,...,,,,,,,,3.108,,2.523
2022-12-01,6.9,7.9,9.2,1482.4691,23660.6184,5157.992,,,,138.216789868152,...,,,,,,,,3.37,,2.6686
2023-01-01,6.8,8.0,9.5,1553.0938,24753.7069,5190.100,,,,145.791543873083,...,,,,,,,,3.894775,,2.9873
2023-02-01,6.9,8.0,9.4,,,1108.760,,,,154.49469713933,...,,,,,,,,4.127,,3.1636


## Save

In [9]:
# Write the combined table to `<sink_path><country>.xlsx`; index=False keeps the
# row index out of the sheet.
excel_target = f'{sink_path}{country}.xlsx'
df_all.to_excel(excel_target, index=False)

## Check Constituent Datasets

In [10]:
# Show the column labels of the transformed unemployment-rate frame.
df_unemployment_rate.columns

Index(['Time', 'Unemployment_Rate_MEN', 'Unemployment_Rate_TOT',
       'Unemployment_Rate_WOMEN'],
      dtype='object', name='SUBJECT')

In [11]:
# Show the column labels of the transformed electricity frame.
df_electricity.columns

Index(['Time', 'Electricity_Distribution_Losses',
       'Electricity_Final_Consumption_(Calculated)'],
      dtype='object', name='Product_Balance')

In [12]:
# Show the column labels of the transformed gas-trade-balance frame.
df_gas_trade_balance.columns

Index(['Time', 'Natural_Gas_Trade_Balance'], dtype='object', name='Trade_Direction')

In [13]:
# Show the column labels of the transformed production frame.
df_production.columns

Index(['Time',
       'Economic_Activity|Industrial_Production|Index|Base_Year-2010=100',
       'Economic_Activity|Industrial_Production|Manufacturing|Index|Base_Year-2010=100',
       'Industrial_Production|Seasonally_adjusted|Index|Base_Year-2010=100'],
      dtype='object', name='Indicator')

In [14]:
# Show the column labels of the transformed prices frame.
df_prices.columns

Index(['Time', 'Financial_Market_Prices|Equities|Index|Base_Year-None',
       'Prices|Consumer_Price_Index|All_items|Index|Base_Year-2010=100',
       'Prices|Producer_Price_Index|All_Commodities|Index|Base_Year-2010=100'],
      dtype='object', name='Indicator')

In [15]:
# Show the column labels of the transformed labour frame.
df_labour.columns

Index(['Time', 'Labor_Markets|Unemployment_Rate|Percent',
       'Labor_Markets|Wage_Rates|Index', 'Unemployment|Persons|Number_of'],
      dtype='object', name='Indicator')

In [16]:
# Show the column labels of the transformed current-price, unadjusted GDP frame.
# NOTE(review): the labels in the saved output spell "Undjusted" — presumably a
# typo introduced upstream (not in this notebook); confirm before relying on these names.
df_gdp_current_unadjusted.columns

Index(['Time', 'Change_in_Inventories|Nominal|Undjusted|Domestic_Currency',
       'Exports_of_Goods_and_Services|Nominal|Undjusted|Domestic_Currency',
       'Government_Consumption_Expenditure|Nominal|Undjusted|Domestic_Currency',
       'Gross_Domestic_Product|Nominal|Undjusted|Domestic_Currency',
       'Gross_Fixed_Capital_Formation|Nominal|Undjusted|Domestic_Currency',
       'Household_Consumption_Expenditure|incl._NPISHs|Nominal|Undjusted|Domestic_Currency',
       'Imports_of_Goods_and_Services|Nominal|Undjusted|Domestic_Currency',
       'Statistical_Discrepancy_in_GDP|Nominal|Undjusted|Domestic_Currency'],
      dtype='object', name='Indicator')

In [17]:
# Show the column labels of the transformed current-price, adjusted GDP frame.
df_gdp_current_adjusted.columns

Index(['Time',
       'Change_in_Inventories|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Exports_of_Goods_and_Services|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Government_Final_Consumption_Expenditure|Nominal|Seasonally_adjusted|Domestic_Currency',
       'Gross_Domestic_Product|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Gross_Fixed_Capital_Formation|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Household_Consumption_Expenditure|incl._NPISHs|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Imports_of_Goods_and_Services|Nominal|Seasonally_Adjusted|Domestic_Currency',
       'Statistical_Discrepancy_in_GDP|Nominal|Seasonally_Adjusted|Domestic_Currency'],
      dtype='object', name='Indicator')

In [18]:
# Show the column labels of the transformed constant-price, unadjusted GDP frame.
# NOTE(review): the labels in the saved output spell "Undjusted" — presumably a
# typo introduced upstream (not in this notebook); confirm before relying on these names.
df_gdp_constant_unadjusted.columns

Index(['Time',
       'Change_in_Inventories|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Exports_of_Goods_and_Services|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Government_Consumption_Expenditure|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Gross_Domestic_Product|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Gross_Fixed_Capital_Formation|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Household_Consumption_Expenditure|incl._NPISHs|Real|Undjusted|Domestic_Currency|Base_Year-2010.0',
       'Imports_of_Goods_and_Services|Real|Undjusted|Domestic_Currency|Base_Year-2010.0'],
      dtype='object', name='Indicator')

In [19]:
# Show the column labels of the transformed constant-price, adjusted GDP frame.
df_gdp_constant_adjusted.columns

Index(['Time',
       'Change_in_Inventories|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0',
       'Exports_of_Goods_and_Services|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0',
       'Government_Final_Consumption_Expenditure|Real|Seasonally_adjusted|Domestic_Currency|Base_Year-2010.0',
       'Gross_Domestic_Product|Deflator|Seasonally_Adjusted|Base_Year-2010.0',
       'Gross_Domestic_Product|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0',
       'Gross_Fixed_Capital_Formation|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0',
       'Household_Consumption_Expenditure|incl._NPISHs|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0',
       'Imports_of_Goods_and_Services|Real|Seasonally_Adjusted|Domestic_Currency|Base_Year-2010.0'],
      dtype='object', name='Indicator')

In [20]:
# Show the column labels of the transformed consumer-confidence frame.
df_consumer_confidence.columns

Index(['Time', 'Consumer_Confidence'], dtype='object')

In [21]:
# Show the column labels of the transformed business-confidence frame.
df_business_confidence.columns

Index(['Time', 'Business_Confidence'], dtype='object')

In [22]:
# Show the column labels of the transformed interest-rates frame.
df_interest_rates.columns

Index(['Time', 'Government_Bonds',
       'Harmonized_Euro_Area_Rates|Loans|Households|Consumer_Credit_and_Other|Up_to_1_Year',
       'Harmonized_Euro_Area_Rates|Loans|Households|House_Purchase|Over_5_Years',
       'Harmonized_Euro_Area_Rates|Loans|Non-Financial_Corporations|Up_to_1_Year',
       'Harmonized_Euro_Area_Rates|New_Business|Deposits|Households|Agreed_Maturity|Up_to_1_Year',
       'Harmonized_Euro_Area_Rates|New_Business|Deposits|Non-financial_Corporations|Agreed_Maturity|Up_to_1_Year',
       'Harmonized_Euro_Area_Rates|New_Business|Loans|Households|Consumption|Floating_Rate_and_up_to_1_Year',
       'Harmonized_Euro_Area_Rates|New_Business|Loans|Households|House_Purchase|Over_5_Years',
       'Harmonized_Euro_Area_Rates|Outstanding_Amounts|Deposits|Households|Agreed_Maturity|Up_to_2_Years',
       'Harmonized_Euro_Area_Rates|Outstanding_Amounts|Deposits|Non-Financial_Corporations|Agreed_Maturity|Up_to_2_Years',
       'Lending_Rate', 'Money_Market_Rate', 'Treasury_Bill