# PCA + OLS estimation: DFM
---

> A partir dos componentes principais, que são os estimadores dos fatores, extraídos no script 'dfm_pca.ipynb', podemos estimar o primeiro modelo por MQO, em uma regressão do PIB em k fatores.

## Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

import statsmodels.api as sm
import statsmodels.formula.api as smf

from stargazer.stargazer import Stargazer
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.stats.diagnostic import het_breuschpagan

## Carregando os dados 

In [5]:
# Load the transformed dataset and isolate the target variable: GDP.

data = pd.read_csv('../../data/data_tf.csv')
# The file is indexed by a monthly date range. For quarterly data one would use
# pd.date_range(start='2002-03-01', end='2022-09-01', freq='QS-DEC') instead.
data['Date'] = pd.date_range(start='2002-01-01', end='2022-09-01', freq='MS')
data.set_index('Date', inplace=True)
gdp = data['gdp']
# Average GDP within each calendar quarter. `axis` is not passed: 0 is the
# default and the keyword is deprecated in recent pandas releases.
qgdp = gdp.groupby(pd.PeriodIndex(gdp.index, freq='Q')).mean()

# Load the PCA-transformed data.

data_pca = pd.read_csv('../../data/data_tfpca.csv')
# The date column must be attached to `data_pca` itself (not to `data`) before
# it can become the index.
# NOTE(review): assumes data_tfpca.csv has one row per month of the same
# 2002-01..2022-09 range as data_tf.csv -- confirm.
data_pca['Date'] = pd.date_range(start='2002-01-01', end='2022-09-01', freq='MS')
data_pca.set_index('Date', inplace=True)
# Rename component columns to factor names (every "C" becomes "F"). A literal
# replacement is requested explicitly so the result does not depend on the
# pandas-version-specific default of `regex`.
data_pca.columns = data_pca.columns.str.replace('C', 'F', regex=False)

# Keep only the first five factors, which are the ones used by the models below.

data_pca = data_pca[['F1', 'F2', 'F3', 'F4', 'F5']]

# Average the monthly factors within each quarter so they share GDP's
# frequency, then fill missing GDP values (per the original note, the first
# observation) with the sample median.

qdata_pca = data_pca.groupby(pd.PeriodIndex(data_pca.index, freq='Q')).mean()
# qdata_pca.to_csv('../../data/qdata_tfpca.csv')
new_data = pd.merge(qgdp, qdata_pca, how='inner', left_index=True, right_index=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the in-place form acts on a temporary object under Copy-on-Write.
new_data['gdp'] = new_data['gdp'].fillna(new_data['gdp'].median())
new_data['gdp'] = new_data['gdp'] * 100
# Replace the quarterly PeriodIndex with timestamps on the last month of each
# quarter (Mar/Jun/Sep/Dec), which the later cells slice on.
new_data.index = pd.date_range(start='2002-03-01', end='2022-09-01', freq='QS-DEC')
new_data

  data_pca.columns = data_pca.columns.str.replace(r"[C]", "F")


Unnamed: 0,gdp,F1,F2,F3,F4,F5
2002-03-01,0.388552,0.538712,0.492187,0.393648,0.491212,-0.599411
2002-06-01,4.320709,-0.442968,-0.049328,-0.098923,0.011377,-0.766521
2002-09-01,3.543887,0.424707,-0.647956,0.217876,-4.446846,-1.604402
2002-12-01,-0.217573,-0.232993,-2.268012,6.315600,-11.643441,-3.549546
2003-03-01,-4.746729,-1.958564,0.865307,3.212400,-5.975119,-1.915151
...,...,...,...,...,...,...
2021-09-01,2.578462,-0.342376,0.778500,2.328568,-1.987400,-0.206580
2021-12-01,-0.432211,1.693068,-3.285857,2.140440,-3.050982,1.034121
2022-03-01,-0.262737,-2.583732,2.756302,0.194961,-5.354888,-1.059385
2022-06-01,1.786737,-0.366600,-0.601419,0.377088,-1.336048,-1.747521


## Preparando o ambiente de variáveis

In [20]:
# BUILD THE LAGGED REGRESSORS

# Factor 1 lags
new_data['F1_2'] = new_data['F1'].shift(2)
new_data['F1_4'] = new_data['F1'].shift(4)
new_data['F1_6'] = new_data['F1'].shift(6)


# Factor 2 lags
new_data['F2_2'] = new_data['F2'].shift(2)

# Factor 3 lags
new_data['F3_1'] = new_data['F3'].shift(1)
new_data['F3_2'] = new_data['F3'].shift(2)
new_data['F3_3'] = new_data['F3'].shift(3)

# Factor 4 lags (none)

# Factor 5 lags (none)

# GDP lags
new_data['gdp_2'] = new_data['gdp'].shift(2)
new_data['gdp_4'] = new_data['gdp'].shift(4)

# BUILD THE RESIDUALS THAT WILL BE USED IN THE REGRESSION

# 5-factor model - e
# The specification is written once so the initial fit and every
# re-estimation in the loop are guaranteed to use the same formula.
formula_5fac = ('gdp ~ gdp_2 + gdp_4 +'
                'F1 + F1_4 +'
                'F2 + F2_2 +'
                'F3 + F3_1 + F3_2 + F3_3 +'
                'F4 + '
                'F5')

list_5fac = []

# First forecast: estimate on 2002Q1-2017Q4 and predict 2018Q1.
reg = smf.ols(formula_5fac, data=new_data.loc['2002Q1':'2017Q4'])
dfm_5fac = reg.fit(cov_type='HC1')
# Predict only the 2018Q1 row. Taking element 0 of a prediction over the whole
# 2002Q1-2018Q1 window would return the value for the first sample row (whose
# lagged regressors are missing because of the shifts above), not the
# forecast quarter.
pred = dfm_5fac.predict(new_data.loc['2018Q1']).values[0]
list_5fac.append(pred)

# Expanding window: re-estimate with data up to quarter `i` and predict the
# following quarter `j`. zip() stops at the shorter index, so the last fit
# uses data up to the second-to-last quarter.
for i, j in zip(new_data.loc['2018Q1':].index, new_data.loc['2018Q2':].index):
    reg = smf.ols(formula_5fac, data=new_data.loc['2002Q1':i])
    dfm_5fac = reg.fit(cov_type='HC1')
    pred = dfm_5fac.predict(new_data.loc[j]).values[0]
    list_5fac.append(pred)

# Lagged residual of the last model estimated in the loop.
# NOTE(review): that fit includes test-period quarters, so `e_1` carries
# information from the evaluation window -- confirm this is intended before
# using it as a regressor in out-of-sample comparisons.
new_data['e_1'] = dfm_5fac.resid.shift(1)

In [23]:
# Boundaries (inclusive) of the estimation and evaluation windows.

train_start_date = "2002-03-01"
train_end_date = "2017-12-01"
test_start_date = "2018-03-01"
test_end_date = "2022-09-01"

# Row masks selecting each window from the quarterly index.

in_train = (new_data.index >= train_start_date) & (new_data.index <= train_end_date)
in_test = (new_data.index >= test_start_date) & (new_data.index <= test_end_date)

train = new_data[in_train]
test = new_data[in_test]

# Training window: regressors (every column except the target) and target.

y_train = train['gdp']
X_train = train.drop(columns='gdp')

# Test window: same split.

y_test = test['gdp']
X_test = test.drop(columns='gdp')

# Modelos

## DFM: 1 fator

In [39]:
# In-sample estimation on the training window (2002 to 2017).

spec_1fac = 'gdp ~ F1'
dfm1 = smf.ols(spec_1fac, data=train).fit()

# Out-of-sample forecasts: the model is re-estimated on an expanding window
# and each fit predicts the quarter that follows its estimation sample.

list_1fac = []

# First forecast: estimate on 2002Q1-2017Q4 and predict 2018Q1.
reg = smf.ols(spec_1fac, data=new_data.loc['2002Q1':'2017Q4'])
dfm_1fac = reg.fit()
pred = dfm_1fac.predict(test.loc['2018Q1']).values[0]
list_1fac += [pred]

# Remaining forecasts: `i` is the last quarter in the estimation sample and
# `j` is the quarter being predicted.
next_quarters = new_data.loc['2018Q2':].index
for i, j in zip(test.index, next_quarters):
    window = new_data.loc['2002Q1':i]
    reg = smf.ols(spec_1fac, data=window)
    dfm_1fac = reg.fit()
    pred = dfm_1fac.predict(test.loc[j]).values[0]
    list_1fac += [pred]

## DFM: 1 fator e lags 

In [40]:
# In-sample estimation on the training window (2002 to 2017).

spec_1fac_lags = 'gdp ~ gdp_2 + gdp_4 + F1 + F1_2 + F1_4 + F1_6'
dfm1lags = smf.ols(spec_1fac_lags, data=train).fit()

# Out-of-sample forecasts: the model is re-estimated on an expanding window
# and each fit predicts the quarter that follows its estimation sample.

list_1fac_lags = []

# First forecast: estimate on 2002Q1-2017Q4 and predict 2018Q1.
reg = smf.ols(spec_1fac_lags, data=new_data.loc['2002Q1':'2017Q4'])
dfm_1fac_lags = reg.fit()
pred = dfm_1fac_lags.predict(test.loc['2018Q1']).values[0]
list_1fac_lags += [pred]

# Remaining forecasts: `i` is the last quarter in the estimation sample and
# `j` is the quarter being predicted.
next_quarters = new_data.loc['2018Q2':].index
for i, j in zip(test.index, next_quarters):
    window = new_data.loc['2002Q1':i]
    reg = smf.ols(spec_1fac_lags, data=window)
    dfm_1fac_lags = reg.fit()
    pred = dfm_1fac_lags.predict(test.loc[j]).values[0]
    list_1fac_lags += [pred]

## DFM: 5 fatores