In [95]:

import pandas as pd
import os
from src.__special__ import indices_path
import numpy as np

# Build the path to INDEXP.csv inside the directory given by the
# project-provided `indices_path`.
file_path = os.path.join(indices_path, 'INDEXP.csv')

# Load the table; a later cell discards its first column before running PCA.
INDEXP = pd.read_csv(file_path)

In [104]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA input: every column of INDEXP except the first, restricted to the rows
# that contain no missing values.
INDEXP_pca = INDEXP.iloc[:, 1:].dropna()

# Z-score each column (PCA on standardized data is equivalent to PCA on the
# correlation matrix). Fitting and transforming are written as two explicit
# steps on the same frame.
scaler = StandardScaler().fit(INDEXP_pca)
data_scaled = scaler.transform(INDEXP_pca)



In [110]:
# Column-wise mean and standard deviation, skipping NaN entries.
mean = np.nanmean(INDEXP_pca, axis=0)
std = np.nanstd(INDEXP_pca, axis=0)

# A constant column has zero spread; divide it by 1 instead so the
# standardization below never divides by zero.
std = np.where(std == 0, 1, std)

# Manual z-score; any NaN cell simply stays NaN.
data_scaled = INDEXP_pca.sub(mean).div(std)

# Only a cell's final expression is echoed, so just the input shape is
# displayed; the scaled shape is evaluated but not shown.
data_scaled.shape
INDEXP_pca.shape

(250, 36)

In [106]:
# Fit a 5-component PCA on the standardized data and keep the scores.
pca = PCA(n_components=5)
pca_result = pca.fit_transform(data_scaled)

# Wrap the scores in a DataFrame that reuses the row index of the PCA input;
# columns are labelled PC1, PC2, ... according to the width of the score matrix.
pca_df = pd.DataFrame(
    pca_result,
    index=INDEXP_pca.index,
    columns=[f'PC{k}' for k in range(1, pca_result.shape[1] + 1)],
)
pca_result

array([[-0.14082494, -0.3680257 , -0.31434945,  0.15697813, -0.46486961],
       [-0.14488085,  0.0613828 , -0.01260377, -0.54761979, -0.41428865],
       [-0.15305285, -0.00697161, -0.03336654,  0.11590285, -0.63478586],
       ...,
       [ 0.61610091, -1.04707833, -1.29761821,  0.00662211,  0.0935687 ],
       [ 0.5420275 , -0.65076322, -1.10343849, -0.30345287,  0.05508169],
       [ 0.54108834, -0.51502838, -1.13308383, -0.09672702,  0.04186314]],
      shape=(250, 5))

In [97]:
# Loadings matrix: the fitted components transposed, so that each row is an
# input variable and each column a principal component.
fact_load = pca.components_.T

# Fraction of the total variance explained by each component. It is stored
# here but not displayed by this cell.
explained_variance = pca.explained_variance_ratio_

fact_load

array([[ 1.20797534e-01,  2.61695834e-01,  1.75127618e-01,
        -2.71865419e-01, -2.51007691e-01],
       [ 1.88353376e-01,  1.81101179e-02, -1.63117790e-01,
         1.12744201e-01, -5.42929067e-02],
       [ 1.74244095e-01,  1.56856686e-01, -2.13879264e-01,
        -2.44901417e-02, -6.42558658e-02],
       [ 1.79010049e-01,  1.25593899e-01, -2.10182675e-01,
        -6.28275327e-02, -6.16765603e-02],
       [ 1.42444978e-01,  1.74117478e-01, -3.43515918e-02,
        -2.80596606e-01,  4.81416970e-01],
       [ 1.81703720e-01,  1.04288601e-01, -1.96223744e-01,
        -4.01040610e-02, -3.62180234e-02],
       [ 1.59469006e-01,  2.05319344e-01, -2.41102386e-01,
        -9.66388025e-04, -4.98192135e-02],
       [ 1.69866499e-01,  1.69373398e-01, -2.33973319e-01,
        -1.54363788e-02, -5.72737072e-02],
       [ 1.73798826e-01,  1.53354588e-01, -2.27904718e-01,
        -1.25278424e-02, -4.89508738e-02],
       [ 1.81204796e-01,  1.55678797e-01,  9.05968210e-03,
        -1.92433507e-01

In [98]:
# Load the original variables from the directory returned by
# os.path.dirname(indices_path).
original_values = pd.read_csv(
    os.path.join(os.path.dirname(indices_path), "variables_originales_exp.csv")
)

# Names of the source columns exp1 .. exp12.
exp_cols = [f'exp{i}' for i in range(1, 13)]

# 12-period log differences of every exp column, computed on the series itself
# (lag 0) and on the series lagged by 1 and by 2 periods. Columns are appended
# one lag block at a time (all lag-0 columns, then lag 1, then lag 2) because a
# later cell slices the frame by column position.
for lag in (0, 1, 2):
    suffix = f'_lag{lag}' if lag else ''
    for col in exp_cols:
        original_values[f'dlog_{col}{suffix}'] = (
            np.log(original_values[col].shift(lag))
            - np.log(original_values[col].shift(lag + 12))
        )

# Evaluated but not echoed: only a cell's final expression is displayed.
original_values
date = original_values['_date_']

In [99]:
# Convert the whole frame to a NumPy array and discard the first 13 columns by
# position, keeping everything after them.
# NOTE(review): presumably the 13 leading columns are `_date_` plus exp1..exp12,
# which would leave only the dlog columns -- verify against the CSV layout.
or_values = original_values.to_numpy()[:, 13:]
print(or_values.shape)


(384, 36)


In [100]:
# Print both operand shapes to confirm they are conformable for the product.
print(or_values.shape)
print(fact_load.shape)

# Project the series onto the PCA loadings.
# NOTE(review): `or_values` is multiplied as-is, without the centering and
# scaling that was applied to the data the PCA was fitted on -- confirm this
# is intended.
y = np.dot(or_values, fact_load)

# Prepend the dates as the first column of the projected values.
y = np.column_stack((date.to_numpy(), y))

y[122, 1]

(384, 36)
(36, 5)


0.19712870793259918

In [101]:
# Read the tab-separated FEXPeviews file from the current working directory
# and display it.
FEXPeviews = pd.read_csv("FEXPeviews.txt", sep="\t")

FEXPeviews

Unnamed: 0,_date_,fexp1f,fexp2f,fexp3f,fexp4f,fexp5f
0,1995-01-01,,,,,
1,1995-02-01,,,,,
2,1995-03-01,,,,,
3,1995-04-01,,,,,
4,1995-05-01,,,,,
...,...,...,...,...,...,...
379,2026-08-01,,,,,
380,2026-09-01,,,,,
381,2026-10-01,,,,,
382,2026-11-01,,,,,


In [102]:
# Re-project the standardized data with the already-fitted PCA and display
# the resulting scores.
pca.transform(data_scaled)

array([[-0.14082494, -0.3680257 , -0.31434945,  0.15697813, -0.46486961],
       [-0.14488085,  0.0613828 , -0.01260377, -0.54761979, -0.41428865],
       [-0.15305285, -0.00697161, -0.03336654,  0.11590285, -0.63478586],
       ...,
       [ 0.61610091, -1.04707833, -1.29761821,  0.00662211,  0.0935687 ],
       [ 0.5420275 , -0.65076322, -1.10343849, -0.30345287,  0.05508169],
       [ 0.54108834, -0.51502838, -1.13308383, -0.09672702,  0.04186314]],
      shape=(250, 5))

In [103]:
# Display the standardized matrix that is passed to the PCA.
data_scaled

array([[-0.5290154 , -0.06744629, -0.07143506, ...,  0.26274968,
        -0.48960077,  0.3180935 ],
       [ 0.29497762, -0.09774818, -0.25652948, ..., -0.07126818,
        -0.21697279,  0.20137117],
       [-0.0014292 , -0.01190444, -0.12771987, ...,  0.37203346,
        -0.90035328,  0.29712309],
       ...,
       [-0.64576055,  0.10142477,  0.25858709, ..., -0.03417464,
        -0.31709749, -0.28585641],
       [-0.20844046,  0.17514208,  0.3428313 , ..., -0.06127333,
        -0.13240987, -0.41974433],
       [-0.23888979,  0.13511414,  0.26600878, ...,  0.030562  ,
        -0.47401613, -0.40601399]], shape=(250, 36))