In [1]:
# Librerías básicas
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns
sns.set()

# El módulo Statsmodels

Statsmodels es un módulo de Python que ofrece clases y funciones de varios modelos estadísticos, así como pruebas de hipótesis 
y análisis exploratorio de datos. La documentación se encuentra en <a href='https://www.statsmodels.org/stable/index.html'>statsmodels.org</a>. 

In [3]:
import statsmodels.graphics.tsaplots as sgt 
import statsmodels.tsa.stattools as sts 
from statsmodels.tsa.seasonal import seasonal_decompose

import statsmodels.graphics.tsaplots as sgt
from statsmodels.graphics.tsaplots import plot_predict
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.stats.distributions import chi2
import statsmodels.tsa.stattools as sts 

# Fuentes de datos interesantes

## API de Yahoo finance

Yahoo Finance tiene una API que permite descargar información financiera para realizar análisis. La página web original de la documentación ya no existe. Sin embargo, existen algunas páginas de documentación no oficial, como <a href='https://github.com/mxbi/yahoo-finance-api.git'>este repositorio en GitHub</a> o <a href='https://python-yahoofinance.readthedocs.io/en/latest/api.html'>esta página de documentación de ReadTheDocs</a>.

In [6]:
# Install yfinance into the kernel's environment
%pip install yfinance

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Se importa la librería yfinance
import yfinance

In [8]:
# Download the S&P 500 (^GSPC) and Nikkei 225 (^N225) price history
df_yfinance_raw = yfinance.download(
    tickers="^GSPC ^N225",  # the two series of interest, space-separated
    start="1994-01-07",     # start date of the request
    end="2024-08-27",       # end date of the request
    interval="1d",          # sampling frequency
    group_by='ticker',      # top column level is the ticker, second level the price field
    auto_adjust=True,
)

[*********************100%***********************]  2 of 2 completed


In [9]:
df_yfinance_raw.head()

Ticker,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC,^N225,^N225,^N225,^N225,^N225
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1994-01-07,467.089996,470.26001,467.029999,469.899994,324920000.0,17842.980469,18131.410156,17787.480469,18124.009766,0.0
1994-01-10,469.899994,475.269989,469.549988,475.269989,319490000.0,18186.519531,18567.060547,18186.519531,18443.439453,0.0
1994-01-11,475.269989,475.279999,473.269989,474.130005,305490000.0,18481.849609,18671.669922,18373.039062,18485.25,0.0
1994-01-12,474.130005,475.059998,472.140015,474.170013,310690000.0,18447.339844,18807.080078,18301.929688,18793.880859,0.0
1994-01-13,474.170013,474.170013,471.799988,472.470001,277970000.0,18770.380859,18823.380859,18548.75,18577.259766,0.0


In [10]:
# Work on a copy so the raw download stays untouched
df_yfinance = df_yfinance_raw.copy()

In [11]:
# Add columns holding the closing level of the S&P 500 and Nikkei 225 indices
df_yfinance['spx'] = df_yfinance['^GSPC']['Close']
df_yfinance['nikkei'] = df_yfinance['^N225']['Close']

In [12]:
# Remove the '^N225' and '^GSPC' column groups; only 'spx' and 'nikkei' remain
del df_yfinance['^N225']
del df_yfinance['^GSPC']
# Set a business-day frequency, then fill missing values with the last valid observation.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
df_yfinance = df_yfinance.asfreq('b').ffill()

In [13]:
df_yfinance.head()

Ticker,spx,nikkei
Price,Unnamed: 1_level_1,Unnamed: 2_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2
1994-01-07,469.899994,18124.009766
1994-01-10,475.269989,18443.439453
1994-01-11,474.130005,18485.25
1994-01-12,474.170013,18793.880859
1994-01-13,472.470001,18577.259766


In [14]:
df_yfinance.tail()

Ticker,spx,nikkei
Price,Unnamed: 1_level_1,Unnamed: 2_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2
2024-08-20,5597.120117,38062.921875
2024-08-21,5620.850098,37951.800781
2024-08-22,5570.640137,38211.011719
2024-08-23,5634.609863,38364.269531
2024-08-26,5616.839844,38110.21875


In [15]:
df_yfinance.columns

MultiIndex([(   'spx', ''),
            ('nikkei', '')],
           names=['Ticker', 'Price'])

In [16]:
df_yfinance.columns=['spx','nikkei']

In [17]:
df_yfinance.columns

Index(['spx', 'nikkei'], dtype='object')

In [18]:
df_yfinance.head() 

Unnamed: 0_level_0,spx,nikkei
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1994-01-07,469.899994,18124.009766
1994-01-10,475.269989,18443.439453
1994-01-11,474.130005,18485.25
1994-01-12,474.170013,18793.880859
1994-01-13,472.470001,18577.259766


In [19]:
df_yfinance.columns

Index(['spx', 'nikkei'], dtype='object')

In [20]:
# Make sure the index holds datetimes (converts the existing index values in place)
df_yfinance.index = pd.to_datetime(df_yfinance.index)

In [21]:
df_yfinance.tail() # Making sure of the last day we're including in the series

Unnamed: 0_level_0,spx,nikkei
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-08-20,5597.120117,38062.921875
2024-08-21,5620.850098,37951.800781
2024-08-22,5570.640137,38211.011719
2024-08-23,5634.609863,38364.269531
2024-08-26,5616.839844,38110.21875


In [22]:
# Re-apply the business-day frequency and forward-fill any missing values.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
df_yfinance = df_yfinance.asfreq('b').ffill()

In [23]:
df_yfinance.head()

Unnamed: 0_level_0,spx,nikkei
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1994-01-07,469.899994,18124.009766
1994-01-10,475.269989,18443.439453
1994-01-11,474.130005,18485.25
1994-01-12,474.170013,18793.880859
1994-01-13,472.470001,18577.259766


In [24]:
# Convert the index to datetimes and drop any timezone information (tz_localize(None))
df_yfinance.index = pd.to_datetime(df_yfinance.index).tz_localize(None)

## Algunos archivos csv con datos

En <a href='https://github.com/jbrownlee/Datasets.git'>este repositorio de GitHub</a> se encuentran muchos archivos .csv con datos apropiados para ejercicios de aprendizaje de máquina y series de tiempo. En este notebook utilizaremos el archivo 'airline_passengers.csv'.

In [26]:
# Load the airline passengers file, using the 'Month' column as a parsed date index
df_airline = pd.read_csv(
    'airline_passengers.csv',
    index_col='Month',
    parse_dates=True,
)

In [27]:
df_airline.head()

Unnamed: 0_level_0,Passengers
Month,Unnamed: 1_level_1
1949-01-01,112
1949-02-01,118
1949-03-01,132
1949-04-01,129
1949-05-01,121


In [28]:
# Make sure the index holds datetimes
df_airline.index = pd.to_datetime(df_airline.index)

## Datos que vienen con la librería statsmodels
La librería <a href='https://www.statsmodels.org/stable/index.html'>statsmodels</a> tiene varios conjuntos de datos incorporados.
En este notebook, utilizaremos el dataset 'macrodata', que trae datos macroeconómicos de EEUU.

In [30]:
import statsmodels.api as sm
# Load the 'macrodata' dataset bundled with statsmodels as a DataFrame
df_macrodata = sm.datasets.macrodata.load_pandas().data
# Index the rows with the dates generated for the range 1959Q1 to 2009Q3
df_macrodata.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
# Print the descriptive note shipped with the dataset (variable definitions)
print(sm.datasets.macrodata.NOTE)

::
    Number of Observations - 203

    Number of Variables - 14

    Variable name definitions::

        year      - 1959q1 - 2009q3
        quarter   - 1-4
        realgdp   - Real gross domestic product (Bil. of chained 2005 US$,
                    seasonally adjusted annual rate)
        realcons  - Real personal consumption expenditures (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realinv   - Real gross private domestic investment (Bil. of chained
                    2005 US$, seasonally adjusted annual rate)
        realgovt  - Real federal consumption expenditures & gross investment
                    (Bil. of chained 2005 US$, seasonally adjusted annual rate)
        realdpi   - Real private disposable income (Bil. of chained 2005
                    US$, seasonally adjusted annual rate)
        cpi       - End of the quarter consumer price index for all urban
                    consumers: all items (1982-84 = 100, seasonally adju

In [31]:
df_macrodata.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959-03-31,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1959-06-30,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
1959-09-30,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
1959-12-31,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
1960-03-31,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [32]:
# Make sure the index holds datetimes
df_macrodata.index = pd.to_datetime(df_macrodata.index)

## Datos simulados de ventas

In [34]:
# Load the sales workbook, using the 'Fecha' column as a parsed date index
df_ventas = pd.read_excel(
    'Datos_clase_1.xlsx',
    index_col='Fecha',
    parse_dates=True,
)

In [35]:
# Make sure the index holds datetimes
df_ventas.index = pd.to_datetime(df_ventas.index)

# S&P 500

In [37]:
#%conda install -c conda-forge fbprophet -y

In [38]:
%pip install prophet

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [39]:
import prophet

In [40]:
from prophet import Prophet

## Formato de los datos

In [42]:
# Keep only the S&P 500 series
del df_yfinance['nikkei']

In [43]:
# Move the date index into a regular column
df_yfinance = df_yfinance.reset_index()

In [44]:
# Rename to 'ds' (dates) and 'y' (values), the column names Prophet's fit() expects
df_yfinance.columns = ['ds','y']

In [45]:
df_yfinance.head()

Unnamed: 0,ds,y
0,1994-01-07,469.899994
1,1994-01-10,475.269989
2,1994-01-11,474.130005
3,1994-01-12,474.170013
4,1994-01-13,472.470001


In [46]:
# Make sure the 'ds' column holds datetimes
df_yfinance['ds'] = pd.to_datetime(df_yfinance['ds'])

## Crear y ajustar el modelo

In [48]:
import cmdstanpy
# Install CmdStan with a single call. compiler=True additionally requests the C++
# toolchain (documented as applying to Windows only), so a separate plain
# install_cmdstan() call beforehand is not needed.
# NOTE(review): the saved output of this cell is an AttributeError about `np.float`,
# which suggests an installed package predates the removal of that alias from NumPy;
# presumably upgrading that package (or pinning NumPy) is required — confirm.
cmdstanpy.install_cmdstan(compiler=True)

AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
m_yfinance = Prophet()

In [None]:
m_yfinance.fit(df_yfinance)

## Pronósticos

### Creación de dataframe para los pronósticos

In [None]:
# Build the frame of dates to predict on: 360 periods past the fitted history.
# NOTE(review): no freq is passed, so Prophet's default frequency is used (presumably
# calendar days); the history was resampled to business days, so the future dates
# likely include weekends — confirm whether freq='B' is intended here.
df_future_yfinance = m_yfinance.make_future_dataframe(periods=360)

In [None]:
df_future_yfinance.tail()

### Predicción de datos futuros

In [None]:
forecast_yfinance = m_yfinance.predict(df_future_yfinance)

In [None]:
forecast_yfinance.head()

In [None]:
len(forecast_yfinance)

### Gráfico de pronósticos

In [None]:
m_yfinance.plot(forecast_yfinance)

In [None]:
forecast_yfinance.plot(x='ds',y='yhat')

In [None]:
m_yfinance.plot_components(forecast_yfinance);

# Evaluación de los pronósticos

## División de los datos en datos de entrenamiento y prueba

In [None]:
len(df_yfinance)

In [None]:
# Number of training rows: everything except the last 365 rows, which are held out for testing
n=len(df_yfinance)-365

In [None]:
n

In [None]:
# Training set: the first n rows, selected by position
df_yfinance_prophet_train = df_yfinance.iloc[:n]

In [None]:
# Test set: every row from position n onwards
df_yfinance_prophet_test = df_yfinance.iloc[n:]

In [None]:
# Fit a fresh model on the training rows only
m_yfinance_evaluation = Prophet()
m_yfinance_evaluation.fit(df_yfinance_prophet_train)
# The series was resampled to business days, so the future frame must also step in
# business days (freq='B'). With the default frequency the future rows would be
# consecutive calendar days and would not line up, row by row, with the dates of the
# held-out test rows they are later compared against.
future_yfinance_evaluation = m_yfinance_evaluation.make_future_dataframe(
    periods=len(df_yfinance_prophet_test),  # one forecast row per held-out row
    freq='B',
)
forecast_yfinance_evaluation = m_yfinance_evaluation.predict(future_yfinance_evaluation)

In [None]:
# Plot the forecast and overlay the held-out observations on the same axes
ax = forecast_yfinance_evaluation.plot(x='ds',y='yhat',label='Predictions',legend=True,figsize=(12,8))

# Limit the x-axis to the date span of the test set instead of hard-coded dates,
# so the whole held-out period is visible whatever date range was downloaded.
df_yfinance_prophet_test.plot(
    x='ds',
    y='y',
    label='True S&P500',
    legend=True,
    ax=ax,
    xlim=(df_yfinance_prophet_test['ds'].min(), df_yfinance_prophet_test['ds'].max()),
);

In [None]:
from statsmodels.tools.eval_measures import rmse

In [None]:
# Keep only the forecast rows that come after the training history. The offset is
# taken from the training frame's length rather than a hard-coded row number, so it
# stays correct if the downloaded date range changes.
predictions_yfinance_evaluation = forecast_yfinance_evaluation.iloc[len(df_yfinance_prophet_train):]['yhat']

In [None]:
predictions_yfinance_evaluation.head()

In [None]:
len(predictions_yfinance_evaluation)

In [None]:
df_yfinance_prophet_test['y'].head()

In [None]:
# Root-mean-squared error between the forecasts and the held-out S&P 500 values
rmse(predictions_yfinance_evaluation,df_yfinance_prophet_test['y'])

## Validación cruzada con prophet

In [None]:
from prophet.diagnostics import cross_validation, performance_metrics
from prophet.plot import plot_cross_validation_metric

In [None]:
len(df_yfinance)

In [None]:
# One twelfth of the number of rows (true division, so the result is a float).
# NOTE(review): this overwrites the `n` used for the train/test split and is not
# referenced by any later visible cell — confirm whether it is still needed.
n=len(df_yfinance)/12

In [None]:
# Cross-validation windows, written as '<number> days' strings
initial = f"{5 * 365} days"  # initial training period: 5 years
period = f"{5 * 365} days"   # one fold every 5 years
horizon = f"{365} days"      # forecast 1 year into the future

In [None]:
# Run Prophet's cross-validation on the fitted model using the initial/period/horizon strings
df_yfinance_cv = cross_validation(m_yfinance, initial=initial, period=period, horizon = horizon)

In [None]:
performance_metrics(df_yfinance_cv)

In [None]:
plot_cross_validation_metric(df_yfinance_cv, metric='rmse');

In [None]:
plot_cross_validation_metric(df_yfinance_cv, metric='mape');

## Cambios de tendencia con Prophet

In [None]:
from prophet.plot import add_changepoints_to_plot

In [None]:
# Plot the forecast, then draw the model's changepoints on the figure's current axes
fig = m_yfinance.plot(forecast_yfinance)
a = add_changepoints_to_plot(fig.gca(), m_yfinance, forecast_yfinance)

## Estacionalidad multiplicativa

In [None]:
# Fit a second model with multiplicative seasonality on the same data
m_yfinance_multiplicative = Prophet(seasonality_mode='multiplicative')
m_yfinance_multiplicative.fit(df_yfinance)
# Predict on the same future frame that was built for the first model
forecast_yfinance_multiplicative = m_yfinance_multiplicative.predict(df_future_yfinance)
fig = m_yfinance_multiplicative.plot(forecast_yfinance_multiplicative)

In [None]:
fig = m_yfinance_multiplicative.plot_components(forecast_yfinance_multiplicative)

In [None]:
# Plot the multiplicative-seasonality forecast, then draw its changepoints on the same axes
fig = m_yfinance_multiplicative.plot(forecast_yfinance_multiplicative)
a = add_changepoints_to_plot(fig.gca(), m_yfinance_multiplicative, forecast_yfinance_multiplicative)