# Construcción de features para el S&P 500

## Procedimientos

- Descargar datos:
    - Serie S&P 500
    - Series para las compañías
    - Series para las monedas
    - Series para los índices
- Calcular retornos
- Calcular variable de clase
- Calcular desfase temporal del retorno
- Calcular indicadores técnicos
- Cargar indicadores económicos
- Guardar features y variable de clase

In [1]:
%pylab inline

import pickle as pk
import pandas as pd
from pandas_datareader import data
import fix_yahoo_finance

fix_yahoo_finance.pdr_override()

from datetime import datetime

from pyCBT.providers.scrappers import slickcharts, wikipedia
from pyCBT.common.files import exist
from talib import RSI, CCI, ROC, MOM, WILLR

Populating the interactive namespace from numpy and matplotlib


In [2]:
# S&P 500 constituent metadata from slickcharts, keyed and ordered by ticker symbol.
sp500_slick = slickcharts.get_sp500_metadata().set_index(keys="Symbol").sort_index()

# Same treatment for the Wikipedia table; its "Company" column is dropped
# (presumably because the slickcharts table already provides one — confirm).
sp500_wiki = (
    wikipedia.get_sp500_metadata()
    .set_index(keys="Symbol")
    .sort_index()
    .drop(columns="Company")
)

# Place both sources side by side, aligned on the Symbol index.
sp500_md = pd.concat([sp500_wiki, sp500_slick], axis=1)
# Last expression of the cell: displays the metadata ordered by "Weight".
sp500_md.sort_values(by="Weight")

Unnamed: 0_level_0,Sector,Company,Weight
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,Information Technology,Apple Inc.,1.0
MSFT,Information Technology,Microsoft Corporation,2.0
AMZN,Consumer Discretionary,Amazon.com Inc.,3.0
FB,Information Technology,Facebook Inc. Class A,4.0
BRK.B,Financials,Berkshire Hathaway Inc. Class B,5.0
JPM,Financials,JPMorgan Chase & Co.,6.0
JNJ,Health Care,Johnson & Johnson,7.0
XOM,Energy,Exxon Mobil Corporation,8.0
GOOG,Information Technology,Alphabet Inc. Class C,9.0
GOOGL,Information Technology,Alphabet Inc. Class A,10.0


In [3]:
# Date range used for the Yahoo downloads (the Stooq calls below are made
# without a date range).
from_date, to_date = datetime(2005, 1, 1), datetime.today().date()

# Ticker maps: keys are the column names used in the rest of the notebook,
# values are the provider-specific symbols requested from each data source.
# TODO: use alpha-vantage or OANDA instead
indices_yahoo = {
    "DJI": "^DJI",
    "HSI": "^HSI",
    "FCHI": "^FCHI",
    "NYA": "^NYA",
    "VIX": "^VIX"
}
# TODO: use alpha-vantage or OANDA instead
indices_stooq = {
    "DXY": "USD_I",
    "FTSE": "X.F",
    "DAX": "^DAX",
    "DJA": "DIA.US",
    "NASDAQ": "^NDX",
    "N225": "^NKX",
    "SSE": "^SHC",
    "VXN": "R3.C",
    "VXO": "VI.F"
}
# TODO: use alpha-vantage instead
stocks_stooq = {
    "AAPL": "AAPL.US",
    "MSFT": "MSFT.US",
    "AMZN": "AMZN.US",
    "FB": "FB.US",
    "JPM": "JPM.US",
    "BRK.B": "BRK-B.US",
    "JNJ": "JNJ.US",
    "GOOG": "GOOG.US",
    "XOM": "XOM.US"
}
# TODO: use OANDA instead
currencies_yahoo = {
    "USDCAD": "CAD=X",
    "GBPUSD": "GBPUSD=X",
    "USDCNY": "CNY=X",
    "USDJPY": "JPY=X",
    "EURUSD": "EURUSD=X"
}

# Local cache of everything downloaded below, so the notebook can be re-run
# without hitting the data providers again.
# TODO: store the data already sorted
datasets_pickle = "../data/pickles/filter-sp500-datasets.p"
if exist(datasets_pickle):
    # NOTE: unpickling can execute arbitrary code; this is only acceptable
    # because the cache file is produced by this same notebook.
    # The context manager guarantees the file handle is closed after loading.
    with open(datasets_pickle, "rb") as cache_file:
        sp500, major_stocks, major_indices, major_currencies = pk.load(cache_file)
else:
    sp500 = data.get_data_stooq("^SPX")

    major_stocks, major_indices, major_currencies = {}, {}, {}

    for ticker in indices_yahoo:
        major_indices[ticker] = data.get_data_yahoo(indices_yahoo[ticker], from_date, to_date)
    for ticker in indices_stooq:
        major_indices[ticker] = data.get_data_stooq(indices_stooq[ticker])
    for ticker in stocks_stooq:
        major_stocks[ticker] = data.get_data_stooq(stocks_stooq[ticker])
    for ticker in currencies_yahoo:
        major_currencies[ticker] = data.get_data_yahoo(currencies_yahoo[ticker], from_date, to_date)

    # Closing the handle via the context manager ensures the pickle is fully
    # flushed to disk before the notebook moves on.
    with open(datasets_pickle, "wb") as cache_file:
        pk.dump((sp500, major_stocks, major_indices, major_currencies), cache_file)

In [4]:
# Build the master table: one column per stock / index / currency, holding
# the PREVIOUS trading day's close of that instrument, aligned on the
# S&P 500 dates.
# Each series is sorted ascending before shifting so the one-day lag is
# correct whatever row order the provider delivered. The earlier
# `shift(-1)` on the unsorted data only produced a lag for newest-first
# series; for oldest-first series it pulled in the NEXT day's close
# (look-ahead leakage into the features).
master_table = pd.DataFrame(index=sp500.index, data=None)
for d in [major_stocks, major_indices, major_currencies]:
    for ticker in sorted(d):
        master_table[ticker] = d[ticker].Close.sort_index().shift(1)
# Sort both tables in ascending date order
master_table.sort_index(inplace=True)
sp500.sort_index(inplace=True)
# Drop dates on which every instrument is NaN and keep the same dates in sp500
master_table = master_table.dropna(how="all", axis=0)
sp500 = sp500.filter(items=master_table.index, axis=0)
# Compute returns: same-day return of the S&P 500 (target) and returns of
# the lagged closes (features)
Y_return = sp500.Close.pct_change()
X_return = master_table.pct_change()
# Compute technical indicators on the S&P 500 (RSI, Williams %R, CCI, ROC,
# MOM), each with a period of 5 and fed with the previous day's prices
X_techin = pd.DataFrame(index=sp500.index, data=None)
X_techin["RSI"] = RSI(sp500.Close.shift(1), 5)
X_techin["%R"] = WILLR(sp500.High.shift(1), sp500.Low.shift(1), sp500.Close.shift(1), 5)
X_techin["CCI"] = CCI(sp500.High.shift(1), sp500.Low.shift(1), sp500.Close.shift(1), 5)
X_techin["ROC"] = ROC(sp500.Close.shift(1), 5)
X_techin["MOM"] = MOM(sp500.Close.shift(1), 5)
# Concatenate returns + technical indicators into the feature table
# TODO: order columns by indicator type
X_raw = pd.concat((X_return, X_techin), axis="columns")
# Target table: price, return and binary class (1 when the return is
# strictly positive; a NaN return compares False and therefore maps to 0)
Y_raw = pd.DataFrame(
    index=sp500.index,
    data={
        "Price": sp500.Close,
        "Return": Y_return,
        "Class": (Y_return>0.0).astype(int),
    }
)

In [5]:
# Save the raw feature and target tables (the date index is written as a
# regular column)
X_raw.reset_index().to_csv("../data/processed/sp500-ml/features-raw.csv", index=False)
Y_raw.reset_index().to_csv("../data/processed/sp500-ml/target-raw.csv", index=False)

# Save the recent subset: rows with at least one non-NaN feature, dated
# after 2018-01-01.
# NOTE(review): this step used to be labelled "last 20 years", which does
# not match the 2018-01-01 cutoff — confirm the intended window.
X_short = X_raw.dropna(how="all", axis=0)
X_short = X_short[X_short.index>"2018-01-01"]
# axis=0 is required: DataFrame.filter matches `items` against the column
# labels by default, which would discard every target column instead of
# selecting the rows that share X_short's dates.
Y_short = Y_raw.filter(items=X_short.index, axis=0)
X_short.reset_index().to_csv("../data/processed/sp500-ml/features.csv", index=False)
Y_short.reset_index().to_csv("../data/processed/sp500-ml/target.csv", index=False)