# Construcción de features para el S&P 500

## Procedimientos

- Cargar los datos para los indicadores económicos
- Descargar datos:
    - Serie S&P 500
    - Series para los sectores del S&P 500
    - Series para las compañías
    - Series para las monedas
    - Series para los índices
- Calcular retornos
- Calcular variable de clase
- Calcular desfase temporal del retorno
- Calcular indicadores técnicos
- Cargar indicadores económicos
- Guardar features y variable de clase

In [1]:
# Populates the interactive namespace with numpy and matplotlib names (star-import style)
# and renders figures inline in the notebook.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pickle as pk
import pandas as pd
from pandas_datareader import data
import fix_yahoo_finance

# NOTE(review): presumably patches pandas_datareader so that `data.get_data_yahoo`
# is served by fix_yahoo_finance -- confirm against the library's documentation.
fix_yahoo_finance.pdr_override()

from datetime import datetime

from pyCBT.providers.scrappers import slickcharts, wikipedia
from pyCBT.common.files import exist
from talib import RSI, CCI, ROC, MOM, WILLR

from pyCBT.providers.oanda import account, historical

# OANDA client; it is handed to historical.Candles when the CFD series are downloaded.
oanda_client = account.Client()

In [3]:
# S&P 500 constituents from SlickCharts, keyed and ordered by ticker symbol
sp500_slick = (
    slickcharts.get_sp500_metadata()
    .set_index(keys="Symbol")
    .sort_index()
)

# S&P 500 constituents from Wikipedia, keyed and ordered the same way;
# its "Company" column is discarded before the two tables are joined
sp500_wiki = (
    wikipedia.get_sp500_metadata()
    .set_index(keys="Symbol")
    .sort_index()
    .drop(columns="Company")
)

# side-by-side join on the ticker index, displayed ordered by the "Weight" column
sp500_md = pd.concat((sp500_wiki, sp500_slick), axis=1)
sp500_md.sort_values(by="Weight")

Unnamed: 0_level_0,Sector,Company,Weight
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,Information Technology,Apple Inc.,1.0
MSFT,Information Technology,Microsoft Corporation,2.0
AMZN,Consumer Discretionary,Amazon.com Inc.,3.0
FB,Information Technology,Facebook Inc. Class A,4.0
BRK.B,Financials,Berkshire Hathaway Inc. Class B,5.0
JPM,Financials,JPMorgan Chase & Co.,6.0
JNJ,Health Care,Johnson & Johnson,7.0
XOM,Energy,Exxon Mobil Corporation,8.0
GOOG,Information Technology,Alphabet Inc. Class C,9.0
GOOGL,Information Technology,Alphabet Inc. Class A,10.0


In [4]:
# date range, used by the downloads that accept explicit bounds
from_date = datetime(2005, 1, 1)
to_date = datetime.today().date()

# Symbol tables: each key is the label used for the series in this notebook,
# each value is the ticker requested from the data provider.

# S&P 500 sectors (Stooq)
# TODO: use alpha-vantage or OANDA instead
sectors_stooq = dict([
    ("Consumer disc.", "BI.F"),
    ("Consumer stap.", "BL.F"),
    ("Energy", "BM.F"),
    ("Financials", "BN.F"),
    ("Health care", "BS.F"),
    ("Industrials", "JG.F"),
    ("Inf. tech.", "JB.F"),
    ("Materials", "JA.F"),
    ("Utilities", "JE.F"),
    ("Real state", "BK.F"),
])

# indices (Yahoo): the ticker is the label prefixed with a caret
indices_yahoo = {
    label: "^" + label
    for label in ("DJI", "HSI", "FCHI", "NYA", "VIX")
}

# indices (Stooq)
# TODO: use alpha-vantage or OANDA instead
indices_stooq = dict([
    ("DXY", "USD_I"),
    ("FTSE", "X.F"),
    ("DAX", "^DAX"),
    ("DJA", "DIA.US"),
    ("NASDAQ", "^NDX"),
    ("N225", "^NKX"),
    ("SSE", "^SHC"),
    ("VXN", "R3.C"),
    ("VXO", "VI.F"),
])

# individual stocks (Stooq): the ticker is the label with a ".US" suffix
# TODO: use alpha-vantage instead
stocks_stooq = {
    label: label + ".US"
    for label in ("AAPL", "MSFT", "AMZN", "FB", "JPM", "BRK-B", "JNJ", "GOOG", "XOM")
}

# currency pairs (Yahoo)
# TODO: use OANDA instead
currencies_yahoo = dict([
    ("USDCAD", "CAD=X"),
    ("GBPUSD", "GBPUSD=X"),
    ("USDCNY", "CNY=X"),
    ("USDJPY", "JPY=X"),
    ("EURUSD", "EURUSD=X"),
])

# CFD instruments (OANDA)
cfds_oanda = dict([
    ("Crude Oil", "WTICO_USD"),
    ("Gold", "XAU_USD"),
])
# Pickle file that stores every table loaded or downloaded by this cell.
datasets_cache = "../data/pickles/filter-sp500-datasets.p"
# Reading the cache was hard-wired off (`if False`), so every run downloads again.
# Set to True to reuse the pickle whenever it exists.
use_cache = False

# NOTE(review): the saved output of this cell is
#   KeyError: "name='B', domain=None, path=None"
# Which download raised it is not visible here -- confirm before relying on this cell.
if use_cache and exist(datasets_cache):
    # pickle.load can execute arbitrary code: only load a cache written by this notebook.
    with open(datasets_cache, "rb") as cache_file:
        sp500, sp500_sectors, major_stocks, major_indices, major_currencies, major_cfds, major_econ = pk.load(cache_file)
else:
    # economic indicators, read from a CSV on disk and ordered by date
    major_econ = pd.read_csv("../data/processed/filter-analysis/economic-indicators-raw.csv", index_col="Date").sort_index()

    # S&P 500 series; the Stooq requests are not given from_date/to_date
    sp500 = data.get_data_stooq("^SPX").sort_index()

    sp500_sectors, major_stocks, major_indices, major_currencies, major_cfds = {}, {}, {}, {}, {}
    # indices come from two providers and end up in the same dict
    for symbol, ticker in indices_yahoo.items():
        major_indices[symbol] = data.get_data_yahoo(ticker, from_date, to_date).sort_index()
    for symbol, ticker in indices_stooq.items():
        major_indices[symbol] = data.get_data_stooq(ticker).sort_index()
    for symbol, ticker in stocks_stooq.items():
        major_stocks[symbol] = data.get_data_stooq(ticker).sort_index()
    for symbol, ticker in currencies_yahoo.items():
        major_currencies[symbol] = data.get_data_yahoo(ticker, from_date, to_date).sort_index()
    for symbol, ticker in sectors_stooq.items():
        sp500_sectors[symbol] = data.get_data_stooq(ticker).sort_index()
    for symbol, instrument in cfds_oanda.items():
        # daily candles from OANDA
        candles = historical.Candles(
            client=oanda_client,
            instrument=instrument,
            resolution="D",
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            datetime_fmt="%Y-%m-%d",
            timezone="America/New_York"
        )
        major_cfds[symbol] = candles.as_dataframe()
        # name the index "Date" and convert it to datetimes
        major_cfds[symbol].index.name = "Date"
        major_cfds[symbol].index = pd.to_datetime(major_cfds[symbol].index)

    # refresh the cache; the context manager guarantees the file is flushed and closed
    with open(datasets_cache, "wb") as cache_file:
        pk.dump((sp500, sp500_sectors, major_stocks, major_indices, major_currencies, major_cfds, major_econ), cache_file)

KeyError: "name='B', domain=None, path=None"

In [None]:
def _lagged_closes(frames, index):
    """Gather the previous-row close of every series into a single table.

    Parameters
    ----------
    frames : dict
        Maps a column label to a DataFrame exposing a ``Close`` column.
    index : pandas.Index
        Row index of the returned table.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``frames``; each close is shifted by one row
        within its own series and then aligned on ``index``.
    """
    lagged = pd.DataFrame(index=index, data=None)
    # .items() rather than .iteritems(): plain dicts have no .iteritems() on Python 3
    for symbol, frame in frames.items():
        lagged[symbol] = frame.Close.shift()
    return lagged


# lagged closes for each group of series, all on the S&P 500 index
X_indices = _lagged_closes(major_indices, sp500.index)
X_sectors = _lagged_closes(sp500_sectors, sp500.index)
X_stocks = _lagged_closes(major_stocks, sp500.index)
X_cfds = _lagged_closes(major_cfds, sp500.index)
X_currencies = _lagged_closes(major_currencies, sp500.index)

# technical indicators on the lagged S&P 500 prices, all with a period of 5
sp500_high_lag = sp500.High.shift()
sp500_low_lag = sp500.Low.shift()
sp500_close_lag = sp500.Close.shift()
X_technical = pd.DataFrame(index=sp500.index, data=None)
X_technical["RSI"] = RSI(sp500_close_lag, 5)
X_technical["%R"] = WILLR(sp500_high_lag, sp500_low_lag, sp500_close_lag, 5)
X_technical["CCI"] = CCI(sp500_high_lag, sp500_low_lag, sp500_close_lag, 5)
X_technical["ROC"] = ROC(sp500_close_lag, 5)
X_technical["MOM"] = MOM(sp500_close_lag, 5)

# economic indicators
X_economical = major_econ.copy()
# major_econ is read with index_col="Date" but without date parsing, so its index can
# hold strings; convert it so its rows line up with the date-indexed tables in the
# concat below (assumes the Date column holds parseable dates -- TODO confirm).
X_economical.index = pd.to_datetime(X_economical.index)

# master feature table: returns of the lagged prices plus the indicator tables
# TODO: order the columns by indicator type
X_master = pd.concat(
    objs=(
        X_indices.pct_change(),
        X_sectors.pct_change(),
        X_stocks.pct_change(),
        X_cfds.pct_change(),
        X_currencies.pct_change(),
        X_technical,
        X_economical
    ),
    axis="columns"
)
X_master.insert(loc=0, column="S&P 500", value=sp500_close_lag.pct_change())
# drop the rows where every feature is missing
X_master = X_master.dropna(how="all", axis="index")
# the cells below (preview and CSV export) refer to this table as X_raw,
# which was never assigned; expose it under that name as well
X_raw = X_master

# keep only the S&P 500 rows that survived the filter, then build the target table
sp500 = sp500.filter(items=X_master.index, axis="index")
sp500_return = sp500.Close.pct_change()
Y_raw = pd.DataFrame(
    index=sp500.index,
    data={
        "S&P 500 Price": sp500.Close,
        "S&P 500 Return": sp500_return,
        # 1 when the return is non-negative, 0 otherwise (a missing return compares False -> 0)
        "Class": (sp500_return >= 0.0).astype(int),
    }
)

In [None]:
# show the last rows of the feature table
X_raw.tail()

In [None]:
# show the last rows of the target table (price, return and class)
Y_raw.tail()

In [None]:
# guardar tablas crudas
X_raw.reset_index().to_csv("../data/processed/sp500-ml/features-raw.csv", index=False)
Y_raw.reset_index().to_csv("../data/processed/sp500-ml/target-raw.csv", index=False)

# guardar tablas de los últimos ~20 años
X_short = X_raw.dropna(how="all", axis=0)
X_short = X_short[X_short.index>"1998-01-01"]
Y_short = Y_raw.filter(items=X_short.index)
X_short.reset_index().to_csv("../data/processed/sp500-ml/features.csv", index=False)
Y_short.reset_index().to_csv("../data/processed/sp500-ml/target.csv", index=False)