# Construcción de features para el S&P 500

## Procedimientos

- Cargar los datos para los indicadores económicos
- Descargar datos:
    - Serie S&P 500
    - Series para los sectores del S&P 500
    - Series para las compañias
    - Series para las monedas
    - Series para los indices
- Calcular retornos
- Calcular variable de clase
- Calcular desfase temporal del retorno
- Calcular indicadores técnicos
- Cargar indicadores económicos
- Guardar features y variable de clase

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import itertools as it
import pickle as pk
import pandas as pd
from pandas_datareader import data
import fix_yahoo_finance

fix_yahoo_finance.pdr_override()

from datetime import datetime

from pyCBT.providers.scrappers import slickcharts, wikipedia
from pyCBT.common.files import exist
from talib import RSI, CCI, ROC, MOM, WILLR

from pyCBT.providers.oanda import account, historical

oanda_client = account.Client()

from alpha_vantage.timeseries import TimeSeries

avantage_client = TimeSeries(key="RL0YBD0NPUE3QYQ6", output_format="pandas")
col_mapping = {
            "1. open": "Open",
            "2. high": "High",
            "3. low": "Low",
            "4. close": "Close",
            "5. volume": "Volume"
        }

In [3]:
sp500_slick = slickcharts.get_sp500_metadata()
sp500_slick.set_index(keys="Symbol", inplace=True)
sp500_slick.sort_index(inplace=True)

sp500_wiki = wikipedia.get_sp500_metadata()
sp500_wiki.set_index(keys="Symbol", inplace=True)
sp500_wiki.sort_index(inplace=True)
sp500_wiki.drop(columns="Company", inplace=True)

sp500_md = pd.concat((sp500_wiki, sp500_slick), axis=1)
sp500_md.sort_values(by="Weight")

Unnamed: 0_level_0,Sector,Company,Weight
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,Information Technology,Apple Inc.,1.0
MSFT,Information Technology,Microsoft Corporation,2.0
AMZN,Consumer Discretionary,Amazon.com Inc.,3.0
FB,Information Technology,Facebook Inc. Class A,4.0
BRK.B,Financials,Berkshire Hathaway Inc. Class B,5.0
JPM,Financials,JPMorgan Chase & Co.,6.0
JNJ,Health Care,Johnson & Johnson,7.0
XOM,Energy,Exxon Mobil Corporation,8.0
GOOG,Information Technology,Alphabet Inc. Class C,9.0
GOOGL,Information Technology,Alphabet Inc. Class A,10.0


In [4]:
# definir fechas
from_date, to_date = datetime(1995, 1, 1), datetime.today().date()
# definir tickers y símbolos
indices_avantage = {
    "DJA": "^DJA",
    "DXY": "DX-Y.NYB",
    "NASDAQ": "^IXIC",
    "VIX": "^VIX",
    "VXN": "^VXN",
    "VXO": "^VXO"
}
indices_oanda = {
    "NYA": "US2000_USD",
    "DJI": "US30_USD",
    "FTSE": "UK100_GBP",
    "FCHI": "FR40_EUR",
    "DAX": "DE30_EUR",
    "SSE": "CN50_USD",
    "HSI": "HK33_HKD",
    "Nikkei": "JP225_USD"
}
sectors_stooq = {
    "Consumer disc.": "BI.F",
    "Consumer stap.": "BL.F",
    "Energy": "BM.F",
    "Financials": "BN.F",
    "Health care": "BS.F",
    "Industrials": "JG.F",
    "Inf. tech.": "JB.F",
    "Materials": "JA.F",
    "Utilities": "JE.F",
    "Real state": "BK.F"
}
# TODO: find other than alpha-vantage for stocks
# stocks_avantage = {
#     "**AAPL": "AAPL",
#     "*MSFT": "MSFT",
#     "AMZN": "AMZN",
#     "*GOOG": "GOOG",
#     "FB": "FB",
#     "*JPM": "JPM",
#     "***BRK.B": "BRK-B",
#     "*JNJ": "JNJ",
#     "**XOM": "XOM"
# }
stocks_stooq = {
    "AAPL": "AAPL.US",
    "MSFT": "MSFT.US",
    "AMZN": "AMZN.US",
    "FB": "FB.US",
    "JPM": "JPM.US",
    "BRK-B": "BRK-B.US",
    "JNJ": "JNJ.US",
    "GOOG": "GOOG.US",
    "XOM": "XOM.US"
}
cfds_oanda = {
    "Crude Oil": "WTICO_USD",
    "Gold": "XAU_USD"
}
currencies_oanda = {
    "USDCAD": "USD_CAD",
    "EURUSD": "EUR_USD",
    "GBPUSD": "GBP_USD",
    "USDCNY": "USD_CNY",
    "USDJPY": "USD_JPY"
}
if exist("../data/pickles/filter-sp500-datasets.p"):
    sp500, major_indices, sp500_sectors, major_stocks, major_cfds, major_currencies, major_economical = pk.load(open("../data/pickles/filter-sp500-datasets.p", "rb"))
else:
    # descargar S&P 500
    sp500 = historical.Candles(
        client=oanda_client,
        instrument="SPX500_USD",
        resolution="D",
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        datetime_fmt="%Y-%m-%d",
        timezone="America/New_York"
    ).as_dataframe(index_name="Date")

    major_indices, sp500_sectors, major_stocks, major_cfds, major_currencies = {}, {}, {}, {}, {}
    # descargar indices
    for symbol in indices_avantage:
        major_indices[symbol], _ = avantage_client.get_daily_adjusted(indices_avantage[symbol], outputsize="full")
        major_indices[symbol].index.name = "Date"
        major_indices[symbol].rename(col_mapping, axis="columns", inplace=True)
        major_indices[symbol].sort_index(inplace=True)
        major_indices[symbol].index = pd.to_datetime(major_indices[symbol].index)
        major_indices[symbol].replace(to_replace=0, value=np.nan, inplace=True)
        major_indices[symbol].dropna(how="all", axis="index", inplace=True)
    for symbol in indices_oanda:
        major_indices[symbol] = historical.Candles(
            client=oanda_client,
            instrument=indices_oanda[symbol],
            resolution="D",
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            datetime_fmt="%Y-%m-%d",
            timezone="America/New_York"
        ).as_dataframe(index_name="Date")
    # descargar sectores
    for symbol in sectors_stooq:
        sp500_sectors[symbol] = data.get_data_stooq(sectors_stooq[symbol]).sort_index()
    # descargar stocks
    for symbol in stocks_stooq:
        major_stocks[symbol] = data.get_data_stooq(stocks_stooq[symbol]).sort_index()
#     for symbol in stocks_avantage:
#         major_stocks[symbol], _ = avantage_client.get_daily_adjusted(indices_avantage[symbol], outputsize="full")
#         major_stocks[symbol].index.name = "Date"
#         major_stocks[symbol].rename(col_mapping, axis="columns", inplace=True)
#         major_stocks[symbol].sort_index(inplace=True)
#         major_stocks[symbol].index = pd.to_datetime(major_stocks[symbol].index)
#         major_stocks[symbol].replace(to_replace=0, value=np.nan, inplace=True)
#         major_stocks[symbol].dropna(how="all", axis="index", inplace=True)
    # descargar CFDs
    for symbol in cfds_oanda:
        major_cfds[symbol] = historical.Candles(
            client=oanda_client,
            instrument=cfds_oanda[symbol],
            resolution="D",
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            datetime_fmt="%Y-%m-%d",
            timezone="America/New_York"
        ).as_dataframe(index_name="Date")
    # descargar monedas
    for symbol in currencies_oanda:
        major_currencies[symbol] = historical.Candles(
            client=oanda_client,
            instrument=currencies_oanda[symbol],
            resolution="D",
            from_date=from_date.strftime("%Y-%m-%d"),
            to_date=to_date.strftime("%Y-%m-%d"),
            datetime_fmt="%Y-%m-%d",
            timezone="America/New_York"
        ).as_dataframe(index_name="Date")
    # cargar datos económicos
    major_economical = pd.read_csv("../data/processed/filter-analysis/economic-indicators-short-actual.csv", index_col="Date", parse_dates=True).sort_index()
    # guardar pickle
    pk.dump((sp500, major_indices, sp500_sectors, major_stocks, major_cfds, major_currencies, major_economical), open("../data/pickles/filter-sp500-datasets.p", "wb"))

In [5]:
# indices
X_indices = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["Indices"], sorted(major_indices.keys()))),
        names=["Type", "Name"]
    ),
    data=None
)
for symbol, df in major_indices.iteritems(): X_indices["Indices", symbol] = df.Close.shift()
# sectors
X_sectors = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["S&P 500 sectors"], sorted(sp500_sectors.keys()))),
        names=["Type", "Name"]
    ),
    data=None
)
for symbol, df in sp500_sectors.iteritems(): X_sectors["S&P 500 sectors", symbol] = df.Close.shift()
# stocks
X_stocks = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["Stocks"], sorted(major_stocks.keys()))),
        names=["Type", "Name"]
    ),
    data=None
)
for symbol, df in major_stocks.iteritems(): X_stocks["Stocks", symbol] = df.Close.shift()
# CFDs
X_cfds = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["CFDs"], sorted(major_cfds.keys()))),
        names=["Type", "Name"]
    ),
    data=None
)
for symbol, df in major_cfds.iteritems(): X_cfds["CFDs", symbol] = df.Close.shift()
# currencies
X_currencies = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["Currencies"], sorted(major_currencies.keys()))),
        names=["Type", "Name"]
    ),
    data=None
)
for symbol, df in major_currencies.iteritems(): X_currencies["Currencies", symbol] = df.Close.shift()
# technical indicators
X_technical = pd.DataFrame(
    index=sp500.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["Technical"], ["%R", "CCI", "MOM", "ROC", "RSI"])),
        names=["Type", "Name"]
    ),
    data=None
)
X_technical["%R"] = WILLR(sp500.High.shift(), sp500.Low.shift(), sp500.Close.shift(), 5)
X_technical["CCI"] = CCI(sp500.High.shift(), sp500.Low.shift(), sp500.Close.shift(), 5)
X_technical["MOM"] = MOM(sp500.Close.shift(), 5)
X_technical["ROC"] = ROC(sp500.Close.shift(), 5)
X_technical["RSI"] = RSI(sp500.Close.shift(), 5)
# economical
major_economical.sort_index(axis="columns", inplace=True)
X_economical = pd.DataFrame(
    index=major_economical.index,
    columns=pd.MultiIndex.from_tuples(
        tuples=list(it.product(["Economical"], major_economical.columns.values)),
        names=["Type", "Name"]
    ),
    data=major_economical.values.copy()
)
# master
# TODO: ordenar columnas por tipo de indicador
X_raw = pd.concat(
    objs=(
        X_indices.pct_change(),
        X_sectors.pct_change(),
        X_stocks.pct_change(),
        X_cfds.pct_change(),
        X_currencies.pct_change(),
        X_technical,
        X_economical
    ),
    axis="columns"
)
X_raw.insert(loc=0, column=("Indices", "S&P 500"), value=sp500.Close.shift().pct_change())
# filtrar NANs
X_raw.dropna(how="all", axis="index", inplace=True)
sp500 = sp500.filter(items=X_raw.index, axis="index")
Y_raw = pd.DataFrame(
    index=sp500.index,
    data={
        "S&P 500 Price": sp500.Close,
        "S&P 500 Return": sp500.Close.pct_change(),
        "Class": (sp500.Close.pct_change() >= 0.0).astype(int),
    }
)

In [6]:
X_raw.tail()

Type,Indices,Indices,Indices,Indices,Indices,Indices,Indices,Indices,Indices,Indices,...,RSI,Economical,Economical,Economical,Economical,Economical,Economical,Economical,Economical,Economical
Name,S&P 500,DAX,DJA,DJI,DXY,FCHI,FTSE,HSI,NASDAQ,NYA,...,Unnamed: 12_level_1,ADP nonfarm employment change,Average hourly earnings,CB consumer confidence,Core CPI,GDP,Interest rate decision,Nonfarm payrolls,PPI,Unemployment rate
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-04-26,0.009837,0.00876,0.004245,0.00835,0.004407,0.008761,0.008304,-0.000271,-0.000515,0.004023,...,54.809448,,,,,,,,,
2018-04-29,-0.000375,0.003067,,-0.000325,0.004058,0.002743,0.009538,0.00432,,-0.002933,...,54.133966,,,,,,,,,
2018-04-30,-0.009894,,0.009373,-0.006977,,,-0.002076,,0.016571,-0.009748,...,38.483001,,,,,,,,,
2018-05-01,0.003255,-0.001906,-0.007187,-0.001295,0.003277,0.003009,0.005107,0.009134,-0.007518,0.007171,...,44.962408,,,,,,,,,
2018-05-02,-0.010414,0.01711,,-0.011087,0.006642,0.005181,-0.002056,-0.004132,0.009118,-0.000811,...,31.607284,,,,,,,,,


In [7]:
Y_raw.tail()

Unnamed: 0_level_0,Class,S&P 500 Price,S&P 500 Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-04-26,0,2668.2,-0.000375
2018-04-29,0,2641.8,-0.009894
2018-04-30,1,2650.4,0.003255
2018-05-01,0,2622.8,-0.010414
2018-05-02,1,2629.0,0.002364


In [8]:
# guardar tablas crudas
X_raw.reset_index().to_csv("../data/processed/sp500-ml/features-raw.csv", index=False)
Y_raw.reset_index().to_csv("../data/processed/sp500-ml/target-raw.csv", index=False)

# guardar tablas de los últimos ~20 años
X_short = X_raw.dropna(how="all", axis=0)
X_short = X_short[X_short.index>"1998-01-01"]
Y_short = Y_raw.filter(items=X_short.index)
X_short.reset_index().to_csv("../data/processed/sp500-ml/features.csv", index=False)
Y_short.reset_index().to_csv("../data/processed/sp500-ml/target.csv", index=False)