# Selección de features para el S&P 500

## Procedimientos

- Descargar datos:
    - Serie S&P 500
    - Serie para las compañias
    - Series para las monedas
    - Series para los indices
- Calcular retornos
- Calcular variable de clase
- Calcular desfase temporal del retorno
- Calcular indicadores técnicos
- Calcular filtrado de features:
    - Filtrar features con mayor poder predictivo con la variable de clase
    - Filtrar features más independientes entre sí

In [1]:
%pylab inline

import pickle as pk
import pandas as pd
import itertools as it
from pandas_datareader import data
import fix_yahoo_finance

fix_yahoo_finance.pdr_override()

from datetime import datetime

from pyCBT.providers.scrappers import slickcharts, wikipedia
from pyCBT.common.files import exist
from talib import RSI, CCI, ROC, MOM, WILLR

Populating the interactive namespace from numpy and matplotlib


In [2]:
sp500_slick = slickcharts.get_sp500_metadata()
sp500_slick.set_index(keys="Symbol", inplace=True)
sp500_slick.sort_index(inplace=True)

sp500_wiki = wikipedia.get_sp500_metadata()
sp500_wiki.set_index(keys="Symbol", inplace=True)
sp500_wiki.sort_index(inplace=True)
sp500_wiki.drop(columns="Company", inplace=True)

sp500_md = pd.concat((sp500_wiki, sp500_slick), axis=1)
sp500_md.head()

Unnamed: 0_level_0,Sector,Company,Weight
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,Health Care,Agilent Technologies Inc.,240.0
AAL,Industrials,American Airlines Group Inc.,271.0
AAP,Consumer Discretionary,Advance Auto Parts Inc.,458.0
AAPL,Information Technology,Apple Inc.,1.0
ABBV,Health Care,AbbVie Inc.,28.0


In [3]:
# definir fechas
from_date, to_date = datetime(2005, 1, 1), datetime.today().date()
# descargar datos
# TODO: use alpha-vantage instead
indices_yahoo = {
    "DJI": "^DJI",
    "HSI": "^HSI",
    "FCHI": "^FCHI",
    "NYA": "^NYA",
    "VIX": "^VIX"
}
# TODO: use alpha-vantage instead
indices_stooq = {
    "DXY": "USD_I",
    "FTSE": "X.F",
    "DAX": "^DAX",
    "DJA": "DIA.US",
    "NASDAQ": "^NDX",
    "N225": "^NKX",
    "SSE": "^SHC",
    "VXN": "R3.C",
    "VXO": "VI.F"
}
# TODO: use alpha-vantage instead
stocks_stooq = {
    "AAPL": "AAPL.US",
    "MSFT": "MSFT.US",
    "AMZN": "AMZN.US",
    "FB": "FB.US",
    "JPM": "JPM.US",
    "BRK.B": "BRK-B.US",
    "JNJ": "JNJ.US",
    "GOOG": "GOOG.US",
    "XOM": "XOM.US"
}
# TODO: use pyCBT instead
currencies_yahoo = {
    "USDCAD": "CAD=X",
    "GBPUSD": "GBPUSD=X",
    "USDCNY": "CNY=X",
    "USDJPY": "JPY=X",
    "EURUSD": "EURUSD=X"
}

if False:#exist("../data/pickles/filter-sp500-datasets.p"):
    sp500, major_stocks, major_indices, major_currencies = pk.load(open("../data/pickles/filter-sp500-datasets.p", "rb"))
else:
    sp500 = data.get_data_stooq("^SPX")

    major_stocks, major_indices, major_currencies = {}, {}, {}
    
    for ticker in indices_yahoo:
        major_indices[ticker] = data.get_data_yahoo(indices_yahoo[ticker], from_date, to_date)
    for ticker in indices_stooq:
        major_indices[ticker] = data.get_data_stooq(indices_stooq[ticker])
    for ticker in stocks_stooq:
        major_stocks[ticker] = data.get_data_stooq(stocks_stooq[ticker])
    for ticker in currencies_yahoo:
        major_currencies[ticker] = data.get_data_yahoo(currencies_yahoo[ticker], from_date, to_date)
        
    pk.dump((sp500, major_stocks, major_indices, major_currencies), open("../data/pickles/filter-sp500-datasets.p", "wb"))

[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded


In [4]:
# construir tabla master
master_table = pd.DataFrame(index=sp500.index, data=None)
for d in [major_stocks, major_indices, major_currencies]:
    for ticker in sorted(d):
        master_table[ticker] = d[ticker].Close.shift(-1)
# filtrar NANs
master_table.dropna(how="all", axis=0)
print master_table.index[0], master_table.index[-1]
# calcular retornos
Y_return = sp500.Close.pct_change()
X_return = master_table.pct_change()
# calcular indicadores técnicos: rsi, williams_r, cci_symbol, roc, mom
X_techin = pd.DataFrame(index=sp500.index, data=None)
X_techin["RSI"] = RSI(sp500.Close.shift(-1), 5)
X_techin["%R"] = WILLR(sp500.High.shift(-1), sp500.Low.shift(-1), sp500.Close.shift(-1), 5)
X_techin["CCI"] = CCI(sp500.High.shift(-1), sp500.Low.shift(-1), sp500.Close.shift(-1), 5)
X_techin["ROC"] = ROC(sp500.Close.shift(-1), 5)
X_techin["MOM"] = MOM(sp500.Close.shift(-1), 5)
# concatenar retornos + indicadores técnicos
X = pd.concat((X_return, X_techin))
Y = Y_return.astype(int)

AttributeError: 'DataFrame' object has no attribute 'Close'