In [1]:
import os
import sys

import pickle

import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
import sklearn.preprocessing as skp

import scipy.stats as scs
import statsmodels.stats as sms

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
py.init_notebook_mode(connected=True)

In [2]:
from sklearn.linear_model import LogisticRegression

In [3]:
from TCC_pkg import dataanalysis as da
from TCC_pkg import testspecification as tspec
from TCC_pkg import autoregressive as ar


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



# <span style='color:crimson'>1.</span> Input Data

## <span style='color:MediumBlue'>1.1.</span> Stock series

In [4]:
series_name = 'aapl'
with open('../Data/{}.pkl'.format(series_name), 'rb') as f:
    df_raw = pickle.load(f)
    
df_raw

Unnamed: 0_level_0,Close,High,Low,Open,Volume,Direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-01-03,42.354338,42.443013,41.746018,41.849279,111284600.0,
2011-01-04,42.575382,42.730884,42.171849,42.723173,77270200.0,1.0
2011-01-05,42.923655,42.967350,42.345342,42.351768,63879900.0,1.0
2011-01-06,42.888956,43.084298,42.782290,43.016108,75107200.0,-1.0
2011-01-07,43.196105,43.225663,42.653776,42.922370,77982800.0,1.0
2011-01-10,44.010241,44.109839,43.331044,43.544377,112140000.0,1.0
2011-01-11,43.905502,44.332168,43.626626,44.321887,111027000.0,-1.0
2011-01-12,44.262770,44.264056,43.951767,44.112409,75647600.0,1.0
2011-01-13,44.424698,44.548071,44.189517,44.357871,74195100.0,1.0
2011-01-14,44.784537,44.784537,44.265341,44.451686,77210000.0,1.0


In [5]:
da.candleplot(df_raw.drop(columns=['Direction', 'Volume']))

## <span style='color:MediumBlue'>1.2.</span> Network Data

## <span style='color:MediumBlue'>1.3.</span> Sentiment Analysis

## <span style='color:MediumBlue'>1.4.</span> News and Online Data

# <span style='color:crimson'>2.</span> Data Cleaning

## <span style='color:MediumBlue'>2.1.</span> Data Overview

In [6]:
df = df_raw.copy()
print('Shape: {}\n'.format(df.shape))
print('Indices: \n{}\n{}\n'.format(df.index[:5], df.index[-5:]))
print('Columns: \n{}\n'.format(df.columns))
print('Describe(): \n{}\n'.format(df.describe()))

Shape: (1818, 6)

Indices: 
DatetimeIndex(['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-06',
               '2011-01-07'],
              dtype='datetime64[ns]', name='Date', freq=None)
DatetimeIndex(['2018-03-21', '2018-03-22', '2018-03-23', '2018-03-26',
               '2018-03-27'],
              dtype='datetime64[ns]', name='Date', freq=None)

Columns: 
Index(['Close', 'High', 'Low', 'Open', 'Volume', 'Direction'], dtype='object')

Describe(): 
             Close         High          Low         Open        Volume  \
count  1818.000000  1818.000000  1818.000000  1818.000000  1.818000e+03   
mean     93.590007    94.390432    92.756979    93.598986  7.538906e+07   
std      36.015008    36.231882    35.780549    36.004098  5.385936e+07   
min      40.523015    40.828878    39.903578    40.697794  1.147592e+07   
25%      65.139696    65.892813    64.477108    65.238411  3.497941e+07   
50%      90.851617    91.566725    89.977604    90.887746  6.085693e+07   
75%     114.83002

## <span style='color:MediumBlue'>2.2.</span> Data Cleaning

In [7]:
# Adjust time-series
df = df_raw.copy()

# 1. Find zeros or negative values
print('Number of zero-values: {}'.format(df[df==0].count().values))

# 2. Find NaNs
print('Number of NaNs: {}'.format(df[df.isna().values].count().values))

# 3. Eliminate entries with NaNs
df = df.dropna(how='any', axis=0)

# 4. Gaps
date_gaps = np.array([(df.index[i]-df.index[i-1]).days for i in range(1,len(df.index))])
print('Average gap: {}'.format(date_gaps.mean()))
print('Max gap: {}'.format(date_gaps.max()))
print('No. of gaps greater than 5: {}'.format(len(date_gaps[date_gaps>5])))
print('15 greatest gaps: {}'.format(np.sort(date_gaps)[-15:]))
print('Dates of greatest gap: \n{}\n'.format(df.iloc[date_gaps.argmax()-1:date_gaps.argmax()+3]))

df_adj = df

Number of zero-values: [0 0 0 0 0 0]
Number of NaNs: [1 1 1 1 1 0]
Average gap: 1.453193832599119
Max gap: 5
No. of gaps greater than 5: 0
15 greatest gaps: [4 4 4 4 4 4 4 4 4 4 4 4 4 4 5]
Dates of greatest gap: 
                Close       High        Low       Open       Volume  Direction
Date                                                                          
2012-10-25  78.668550  80.276928  78.153848  80.018803  164081400.0       -1.0
2012-10-26  77.953801  79.244427  76.275988  78.654611  254608200.0       -1.0
2012-10-31  76.833538  77.690514  75.850081  76.776751  127500800.0       -1.0
2012-11-01  76.990982  77.824739  76.685116  77.207820   90324500.0        1.0



## <span style='color:MediumBlue'>2.3.</span> Outliers

In [8]:
df = df_adj.copy()
ts = df_adj['Close']
n_std = 3

df = pd.concat([df, da.roll_stats(ts=ts, window=30)], axis=1).dropna(how='any',axis=0)
df['Upper'] = df['Rolling Mean'] + n_std*df['Rolling Variance'].apply(np.sqrt)
df['Lower'] = df['Rolling Mean'] - n_std*df['Rolling Variance'].apply(np.sqrt)

da.plotscatter(df.drop(columns='Direction'))

# Outlier criterion
df_out = df[(df['Close'] > df['Upper']) | (df['Close'] < df['Lower'])]
df['Out'] = df.loc[df_out.index, 'Close']

df = df.replace(np.nan, 0) # for plotting
da.plotscatter(df.drop(columns='Direction'))

df = df.drop(df_out.index)
df_adj_noout = df[['Close']]

# <span style='color:crimson'>*.</span> Input-Output Definition

In [9]:
df = df_adj.copy()

input_cols = ['Close','High','Low','Open','Volume']
target_col = ['Direction']

# The last index does not correspond to a sample because there is no information about the next day's direction
X_raw, y_raw = df.loc[df.index[:-1],input_cols], df[target_col].shift(-1).loc[df.index[:-1]]

# <span style='color:crimson'>3.</span> Data Generation

**Good references: [1,6,33,35,39]**

## <span style='color:MediumBlue'>3.1.</span> Technical Indicators

In [10]:
X = X_raw.copy()
X.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,1816.0,1816.0,1816.0,1816.0,1816.0
mean,93.577059,94.374566,92.74423,93.583385,75389360.0
std,35.972003,36.181699,35.737853,35.954339,53875650.0
min,40.523015,40.828878,39.903578,40.697794,11475920.0
25%,65.17009,65.959239,64.5017,65.344658,34939440.0
50%,90.851617,91.566725,89.977604,90.887746,60856930.0
75%,114.782767,115.398129,113.848545,114.735599,99650600.0
max,181.72,183.5,180.21,182.59,470249500.0


## <span style='color:MediumBlue'>3.*.</span> Final Features

In [11]:
X = X_raw.copy()
y = y_raw.copy()

# Test #1: LAGGED VALS
n_lags = 3
X.shift(1)

X_feat = pd.DataFrame(data=[], index=X_raw.index, columns=[]) # empty dataframe
for lag in range(0,n_lags):
    col_names = {col:col+'(t-%d)'%lag for col in X.columns}
    X_tmp = X.shift(periods=lag)
    X_tmp = X_tmp.rename(columns=col_names)
    X_feat = pd.concat([X_feat, X_tmp], axis=1)
    
X_feat = X_feat.dropna()
y_feat = y.loc[X_feat.index]

y_feat

Unnamed: 0_level_0,Direction
Date,Unnamed: 1_level_1
2011-01-06,1.0
2011-01-07,1.0
2011-01-10,-1.0
2011-01-11,1.0
2011-01-12,1.0
2011-01-13,1.0
2011-01-14,-1.0
2011-01-18,-1.0
2011-01-19,-1.0
2011-01-20,-1.0


# <span style='color:crimson'>4.</span> Data Transformation

## <span style='color:MediumBlue'>4.1.</span> Scaling/Normalization

Fonte: http://scikit-learn.org/stable/modules/preprocessing.html

In [12]:
# Isso tem que virar uma funcao
def scale_train_test_sets(X_train, X_test):
    # Define scaler
    scaler = skp.MinMaxScaler(feature_range=(-1,1))

    # COLUMN-WISE transformation, i.e., transform each feature independently
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns) 

    # Apply same scaling parameters to test set (during test we do not know future values to scale)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train_scaled, X_test_scaled

# <span style='color:crimson'>5.</span> Feature Selection

## <span style='color:MediumBlue'>5.1.</span> Correlation

## <span style='color:MediumBlue'>5.2.</span> Mutual Information

## <span style='color:MediumBlue'>5.3.</span> Regularization

### <span style='color:Goldenrod'>5.3.1.</span> Lasso

### <span style='color:Goldenrod'>5.3.2.</span> Ridge

### <span style='color:Goldenrod'>5.3.3.</span> Elastic Net

## <span style='color:MediumBlue'>5.4.</span> Feature Importance

# <span style='color:crimson'>6.</span> Feature Extraction

## <span style='color:MediumBlue'>6.1.</span> PCA

## <span style='color:MediumBlue'>6.2.</span> AE

# <span style='color:crimson'>7.</span> Test Design

In [13]:
y = y_feat

margin = 5
test_spec = tspec.TestSpec(indices=y.index, 
                            start_dates=['2017-01'], 
                            window_size=400,
                            margin=margin
                           )


# <span style='color:crimson'>*.</span> Train-test split

# <span style='color:crimson'>8.</span> Comparison Metrics

In [17]:
# BOA IMAGEM PRA EXEMPLIFICAR A IMPORTANCIA DE DIFERENCIAR/USAR INDICADORES RELATIVOS
instance = test_spec.instance[0]
X_train, X_test = X.loc[instance.train_set], X.loc[instance.test_set]
X_train, X_test = scale_train_test_sets(X_train, X_test)
da.plotscatter(X_train)
da.plotscatter(X_test)

# <span style='color:crimson'>9.</span> Models

## <span style='color:MediumBlue'>9.1.</span> Logistic Regression

Fontes:
- http://scikit-learn.org/stable/modules/linear_model.html
- http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

**HYPERPARAMETERS**:
- `penalty` : str, `‘l1’` or `‘l2’`, default: ‘l2’
- `C` : float, default: `1.0` - 
    - Inverse of regularization strength; must be a positive float; smaller values = stronger regularization.
        - Small C: probability curves will look more like straight lines (long transition between 0-1)
        - Large C: probability curves will look more like step function (fast transition between 0-1)

In [14]:
# Consider 5 options 
# Hyperparameters
model_func = LogisticRegression
regularizers = ['l1','l2']
reg_coeff = [0.01, 0.033, 0.66, 0.1, 0.33, 0.66, 1, 3.33, 10]
kwparams_list = list({'penalty':reg, 'C':C} for C in reg_coeff for reg in regularizers) 

import warnings
warnings.filterwarnings(action='ignore')

# Observation:
# X = (samples/examples x features/inputs) = (m x n)
X = X_feat.copy()
y = y_feat.copy()

# Define test instance
instance = test_spec.instance[0]

# Define test set
X_test = instance.test_set

# Cross-validation
best_perf = 0
for kwparams in kwparams_list:
    print(kwparams)
    kwparams_perf = 0
    for fold in range(instance.CrossValidation.n_folds):
        # Get cross-validation sets
        train_ind = instance.expanding_window_cv.train_sets[fold]
        val_ind = instance.expanding_window_cv.val_sets[fold]
        X_train, X_val = X.loc[train_ind], X.loc[val_ind]
        y_train, y_val = y.loc[train_ind], y.loc[val_ind]

        assert all(X_train.index == y_train.index) and all(X_val.index == y_val.index)
        
        # Scale values
        X_train, X_val = scale_train_test_sets(X_train, X_val)
        y_train, y_val = y_train.values.ravel(), y_val.values.ravel() # transforms into array (m,)

        # Fit logistic regression model
        log_reg = LogisticRegression(**kwparams)
        log_reg.fit(X_train, y_train)

        y_pred = log_reg.predict(X_val)
        y_pred_prob = log_reg.predict_proba(X_val) # use for AUC-ROC

        # pd.DataFrame(data=da.classification_metrics(y_true=y_test, y_pred=y_pred), columns=['LR'])
        fold_perf = da.classification_metrics(y_true=y_val, y_pred=y_pred)
        kwparams_perf += fold_perf['5. Accuracy']
        
    if kwparams_perf > best_perf:
        best_perf = kwparams_perf
        best_params = kwparams
        
# Train on entire train set
X_train, X_test = X.loc[instance.train_set], X.loc[instance.test_set]
y_train, y_test = y.loc[instance.train_set], y.loc[instance.test_set]

# Scale values
X_train, X_test = scale_train_test_sets(X_train, X_test)
y_train, y_test = y_train.values.ravel(), y_test.values.ravel() # transforms into array (m,)

log_reg = LogisticRegression(**best_params)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test) # use for AUC-ROC

# Test performance
ml_res = {}
ml_res['LR'] = da.classification_metrics(y_true=y_test, y_pred=y_pred)
test_perf = pd.DataFrame(data=ml_res)                                     

{'penalty': 'l1', 'C': 0.01}
{'penalty': 'l2', 'C': 0.01}
{'penalty': 'l1', 'C': 0.033}
{'penalty': 'l2', 'C': 0.033}
{'penalty': 'l1', 'C': 0.66}
{'penalty': 'l2', 'C': 0.66}
{'penalty': 'l1', 'C': 0.1}
{'penalty': 'l2', 'C': 0.1}
{'penalty': 'l1', 'C': 0.33}
{'penalty': 'l2', 'C': 0.33}
{'penalty': 'l1', 'C': 0.66}
{'penalty': 'l2', 'C': 0.66}
{'penalty': 'l1', 'C': 1}
{'penalty': 'l2', 'C': 1}
{'penalty': 'l1', 'C': 3.33}
{'penalty': 'l2', 'C': 3.33}
{'penalty': 'l1', 'C': 10}
{'penalty': 'l2', 'C': 10}


In [15]:
test_perf

Unnamed: 0,LR
1. Mathews_CorrCoef,0.0
2. F-beta_0.5,0.0
3. F1,0.0
4. Cohen_Kappa,0.0
5. Accuracy,0.469055
5. Precision,0.0
5. Recall,0.0
FN,163.0
FP,0.0
TN,144.0


# <span style='color:crimson'>10.</span> Tests

# <span style='color:crimson'>11.</span> Results

# <span style='color:crimson'>12.</span> Comparisons