# Train a Deep NN to predict Asset Price movements

## Imports & Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
np.random.seed(42)

In [4]:
idx = pd.IndexSlice

## Build daily dataset

In [5]:
prices = (pd.read_hdf('../Data_Processing/Data/H5/assets.h5', 'Alt_Energy/prices')
          .loc[idx['2019':'2021', :], ['Close', 'Volume']])
prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5544 entries, (Timestamp('2019-01-07 00:00:00'), 'BE') to (Timestamp('2021-01-05 00:00:00'), 'VWSYF')
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   5544 non-null   float64
 1   Volume  5544 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 107.0+ KB


In [6]:
n_dates = len(prices.index.unique('Date'))
dollar_vol = (prices.Close.mul(prices.Volume)
              .unstack('Ticker')
              .dropna(thresh=int(.95 * n_dates), axis=1)
              .rank(ascending=False, axis=1)
              .stack('Ticker'))

In [7]:
returns = (prices.loc[idx[:], 'Close']
           .unstack('Ticker')
           .pct_change()
           .sort_index(ascending=False))
returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 504 entries, 2021-01-05 to 2019-01-07
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BE      503 non-null    float64
 1   BWEN    503 non-null    float64
 2   CSIQ    503 non-null    float64
 3   DOGEF   503 non-null    float64
 4   ENPH    503 non-null    float64
 5   FSLR    503 non-null    float64
 6   NEE     503 non-null    float64
 7   RUN     503 non-null    float64
 8   SEDG    503 non-null    float64
 9   SIEGY   503 non-null    float64
 10  VWSYF   503 non-null    float64
dtypes: float64(11)
memory usage: 47.2 KB


### Stack 21-day time series

In [8]:
n = len(prices)
T = 21 # days
tcols = list(range(T))
tickers = returns.columns

In [9]:
data = pd.DataFrame()
for i in range(n-T-1):
    df = returns.iloc[i:i+T+1]
    date = df.index.max()
    data = pd.concat([data, 
                      df.reset_index(drop=True).T
                      .assign(date=date, ticker=tickers)
                      .set_index(['ticker', 'date'])])
data = data.rename(columns={0: 'label'}).sort_index().dropna()
data.loc[:, tcols[1:]] = (data.loc[:, tcols[1:]].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                  upper=x.quantile(.99))))
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5302 entries, ('BE', Timestamp('2019-02-07 00:00:00')) to ('VWSYF', Timestamp('2021-01-05 00:00:00'))
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5302 non-null   float64
 1   1       5302 non-null   float64
 2   2       5302 non-null   float64
 3   3       5302 non-null   float64
 4   4       5302 non-null   float64
 5   5       5302 non-null   float64
 6   6       5302 non-null   float64
 7   7       5302 non-null   float64
 8   8       5302 non-null   float64
 9   9       5302 non-null   float64
 10  10      5302 non-null   float64
 11  11      5302 non-null   float64
 12  12      5302 non-null   float64
 13  13      5302 non-null   float64
 14  14      5302 non-null   float64
 15  15      5302 non-null   float64
 16  16      5302 non-null   float64
 17  17      5302 non-null   float64
 18  18      5302 non-null   float64
 19  19      5302 non-null   float64
 

In [10]:
data.shape

(5302, 22)

In [11]:
data.to_hdf('data.h5', 'returns_daily')

## Build weekly dataset

We load the Quandl adjusted stock price data:

In [15]:
prices = (pd.read_hdf('../Data_Processing/Data/H5/assets.h5', 'Alt_Energy/prices')
              .Close
          .unstack().loc['2019':])
prices.head()

Ticker,BE,BWEN,CSIQ,DOGEF,ENPH,FSLR,NEE,RUN,SEDG,SIEGY,VWSYF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-07,12.41,1.61,16.799999,58.250946,5.26,45.790001,40.522408,10.56,35.009998,47.956219,74.181648
2019-01-08,12.73,1.6,17.129999,57.650864,5.35,46.23,40.818771,10.71,35.0,48.032772,74.181648
2019-01-09,12.14,1.6,17.209999,57.650864,5.63,46.959999,40.642365,11.47,35.459999,48.415531,77.006691
2019-01-10,12.02,1.53,17.620001,59.151073,5.63,47.740002,41.253918,11.51,35.990002,48.636688,75.81823
2019-01-11,12.24,1.73,17.66,58.979622,5.68,47.5,41.329189,11.65,36.400002,47.658508,75.233742


### Resample to weekly frequency

In [20]:
returns = (prices
           .resample('D')
           .last()
           .pct_change()
           #.dropna(axis=1)
           .sort_index(ascending=True))
returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 730 entries, 2019-01-07 to 2021-01-05
Freq: D
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   BE      729 non-null    float64
 1   BWEN    729 non-null    float64
 2   CSIQ    729 non-null    float64
 3   DOGEF   729 non-null    float64
 4   ENPH    729 non-null    float64
 5   FSLR    729 non-null    float64
 6   NEE     729 non-null    float64
 7   RUN     729 non-null    float64
 8   SEDG    729 non-null    float64
 9   SIEGY   729 non-null    float64
 10  VWSYF   729 non-null    float64
dtypes: float64(11)
memory usage: 68.4 KB


In [21]:
returns.head().append(returns.tail())

Ticker,BE,BWEN,CSIQ,DOGEF,ENPH,FSLR,NEE,RUN,SEDG,SIEGY,VWSYF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-07,,,,,,,,,,,
2019-01-08,0.025786,-0.006211,0.019643,-0.010302,0.01711,0.009609,0.007314,0.014205,-0.000286,0.001596,0.0
2019-01-09,-0.046347,0.0,0.00467,0.0,0.052336,0.015791,-0.004322,0.070962,0.013143,0.007969,0.038083
2019-01-10,-0.009885,-0.04375,0.023823,0.026022,0.0,0.01661,0.015047,0.003487,0.014947,0.004568,-0.015433
2019-01-11,0.018303,0.130719,0.00227,-0.002899,0.008881,-0.005027,0.001825,0.012163,0.011392,-0.020112,-0.007709
2021-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-04,-0.047802,-0.003783,0.027322,0.025043,-0.018408,0.023453,-0.037978,0.008648,-0.024348,0.000418,0.034137
2021-01-05,0.039208,-0.072152,-0.040274,-0.006038,0.053646,-0.089688,0.00741,0.044441,0.017537,0.00905,-0.005158


### Create & stack 52-week sequences

We'll use 52-week sequences, which we'll create in a stacked format:

In [22]:
n = len(returns)
T = 52 # weeks
tcols = list(range(T))
tickers = returns.columns

In [23]:
data = pd.DataFrame()
for i in range(n-T-1):
    df = returns.iloc[i:i+T+1]
    date = df.index.max()    
    data = pd.concat([data, (df.reset_index(drop=True).T
                             .assign(date=date, ticker=tickers)
                             .set_index(['ticker', 'date']))])
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7447 entries, ('BE', Timestamp('2019-02-28 00:00:00')) to ('VWSYF', Timestamp('2021-01-04 00:00:00'))
Data columns (total 53 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       7436 non-null   float64
 1   1       7447 non-null   float64
 2   2       7447 non-null   float64
 3   3       7447 non-null   float64
 4   4       7447 non-null   float64
 5   5       7447 non-null   float64
 6   6       7447 non-null   float64
 7   7       7447 non-null   float64
 8   8       7447 non-null   float64
 9   9       7447 non-null   float64
 10  10      7447 non-null   float64
 11  11      7447 non-null   float64
 12  12      7447 non-null   float64
 13  13      7447 non-null   float64
 14  14      7447 non-null   float64
 15  15      7447 non-null   float64
 16  16      7447 non-null   float64
 17  17      7447 non-null   float64
 18  18      7447 non-null   float64
 19  19      7447 non-null   float64
 

In [24]:
data[tcols] = (data[tcols].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                  upper=x.quantile(.99))))

In [25]:
data = data.rename(columns={0: 'fwd_returns'})

In [26]:
data['label'] = (data['fwd_returns'] > 0).astype(int)

In [27]:
data.shape

(7447, 54)

In [28]:
data.sort_index().to_hdf('data.h5', 'returns_weekly')