In [6]:
import pandas as pd
import numpy as np

# Library configuration
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Set output options
pd.set_option('display.max_columns', 3000)
pd.options.display.max_rows = 999

In [7]:
df_apple = pd.read_csv('data/charts/APPLE1440.csv', header=None, names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'], index_col=False)
df_apple.head()

df_amazon = pd.read_csv('data/charts/AMAZON1440.csv', header=None, names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'], index_col=False)
df_amazon.head()

Unnamed: 0,date,time,open,high,low,close,volume
0,2014.06.30,00:00,93.159,94.194,93.028,93.46,2799
1,2014.07.01,00:00,93.973,94.546,93.611,93.993,4787
2,2014.07.02,00:00,94.344,94.515,93.54,93.902,4476
3,2014.07.03,00:00,94.113,94.545,93.661,94.495,3454
4,2014.07.07,00:00,94.936,96.423,94.936,96.403,4638


Unnamed: 0,date,time,open,high,low,close,volume
0,2014.09.29,00:00,320.12,323.6,319.5,321.82,7187
1,2014.09.30,00:00,320.81,323.38,318.6,322.44,8364
2,2014.10.01,00:00,321.27,321.47,315.52,317.46,12002
3,2014.10.02,00:00,316.16,318.4,311.31,318.4,15487
4,2014.10.03,00:00,321.29,325.1,319.5,322.61,11113


Generate features for a time series of stock data.

`direction`:
$$
Dir(d) = \left\{
    \begin{array}{ll}
        \ \ \ 1 \ \ \ \text{if} \ \ Open(d) \le Close(d) \\
        -1 \ \ \  \text{otherwise}
    \end{array}
\right.
$$

`pct_chng`: the percent change between the opening price and closing price.

`prevClose_<i>`: the closing price `i` days ago.

`prevClose_<i>_pctChng`: the percent change from `i` days ago closing price to the previous trading days close price.

In [8]:
def generate_features(df):
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')

    # df['direction'] = (df['close'] >= df['open']).astype(int) # use -1 instead of 0 for logistic regression
    df['direction'] = 1
    df.loc[df['open'] > df['close'], 'direction'] = -1

    df['pct_chng'] = (df['close'] - df['open']) / df['close'] * 100

    for i in range(1,26):
        df[f'prevClose_{i}'] = df['close'].shift(i)
        # df[f'prevClose_{i}_pctChng'] = df['close'].shift(1).pct_change(i-1)
        # df[f'prevClose_{i}_logReturn'] = np.log(df.price) - np.log(df.price.shift(1))

    return df

In [9]:
df_apple = generate_features(df_apple)
df_apple.head(5)

df_amazon = generate_features(df_amazon)
df_amazon.tail(5)

Unnamed: 0,date,time,open,high,low,close,volume,direction,pct_chng,prevClose_1,prevClose_2,prevClose_3,prevClose_4,prevClose_5,prevClose_6,prevClose_7,prevClose_8,prevClose_9,prevClose_10,prevClose_11,prevClose_12,prevClose_13,prevClose_14,prevClose_15,prevClose_16,prevClose_17,prevClose_18,prevClose_19,prevClose_20,prevClose_21,prevClose_22,prevClose_23,prevClose_24,prevClose_25
0,2014-06-30,00:00,93.159,94.194,93.028,93.46,2799,1,0.322063,,,,,,,,,,,,,,,,,,,,,,,,,
1,2014-07-01,00:00,93.973,94.546,93.611,93.993,4787,1,0.021278,93.46,,,,,,,,,,,,,,,,,,,,,,,,
2,2014-07-02,00:00,94.344,94.515,93.54,93.902,4476,-1,-0.470703,93.993,93.46,,,,,,,,,,,,,,,,,,,,,,,
3,2014-07-03,00:00,94.113,94.545,93.661,94.495,3454,1,0.404254,93.902,93.993,93.46,,,,,,,,,,,,,,,,,,,,,,
4,2014-07-07,00:00,94.936,96.423,94.936,96.403,4638,1,1.521737,94.495,93.902,93.993,93.46,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0,date,time,open,high,low,close,volume,direction,pct_chng,prevClose_1,prevClose_2,prevClose_3,prevClose_4,prevClose_5,prevClose_6,prevClose_7,prevClose_8,prevClose_9,prevClose_10,prevClose_11,prevClose_12,prevClose_13,prevClose_14,prevClose_15,prevClose_16,prevClose_17,prevClose_18,prevClose_19,prevClose_20,prevClose_21,prevClose_22,prevClose_23,prevClose_24,prevClose_25
1089,2019-01-28,00:00,1656.96,1662.25,1614.07,1633.73,41878,-1,-1.4219,1670.81,1649.99,1642.1,1633.0,1694.82,1687.02,1678.02,1675.98,1618.0,1640.08,1654.5,1656.6,1653.0,1629.03,1575.66,1500.53,1505.0,1503.37,1477.95,1457.86,1469.1,1342.73,1377.54,1478.5,1494.97
1090,2019-01-29,00:00,1638.23,1639.42,1584.97,1612.87,39136,-1,-1.572352,1633.73,1670.81,1649.99,1642.1,1633.0,1694.82,1687.02,1678.02,1675.98,1618.0,1640.08,1654.5,1656.6,1653.0,1629.03,1575.66,1500.53,1505.0,1503.37,1477.95,1457.86,1469.1,1342.73,1377.54,1478.5
1091,2019-01-30,00:00,1616.53,1685.61,1616.53,1684.04,39652,1,4.008812,1612.87,1633.73,1670.81,1649.99,1642.1,1633.0,1694.82,1687.02,1678.02,1675.98,1618.0,1640.08,1654.5,1656.6,1653.0,1629.03,1575.66,1500.53,1505.0,1503.37,1477.95,1457.86,1469.1,1342.73,1377.54
1092,2019-01-31,00:00,1703.57,1776.99,1670.01,1709.98,51953,1,0.374858,1684.04,1612.87,1633.73,1670.81,1649.99,1642.1,1633.0,1694.82,1687.02,1678.02,1675.98,1618.0,1640.08,1654.5,1656.6,1653.0,1629.03,1575.66,1500.53,1505.0,1503.37,1477.95,1457.86,1469.1,1342.73
1093,2019-02-01,00:00,1638.98,1672.0,1624.34,1624.58,40974,-1,-0.886383,1709.98,1684.04,1612.87,1633.73,1670.81,1649.99,1642.1,1633.0,1694.82,1687.02,1678.02,1675.98,1618.0,1640.08,1654.5,1656.6,1653.0,1629.03,1575.66,1500.53,1505.0,1503.37,1477.95,1457.86,1469.1


In [10]:
df_apple.to_csv('data/charts/ApplePriceData.csv', index=False)
df_amazon.to_csv('data/charts/AmazonPriceData.csv', index=False)