In [1]:
#import needed packages
import yfinance as yf
from sklearn.feature_selection import VarianceThreshold
import pandas as pd
import numpy as np
import statistics as stats

In [9]:
# Create the yfinance Ticker for Amazon (AMZN) and pull its price history.
# The start/end date strings are left blank because different date ranges were
# used for the basic and the advanced stock data sets; fill them in before running.
amzn = yf.Ticker("AMZN")
print(amzn)
# NOTE(review): the result of this full-history request is overwritten by the
# very next assignment, so only the start/end request below ends up in StockData.
StockData = amzn.history(period="max")
# NOTE(review): start/end are empty placeholder strings here — TODO confirm how
# the installed yfinance version treats empty dates before relying on this call.
StockData = amzn.history(start="",  end="")

yfinance.Ticker object <AMZN>


In [12]:
# Prediction target for the basic data set: each row receives the Open value
# of the row that follows it (the final row therefore has no target).
StockData["nextDayOpen"] = StockData["Open"].shift(periods=-1)
# Write the basic stock data, including the new target column, to disk.
StockData.to_csv("StockData.csv")

In [5]:
# Load the saved stock data from disk into the working frame `df`.
df = pd.read_csv(filepath_or_buffer="StockData.csv")

Functions used to extract features from the stock data

In [7]:
#Relative Strength Index
def RSI(period, close=None):
    """Compute the Relative Strength Index of a closing-price series.

    Gains and losses are smoothed with an exponentially weighted mean using
    ``com=period - 1`` and ``adjust=False``.

    Parameters
    ----------
    period : int
        Look-back length used for the smoothing.
    close : pandas.Series, optional
        Closing prices. When omitted, the notebook-level ``df['Close']`` is
        used, which keeps existing ``RSI(n)`` calls working unchanged.

    Returns
    -------
    pandas.Series
        RSI values aligned with ``close``. The first entry is NaN because the
        first price has no previous value to difference against.
    """
    if close is None:
        close = df['Close']

    delta = close.diff()
    # clip() leaves the leading NaN in place, so the smoothing starts from the
    # first real price change rather than from an artificial zero.
    gains = delta.clip(lower=0)
    losses = delta.clip(upper=0)

    avg_gain = gains.ewm(com=period - 1, adjust=False).mean()
    avg_loss = losses.ewm(com=period - 1, adjust=False).mean().abs()

    # With no losses in the window the ratio is inf and the RSI evaluates to 100.
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [8]:
#weighted moving average
def WMA(n, close=None):
    """Compute the linearly weighted moving average over ``n`` rows.

    Within each window the oldest value gets weight 1 and the newest gets
    weight ``n``; the weighted sum is divided by the sum of the weights.

    Parameters
    ----------
    n : int
        Window length.
    close : pandas.Series, optional
        Series to average. When omitted, the notebook-level ``df['Close']`` is
        used, which keeps existing ``WMA(n)`` calls working unchanged.

    Returns
    -------
    pandas.Series
        Weighted averages aligned with ``close``; the first ``n - 1`` entries
        are NaN because their windows are incomplete.
    """
    if close is None:
        close = df['Close']

    weights = np.arange(1, n + 1)
    # The normaliser is the same for every window, so compute it only once.
    weight_total = weights.sum()

    return close.rolling(n).apply(
        lambda window: np.dot(window, weights) / weight_total,
        raw=True,
    )

In [9]:
#On-Balance Volume
def OBV(data=None):
    """Compute On-Balance Volume as a running total of signed volume.

    For every row the volume is added when Close is above the previous Close,
    subtracted when it is below, and ignored when it is unchanged. The first
    row has no previous Close and contributes 0.

    The previous implementation tested ``(...).bool`` without calling it. A
    bound method is always truthy, so the first branch ran for every row and
    the result was never accumulated. This version is fully vectorised and
    does not read any pre-existing ``OBV`` column.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``Close`` and ``Volume`` columns. When omitted, the
        notebook-level ``df`` is used, so existing ``OBV()`` calls still work.

    Returns
    -------
    pandas.Series
        Cumulative on-balance volume aligned with ``data``.
    """
    if data is None:
        data = df

    # +1 for an up day, -1 for a down day, 0 when unchanged; the leading NaN
    # from diff() is treated as "no change".
    direction = np.sign(data["Close"].diff()).fillna(0)
    return (direction * data["Volume"]).cumsum()

In [32]:
#Bollinger Bands
def BollingerBand(n, data=None):
    """Add upper and lower Bollinger Band columns for an ``n``-row window.

    The bands are the rolling mean of ``Close`` plus/minus one and two rolling
    standard deviations. Four columns are written in place, in this order:
    ``BollingerBand_Up_<n>_1std``, ``BollingerBand_down_<n>_1std``,
    ``BollingerBand_Up_<n>_2std``, ``BollingerBand_down_<n>_2std``.

    Parameters
    ----------
    n : int
        Rolling window length.
    data : pandas.DataFrame, optional
        Frame with a ``Close`` column that receives the new columns. When
        omitted, the notebook-level ``df`` is used, so existing
        ``BollingerBand(n)`` calls still work.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if data is None:
        data = df

    # Compute the rolling statistics once instead of once per band.
    window = data['Close'].rolling(n)
    middle = window.mean()
    spread = window.std()

    for width in (1, 2):
        suffix = str(n) + "_" + str(width) + "std"
        data["BollingerBand_Up_" + suffix] = middle + width * spread
        data["BollingerBand_down_" + suffix] = middle - width * spread

All the indicators used

In [11]:
# Same-row price ratios: Close against Open, High and Low, plus High against Low.
for ratio_name, numerator, denominator in (
    ("close_to_open", "Close", "Open"),
    ("close_to_high", "Close", "High"),
    ("close_to_low", "Close", "Low"),
    ("High_to_low", "High", "Low"),
):
    df[ratio_name] = df[numerator].div(df[denominator])

In [12]:
# Momentum: current Close divided by the Close `lag` rows earlier.
for lag in (3, 5, 10):
    df[f"{lag}d_momentum"] = df["Close"].div(df["Close"].shift(lag))

In [13]:
# Simple moving averages of Close (window order kept so the column order is unchanged).
for window in (5, 10, 50, 100, 20):
    df[f"MovingAVG_{window}day"] = df["Close"].rolling(window).mean()

In [14]:
# Weighted moving averages via WMA(), using the same windows as the simple averages.
for window in (5, 10, 50, 100, 20):
    df[f"WeightedMovingAVG_{window}day"] = WMA(window)

In [15]:
# Exponential moving averages of Close, one column per span.
for span in (5, 10, 50, 100, 20):
    df[f"ExponentialMovingAVG_{span}day"] = df['Close'].ewm(span=span).mean()

In [16]:
# MACD-style spreads: shorter-span EMA minus longer-span EMA for each adjacent pair.
for fast, slow in ((5, 10), (10, 20), (20, 50), (50, 100)):
    fast_ema = df[f"ExponentialMovingAVG_{fast}day"]
    slow_ema = df[f"ExponentialMovingAVG_{slow}day"]
    df[f"MACD_{fast}-{slow}day"] = fast_ema - slow_ema

In [17]:
# Signal lines: a span-10 exponential mean of each MACD column.
for pair in ("5-10", "10-20", "20-50", "50-100"):
    df[f"2weekSignal_{pair}day"] = df[f"MACD_{pair}day"].ewm(span=10).mean()

In [18]:
# RSI columns for a range of look-back lengths, computed by RSI().
for lookback in (2, 5, 10, 14, 20, 50, 100):
    df[f"RSI_{lookback}day"] = RSI(lookback)

In [19]:
# Highest High seen over each trailing window.
for window in (5, 10, 20, 50, 100):
    df[f"High_{window}day"] = df["High"].rolling(window).max()

In [20]:
# Lowest Low seen over each trailing window.
for window in (5, 10, 20, 50, 100):
    df[f"Low_{window}day"] = df["Low"].rolling(window).min()

In [21]:
# Mean Volume over each trailing window.
for window in (5, 10, 20, 50, 100):
    df[f"VolumeAVG_{window}day"] = df["Volume"].rolling(window).mean()

In [22]:
# Ratio of the previous row's Close to the current row's Open, followed by its
# rolling means over several windows.
df["YCloseToTOpen"] = df["Close"].shift(1).div(df["Open"])
for window in (5, 10, 20, 50, 100):
    df[f"YCloseToTOpen_{window}day"] = df["YCloseToTOpen"].rolling(window).mean()

In [23]:
# Rolling means of the close_to_open ratio.
for window in (5, 10, 20, 50, 100):
    df[f"close_to_open_{window}day"] = df["close_to_open"].rolling(window).mean()

In [24]:
# Rolling means of the close_to_high ratio.
for window in (5, 10, 20, 50, 100):
    df[f"close_to_high_{window}day"] = df["close_to_high"].rolling(window).mean()

In [25]:
# Rolling means of the close_to_low ratio.
for window in (5, 10, 20, 50, 100):
    df[f"close_to_low_{window}day"] = df["close_to_low"].rolling(window).mean()

In [26]:
# Stochastic-oscillator style feature: where the previous row's Close sits
# inside the [rolling Low, rolling High] range of each window.
for window in (5, 10, 20, 50, 100):
    window_low = df[f"Low_{window}day"]
    window_high = df[f"High_{window}day"]
    df[f"stochastic_oscillator_{window}day"] = (
        (df["Close"].shift(1) - window_low) / (window_high - window_low)
    )

In [27]:
# Rolling standard deviation of Open.
for window in (5, 10, 20, 50, 100):
    df[f"openSTD_{window}day"] = df["Open"].rolling(window).std()

In [28]:
# Rolling standard deviation of Close.
for window in (5, 10, 20, 50, 100):
    df[f"closeSTD_{window}day"] = df["Close"].rolling(window).std()

In [30]:
# Seed the OBV column with zeros so that it exists before OBV() is evaluated.
df["OBV"] = 0
# Replace the seed values with the series returned by OBV().
df["OBV"] = OBV()

In [33]:
# Add the 1-std and 2-std Bollinger Band columns to df for every window.
for window in (5, 10, 20, 50, 100):
    BollingerBand(window)

In [54]:
# Band width (upper minus lower) for each window, 2-std first and then 1-std,
# so the columns are created in the same order as before.
for window in (5, 10, 20, 50, 100):
    for width in (2, 1):
        upper = df[f"BollingerBand_Up_{window}_{width}std"]
        lower = df[f"BollingerBand_down_{window}_{width}std"]
        df[f"BollingerBand_{window}day{width}stdDiff"] = upper - lower

In [57]:
# Trim the rows that should not appear in the final data set:
#   1. drop the rows labelled 0..198 from the start of df,
#   2. renumber the remaining rows from 0,
#   3. drop the rows labelled 2476 and 2477.
# NOTE(review): 2476/2477 are hard-coded labels — presumably the trailing rows
# of the date range that was used; confirm whenever the date range changes.
ranges = np.arange(start=0, stop=199)
out = (
    df.drop(index=ranges)
    .reset_index(drop=True)
    .drop(index=[2476, 2477])
)

In [60]:
# Write the final feature table to disk as CSV.
out.to_csv(path_or_buf='ExtrackedStockData.csv')