# DATA-EXTRACTION

## IMPORTS

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## FUNCTIONS

In [2]:
def fetch_yfinnace(stock, startDate, endDate, interval='1d'):
    print(stock)
    df = yf.download(stock, start=startDate, end=endDate, interval=interval, group_by='tickers')
    df = df.drop(columns=['Adj Close'])
    return df

In [3]:
def create_label(df, timeframe=-1):
    df['Label'] = df['Close'].shift(timeframe)
    return df

In [4]:
def split_data(percentage, df, dropLabels):
    splitValue = int(len(df) * percentage) # Used for calculating the split for test train based on data size
    
    x = np.array(df.drop(dropLabels, 1)) # Drop all unused columns
    y = np.ravel(df.Label) # take only labels

    x_train, x_test = x[:splitValue], x[splitValue:]
    y_train, y_test = y[:splitValue], y[splitValue:]
    
    return x_train, x_test, y_train, y_test

In [5]:
def MM_Scaler(x_train, x_test):

    linearScaler = MinMaxScaler().fit(x_train)

    # Scale train and test
    x_train = linearScaler.transform(x_train)
    x_test = linearScaler.transform(x_test)
    return x_train, x_test

In [6]:
def standardization(x_train, x_test):
    scaler = StandardScaler().fit(x_train)
    
    # Scale train and test
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    return x_train, x_test

### DATA FEATURES

Data features are based on this study: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4873195/

### TYPE 1 FEATURES

In [7]:
# Stochastic %K
def stochastic_k(df, timeframe=14):
    return df.join(pd.Series(100 * ((df['Close'] - df['Low'].rolling(timeframe).min()) / (df['High'].rolling(timeframe).max() - df['Low'].rolling(timeframe).min())), name='stocK'))

In [8]:
# Stochastic %D
def stochastic_d(df, timeframe=3):
    return df.join(pd.Series(df['stocK'].rolling(timeframe).mean(), name='stocD'))

In [9]:
# Stochastic slow %D
def stochastic_sd(df, timeframe=3):
    return df.join(pd.Series(df['stocD'].rolling(timeframe).mean(), name='stocSD'))

In [10]:
# Momentum
def momentum(df, timeframe=14):
    return df.join(pd.Series(df['Close'].diff(timeframe), name='Momentum'))

In [11]:
# Rate of change
def rate_of_change(df, timeframe):  
    return df.join(pd.Series( ((df["Close"] / df["Close"].shift(timeframe)) -1) * 100 , name='ROC') )

In [12]:
# Larry William's %R
def larry_williams(df, timeframe):
    df['Ln'] = df['Low'].rolling(window=timeframe).min()
    df['Hn'] = df['High'].rolling(window=timeframe).max()  
    return df.join(pd.Series( ( df['Hn'] -df['Close'] ) / (df['Hn'] - df['Ln']) * 100 , name='LWR')).drop(columns=['Ln', 'Hn'])

In [13]:
# A/O Oscillator (accumulation/distribution oscillator)
def ao_oscillator(df):
    return df.join(pd.Series((df['High'] - df['Close'].shift(1) ) / (df['High'] - df['Low']) , name='AOosci'))

In [14]:
# Disparity
def disparity(df, timeframe):
    return df.join(pd.Series(df['Close'] / (df['Close'].rolling(timeframe).sum()/timeframe) * 100, name='Disp' + str(timeframe) ))

In [15]:
# Price oscillator
def price_oscillator(df):
    df['MA5'] = df['Close'].rolling(5).sum()/5
    df['MA10'] = df['Close'].rolling(10).sum()/10
    return df.join(pd.Series(df['MA5'] - df['MA10']/df['MA5'], name='OSCP')).drop(columns=['MA5', 'MA10'])

In [16]:
# Commodity channel index
def CCI(df, timeframe):
    TP = (df['High'] + df['Low'] + df['Close']) / 3 
    return df.join(pd.Series((TP - TP.rolling(timeframe).mean()) / (0.015 * TP.rolling(timeframe).std()), name='CCI'))

In [17]:
# Relative strength index
def RSI(df, timeframe):
    chg = df['Close'].diff(1)
    gain = chg.mask(chg<0,0)
    #data['gain'] = gain
    loss = chg.mask(chg>0,0)
    #data['loss'] = loss
    avg_gain = gain.ewm(com = timeframe - 1, min_periods = timeframe).mean()
    avg_loss = loss.ewm(com = timeframe - 1, min_periods = timeframe).mean()
    #data['avg_gain'] = avg_gain
    #data['avg_loss'] = avg_loss
    rs = abs(avg_gain/avg_loss)
    rsi = 100-(100/(1+rs))
    
    return df.join(pd.Series(rsi,name='RSI'))

In [18]:
# Feature addition function
def add_type1_features(df, timeframe):
    df = stochastic_k(df, timeframe)
    df = stochastic_d(df)  # Default as 3 
    df = stochastic_sd(df) # Default as 3 
    df = momentum(df, timeframe)
    df = rate_of_change(df, timeframe)
    df = larry_williams(df, timeframe)
    df = ao_oscillator(df) # No timeframe
    df = disparity(df, 5) # Default as 5 
    df = disparity(df, 10) # Default as 10 
    df = price_oscillator(df)
    df = CCI(df, timeframe)
    df = RSI(df, timeframe)
    return df

### TYPE 2 FEATURES

In [19]:
# OBV
def obv(df):
    df['OBV'] = np.where(df['Close'] > df['Close'].shift(1), df['Volume'], np.where(df['Close'] < df['Close'].shift(1), -df['Volume'], 0)).cumsum()
    return df

In [20]:
# Moving Average
def moving_average(df, timeframe):
    return df.join(pd.Series(df['Close'].rolling(timeframe).sum()/timeframe, name='MA'))

In [21]:
# BIAS 
def bias(df, timeframe):
    return df.join(pd.Series(( (df['Close'] - (df['Close'].rolling(timeframe).sum()/timeframe) ) / (df['Close'].rolling(timeframe).sum()/timeframe) ) , name='BIAS' + str(timeframe)))

In [22]:
# PSY - ratio of the number of rising periods over the n day period
def psy(df, timeframe):    
    df['A'] = np.where(df['Close'].shift(-1) - df['Close'] >0, 1, 0)
    return df.join(round(pd.Series(df['A'].rolling(timeframe).sum()/timeframe * 100, name='PSY' + str(timeframe)),1)).drop(columns=['A'])

In [23]:
# SY
def sy(df):
    return df.join(pd.Series((np.log(df['Close']) - np.log(df['Close'].shift(1))) * 100 , name='SY'))

In [24]:
# ASY
def asy(df, timeframe):
    return df.join(pd.Series(df['SY'].rolling(timeframe).sum()/timeframe, name='ASY' + str(timeframe)))

In [25]:
# Feature addition function
def add_type2_features(df, timeframe):
    df = obv(df)
    df = moving_average(df, 5)
    df = bias(df, 6) # Default value 6
    df = psy(df, 12) # Default value 12
    df = sy(df)
    df = asy(df, 5) # Default value 5
    df = asy(df, 4) # Default value 4
    df = asy(df, 3) # Default value 3
    df = asy(df, 2) # Default value 2
    df = asy(df, 1) # Default value 1
    df = df.drop(columns=['SY']) #USED ONLY FOR ASY CALCULATIONS
    return df

## STEP 1: LOAD DATA

In [26]:
source = fetch_yfinnace('NVDA', startDate='2019-01-08', endDate='2020-01-08', interval='1d')

NVDA
[*********************100%***********************]  1 of 1 completed


In [27]:
source.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-07,138.5,144.889999,136.429993,143.399994,17729000
2019-01-08,146.690002,146.779999,136.899994,139.830002,19650400
2019-01-09,141.899994,144.490005,139.860001,142.580002,15431500
2019-01-10,141.800003,145.580002,139.360001,145.229996,13078900
2019-01-11,144.330002,149.75,143.210007,148.830002,21869100


In [28]:
# On average there are ~ 251 trading days per year, however, it might vary a little depending on the year.
source.shape

(253, 5)

## STEP 2: ADD LABEL AND FEATURES

In [29]:
data = create_label(source)
data = add_type1_features(data, 14)
data = add_type2_features(data, 14)
data = data.dropna()

In [30]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Label,stocK,stocD,stocSD,Momentum,...,RSI,OBV,MA,BIAS6,PSY12,ASY5,ASY4,ASY3,ASY2,ASY1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-31,137.259995,145.190002,136.380005,143.75,21071300,144.729996,42.670676,22.021422,26.155683,-1.479996,...,48.256657,49040500,142.179999,-0.007183,66.7,-1.870123,-2.700879,1.358319,4.415431,4.525209
2019-02-01,144.5,146.789993,142.580002,144.729996,15626200,149.179993,45.950447,36.668886,24.769444,-4.100006,...,49.183378,64666700,139.095999,0.014901,75.0,-2.024819,1.188595,3.170095,2.602316,0.679423
2019-02-04,145.369995,150.679993,144.479996,149.179993,13214800,149.949997,60.843339,49.821487,36.170598,-1.26001,...,53.275596,77881500,141.329999,0.059693,75.0,1.556549,3.134663,2.744333,1.853895,3.028367
2019-02-05,149.660004,151.429993,148.300003,149.949997,13560600,153.0,63.420327,56.738038,47.742804,0.080002,...,53.966386,91442100,144.999997,0.050315,75.0,2.610696,2.186957,1.40754,1.771599,0.51483
2019-02-06,151.289993,155.600006,151.070007,153.0,17561600,147.419998,73.627833,65.963833,57.507786,4.160004,...,56.697306,109003700,148.121997,0.045558,75.0,2.152287,1.559056,1.852267,1.264217,2.013604


## STEP 3: NORMALIZE AND SPLIT DATA

In [31]:
x_train, x_test, y_train, y_test = split_data(.8, data, ['Label', 'Open', 'High', 'Low' , 'Volume', 'stocK', 'stocD',
                                                         'stocSD','Momentum', 'ROC', 'LWR', 'AOosci', 'Disp5', 'Disp10', 
                                                         'OSCP', 'CCI', 'RSI', 'OBV', 'MA', 'BIAS6', 'PSY12', 'ASY5', 
                                                         'ASY4', 'ASY3', 'ASY2', 'ASY1'])

In [32]:
x_train, x_test = MM_Scaler(x_train, x_test)

In [33]:
# x_train, x_test = standardization(x_train, x_test)

In [34]:
print(x_train[0], x_test[0])

[0.13655666] [0.94699363]
