# DATA-EXTRACTION

## IMPORTS

In [149]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## FUNCTIONS

In [150]:
def fetch_yfinnace(stock, startDate, endDate, interval='1d'):
    """Download price history for ``stock`` through ``yf.download``.

    Parameters
    ----------
    stock : ticker symbol(s), forwarded unchanged to ``yf.download``.
    startDate, endDate : download window, forwarded as ``start`` / ``end``.
    interval : bar size forwarded to ``yf.download`` (default ``'1d'``).

    Returns
    -------
    The downloaded frame with the ``'Adj Close'`` column removed when present.
    """
    print(stock)
    # NOTE(review): yfinance documents 'ticker' and 'column' for group_by; the
    # original value 'tickers' is kept as-is -- confirm the intended grouping.
    df = yf.download(stock, start=startDate, end=endDate, interval=interval, group_by='tickers')
    # 'Adj Close' is not returned when yfinance auto-adjusts prices, so a
    # missing column must not abort the download step.
    df = df.drop(columns=['Adj Close'], errors='ignore')
    return df

In [151]:
def create_label(df, timeframe=-1):
    """Return a new frame with a ``'Label'`` column built from ``'Close'``.

    Parameters
    ----------
    df : frame containing a ``'Close'`` column.
    timeframe : row offset passed to ``Series.shift``; the default ``-1``
        puts the next row's ``Close`` on the current row.

    Returns
    -------
    A new DataFrame; the frame passed in is left unmodified, so re-running a
    cell that displays the original still shows what it showed before.
    Rows with no source value after the shift hold NaN in ``'Label'``.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Label=df["Close"].shift(timeframe))

In [152]:
def split_data(percentage, df, dropLabels):
    """Split ``df`` into ordered train/test feature and label arrays.

    Parameters
    ----------
    percentage : fraction of rows assigned to the training split; the first
        ``int(len(df) * percentage)`` rows are used for training.
    df : frame containing a ``Label`` column plus the feature columns.
    dropLabels : column names removed before the feature matrix is built.

    Returns
    -------
    ``(x_train, x_test, y_train, y_test)`` as NumPy arrays. Rows keep their
    original order; nothing is shuffled.
    """
    # Row index at which the training split ends and the test split begins.
    splitValue = int(len(df) * percentage)

    # `columns=` replaces the positional axis argument (`df.drop(labels, 1)`),
    # which pandas 2.0 rejects with a TypeError.
    x = np.array(df.drop(columns=dropLabels))
    y = np.ravel(df.Label)  # labels as a flat 1-D array

    x_train, x_test = x[:splitValue], x[splitValue:]
    y_train, y_test = y[:splitValue], y[splitValue:]

    return x_train, x_test, y_train, y_test

In [153]:
def MM_Scaler(x_train, x_test):
    """Min-max scale both splits with a scaler fitted on ``x_train`` only.

    Returns the transformed ``(x_train, x_test)`` pair.
    """
    scaler = MinMaxScaler()
    scaler.fit(x_train)  # the test split never contributes to the fit

    scaled_train = scaler.transform(x_train)
    scaled_test = scaler.transform(x_test)
    return scaled_train, scaled_test

In [154]:
def normalization():
    """Placeholder for a normalization step; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("normalization is not implemented yet")

In [155]:
def standardization(x_train, x_test):
    """Standardize both splits with a ``StandardScaler`` fitted on ``x_train`` only.

    Returns the transformed ``(x_train, x_test)`` pair.
    """
    fitted = StandardScaler()
    fitted.fit(x_train)  # the test split never contributes to the fit

    train_scaled = fitted.transform(x_train)
    test_scaled = fitted.transform(x_test)
    return train_scaled, test_scaled

### DATA FEATURES

Data features are based on this study: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4873195/

### TYPE 1 FEATURES

In [156]:
# Stochastic %K
def stochastic_k(df, timeframe=14):
    """Return ``df`` joined with a ``'stocK'`` column.

    ``stocK`` is ``100 * (Close - rolling min of Low) / (rolling max of High -
    rolling min of Low)``, with both rolling windows spanning ``timeframe`` rows.
    """
    lowest_low = df['Low'].rolling(timeframe).min()
    highest_high = df['High'].rolling(timeframe).max()
    percent_k = 100 * ((df['Close'] - lowest_low) / (highest_high - lowest_low))
    return df.join(percent_k.rename('stocK'))

In [157]:
# Stochastic %D
def stochastic_d(df, timeframe=3):
    """Return ``df`` joined with ``'stocD'``, the rolling mean of ``'stocK'`` over ``timeframe`` rows."""
    smoothed_k = df['stocK'].rolling(timeframe).mean()
    return df.join(smoothed_k.rename('stocD'))

In [158]:
# Stochastic slow %D
def stochastic_sd(df, timeframe=3):
    """Return ``df`` joined with ``'stocSD'``, the rolling mean of ``'stocD'`` over ``timeframe`` rows."""
    smoothed_d = df['stocD'].rolling(timeframe).mean()
    return df.join(smoothed_d.rename('stocSD'))

In [159]:
def momentum(df, timeframe=14):
    """Return ``df`` joined with ``'Momentum'``: ``Close`` minus ``Close`` from ``timeframe`` rows earlier."""
    close_change = df['Close'].diff(timeframe)
    return df.join(close_change.rename('Momentum'))

In [160]:
#TODO
# Rate of change
def rate_of_change(df, timeframe):
    """Placeholder for the rate-of-change feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("rate_of_change is not implemented yet")

In [161]:
#TODO
# Larry William's %R
def larry_williams(df, timeframe):
    """Placeholder for the Larry Williams %R feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("larry_williams is not implemented yet")

In [162]:
#TODO
# A/O Oscillator (accumulation/distribution oscillator)
def ao_oscillator(df, timeframe):
    """Placeholder for the A/O oscillator feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("ao_oscillator is not implemented yet")

In [163]:
#TODO
# Disparity
def disparity(df, timeframe):
    """Placeholder for the disparity feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("disparity is not implemented yet")

In [164]:
#TODO
# Price oscillator
def price_oscillator(df, timeframe):
    """Placeholder for the price oscillator feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("price_oscillator is not implemented yet")

In [165]:
#TODO
# Commodity channel index
def commodity_channel_index(df, timeframe):
    """Placeholder for the commodity channel index feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("commodity_channel_index is not implemented yet")

In [166]:
#TODO
# Relative strength index
def relative_strength_index(df, timeframe):
    """Placeholder for the relative strength index feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("relative_strength_index is not implemented yet")

In [167]:
# Feature addition function
def add_type1_features(df, timeframe):
    """Return ``df`` with the implemented type 1 feature columns appended.

    ``timeframe`` is used for ``stochastic_k`` and ``momentum``; the two
    stochastic smoothers run with their default window of 3. The order
    matters: ``stochastic_d`` reads ``'stocK'`` and ``stochastic_sd`` reads
    ``'stocD'``.
    """
    # Not added yet: rate of change, Larry Williams %R, A/O oscillator,
    # 5-day and 10-day disparity, price oscillator, CCI, RSI.
    return (
        df.pipe(stochastic_k, timeframe)
        .pipe(stochastic_d)   # default window of 3
        .pipe(stochastic_sd)  # default window of 3
        .pipe(momentum, timeframe)
    )

### TYPE 2 FEATURES

In [168]:
#TODO
# OBV
def obv(df, timeframe):
    """Placeholder for the OBV feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("obv is not implemented yet")

In [169]:
# Moving Average
def moving_average(df, timeframe):
    """Return ``df`` joined with ``'MA'``: the rolling sum of ``Close`` over ``timeframe`` rows divided by ``timeframe``."""
    window_total = df["Close"].rolling(timeframe).sum()
    average = window_total / timeframe
    return df.join(average.rename('MA'))

In [170]:
#TODO
#BIAS6
def bias():
    """Placeholder for the BIAS6 feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("bias is not implemented yet")

In [171]:
#TODO
# PSY
def psy(df, timeframe):
    """Placeholder for the PSY feature; not implemented yet.

    Raises
    ------
    NotImplementedError : always.
    """
    # `null` is not a Python name, so the previous body raised NameError when called.
    raise NotImplementedError("psy is not implemented yet")

In [172]:
# SY
def sy(df):
    """Return ``df`` joined with ``'SY'``: ``100 * (ln(Close) - ln(previous row's Close))``."""
    log_today = np.log(df['Close'])
    log_previous = np.log(df['Close'].shift(1))
    log_return_pct = (log_today - log_previous) * 100
    return df.join(log_return_pct.rename('SY'))

In [173]:
# ASY
def asy(df, timeframe):
    """Return ``df`` joined with ``'ASY<timeframe>'``: the rolling sum of ``'SY'`` over ``timeframe`` rows divided by ``timeframe``."""
    column_name = 'ASY' + str(timeframe)
    window_total = df["SY"].rolling(timeframe).sum()
    return df.join((window_total / timeframe).rename(column_name))

In [174]:
#TODO
# Feature addition function
def add_type2_features(df, timeframe):
    """Return ``df`` with the implemented type 2 feature columns appended.

    Adds ``'MA'`` (window 5), ``'SY'``, and ``'ASY5'`` down to ``'ASY1'``.
    ``timeframe`` is accepted but not used by the current body; every window
    is fixed here.
    """
    # Not added yet: OBV, BIAS6, PSY.
    df = moving_average(df, 5)
    df = sy(df)  # must precede asy(), which reads the 'SY' column
    for window in (5, 4, 3, 2, 1):
        df = asy(df, window)
    return df

## STEP 1: LOAD DATA

In [175]:
# Download daily NVDA bars for 2019-01-08 .. 2020-01-08; `data` feeds every later step.
data = fetch_yfinnace('NVDA', startDate='2019-01-08', endDate='2020-01-08', interval='1d')

NVDA
[*********************100%***********************]  1 of 1 completed


In [176]:
# Preview the first rows of the raw download.
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-07,138.5,144.889999,136.429993,143.399994,17729000
2019-01-08,146.690002,146.779999,136.899994,139.830002,19650400
2019-01-09,141.899994,144.490005,139.860001,142.580002,15431500
2019-01-10,141.800003,145.580002,139.360001,145.229996,13078900
2019-01-11,144.330002,149.75,143.210007,148.830002,21869100


In [177]:
# There are roughly 251 trading days per year on average, though the exact count varies by year.
data.shape

(253, 5)

## STEP 2: ADD LABEL AND FEATURES

In [178]:
# Label first, then both feature families. The shifted label and the rolling
# windows leave NaN rows at the edges, which dropna() removes.
data = (
    create_label(data)
    .pipe(add_type1_features, 14)
    .pipe(add_type2_features, 14)
    .dropna()
)

In [179]:
# Preview the frame after the label and feature columns were added and NaN rows dropped.
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Label,stocK,stocD,stocSD,Momentum,MA,SY,ASY5,ASY4,ASY3,ASY2,ASY1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-01-31,137.259995,145.190002,136.380005,143.75,21071300,144.729996,42.670676,22.021422,26.155683,-1.479996,142.179999,4.525209,-1.870123,-2.700879,1.358319,4.415431,4.525209
2019-02-01,144.5,146.789993,142.580002,144.729996,15626200,149.179993,45.950447,36.668886,24.769444,-4.100006,139.095999,0.679423,-2.024819,1.188595,3.170095,2.602316,0.679423
2019-02-04,145.369995,150.679993,144.479996,149.179993,13214800,149.949997,60.843339,49.821487,36.170598,-1.26001,141.329999,3.028367,1.556549,3.134663,2.744333,1.853895,3.028367
2019-02-05,149.660004,151.429993,148.300003,149.949997,13560600,153.0,63.420327,56.738038,47.742804,0.080002,144.999997,0.51483,2.610696,2.186957,1.40754,1.771599,0.51483
2019-02-06,151.289993,155.600006,151.070007,153.0,17561600,147.419998,73.627833,65.963833,57.507786,4.160004,148.121997,2.013604,2.152287,1.559056,1.852267,1.264217,2.013604


## STEP 3: NORMALIZE AND SPLIT DATA

In [180]:
# Every column except 'Close' is in the drop list, so the feature matrix
# passed on holds the close price alone. The first 80% of rows go to training.
x_train, x_test, y_train, y_test = split_data(
    percentage=.8,
    df=data,
    dropLabels=[
        'Label', 'Open', 'High', 'Low' , 'Volume',
        'stocK', 'stocD', 'stocSD', 'Momentum',
        "MA", 'SY', "ASY5", "ASY4", "ASY3", "ASY2", "ASY1",
    ],
)

In [181]:
# Min-max scale both splits; MM_Scaler fits the scaler on the training split only.
x_train, x_test = MM_Scaler(x_train, x_test)

In [182]:
# x_train, x_test = standardization(x_train, x_test)

In [185]:
# Spot-check the first scaled row of each split.
# NOTE(review): the saved output below shows a negative training value, which a
# default MinMaxScaler cannot produce for the data it was fit on, and the
# execution counter jumps from 182 to 185. The output looks stale (possibly from
# the standardization variant) -- restart the kernel and run all cells to refresh it.
print(x_train[0], x_test[0])

[-1.55489797] [2.26094034]
