In [16]:
import pandas as pd
import pandas_datareader as pdr
import datetime
import yfinance as yf
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
np.set_printoptions(suppress=True)

#FEATURES
#Prediction? Number of days trending upwards? Percentage increase?
#Sell after a 5% increase?
#Dissect what all of this means.
#I want to get a precision / recall curve
#Set rounding threshold lower
#Confidence variables?
#Try other models?
#Try other stocks besides stocks that tend to just go up?
#Predict rise in values rather than just simple classification?
#Set rise rate higher / faster?
#Invest money based on confidence / probability?
#We call more about precision & accuracy than recall
#Remove unnecessary features, prevent overfitting?
#Paper trade over previous data...see how we would have fared?
#Automate all of this?
#Format notebook
#Create github
#Check my math & calculations?
#Train, test, validate data (training data, testing data, stock not involved in the training) Validate on CSCO
#NaN values a problem?
#Overfitting based on general bullish tech stocks?
#Rise 5% within 10 days or AT 10 days?
#get model size / kernel size

#BUGS
#RSI 5 sometimes is NaN?
#Check calculations: 9 or 10 days? Including or not including?
#Normalze MACD?

In [17]:
stock_symbols = ["AAPL", "MSFT", "TSLA", "GOOG", "AMZN", "FB", "BABA", "PYPL", "INTC", "CRM", "AMD", "ATVI", "MTCH", "EA", "TTD", "ZG", "YELP", "NVDA"]
stock_data = pd.DataFrame(columns=["Ticker","Date","Open","High","Low","Close"])

In [18]:
for stock in stock_symbols:
    y_finance_data = yf.Ticker(stock).history(start=(datetime.date.today()-datetime.timedelta(days=1825)).strftime("%Y-%m-%d")).reset_index()
    y_finance_data.insert(0,"Ticker", stock)
    y_finance_data = y_finance_data[['Ticker', 'Date','Open','High','Low','Close']]
    stock_data = pd.concat([stock_data, y_finance_data])
stock_data = stock_data.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})

In [19]:
def RSI_calc(start, lookback, df, date_time=True):
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback) < 0:
            return None
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]]
        
        df['GAIN_LOSS'] = df.apply(lambda row : (row["CLOSE"]-row["OPEN"]) / row["OPEN"] * 100, axis=1)
        avg_gain = df.loc[df['GAIN_LOSS'] >= 0]["GAIN_LOSS"].mean()
        avg_loss = df.loc[df['GAIN_LOSS'] < 0]["GAIN_LOSS"].mean() * -1
        RSI = 100 - (100 / (1 + (avg_gain/avg_loss)))
        return RSI

In [20]:
def daily_return(start, lookback, df, date_time=True):
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback) < 0:
            return None
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]+1] #Include day?
        df['daily_change'] = df.apply(lambda row: ((row['CLOSE'] - row['OPEN']) / row['OPEN']) * 100, axis=1)
        avg_daily_return = df['daily_change'].mean()
        return avg_daily_return

In [21]:
def weekly_return(start, lookback, df, date_time=True): #Check math here!!
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback) < 0:
            return None
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]+1] #Include day?
        df = df.iloc[::5, :]
        df = df.reset_index()
        weekly_change_sum = 0
        for i in range(1, len(df)): #Would really like to not be hacky about this...but oh well
            weekly_change_sum += ((df.iloc[i]['CLOSE'] - df.iloc[i-1]['CLOSE']) / df.iloc[i-1]['OPEN']) * 100
        weekly_change_avg = weekly_change_sum / len(df)-1
        return weekly_change_avg

In [22]:
def monthly_return(start, lookback, df, date_time=True):
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback) < 0:
            return None
        if(lookback < 30):
            return None
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]+1] #Include day?
        df = df.iloc[::30, :]
        df = df.reset_index()
        monthly_change_sum = 0
        for i in range(1, len(df)): #Would really like to not be hacky about this...but oh well
            monthly_change_sum += ((df.iloc[i]['CLOSE'] - df.iloc[i-1]['CLOSE']) / df.iloc[i-1]['OPEN']) * 100
        monthly_change_avg = monthly_change_sum / len(df)-1
        return monthly_change_avg

In [23]:
def EMA(start, lookback, df):
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]+1]
        df = df.reset_index()
        SMA = (df['CLOSE'][0:lookback-1].sum()) / lookback
        k = 2 / (lookback + 1)
        EMA = (df.loc[df['DATE']==start]['CLOSE'] * k) + (SMA * (1-k))
        return EMA
    
def MACD(start, lookback1, lookback2, df, date_time=True):
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback2) < 0:
            return None
        EMA_short = EMA(start, lookback1, df)
        EMA_long = EMA(start, lookback2, df)
        return float(EMA_short) - float(EMA_long)

In [24]:
#df = stock_data.loc[stock_data["TICKER"]=="AAPL"]
#MACD("2020-06-30", 10, 30, df, False)

In [25]:
def ratio_avg_to_close(start, lookback, df, date_time=True): #Check math here!!
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])-lookback) < 0:
            return None
        df = df[(df.index[df["DATE"]==start][0])-lookback:df.index[df["DATE"]==start][0]+1]
        df = df.reset_index()
        mean_close = df[0:lookback]['CLOSE'].sum() / lookback
        ratio = mean_close / df.iloc[lookback]['CLOSE']
        return ratio

In [26]:
def target(start, look_forward, df, target, date_time=True):
        if date_time:
            start = start.strftime("%Y-%m-%d")
        if ((df.index[df["DATE"]==start][0])+look_forward) > len(df): #this works
            return None
        df = df[(df.index[df["DATE"]==start][0]):df.index[df["DATE"]==start][0]+look_forward] #+1? 9 or 10 days?
        df = df.reset_index()
        percent_increase = (df.iloc[look_forward-1]['CLOSE'] - df.iloc[0]['CLOSE']) / df.iloc[0]['CLOSE'] * 100
        if percent_increase >= target:
            return True
        else:
            return False

In [27]:
df_list = []
for stock in stock_symbols:
    df = stock_data.loc[stock_data["TICKER"]==stock]
    for num in [5,10,30,60]:
        df['RSI ' + str(num)] = df.apply(lambda row: RSI_calc(row['DATE'], num, df, True), axis=1)
        df["AVG DAILY RETURN " + str(num)] = df.apply(lambda row: daily_return(row['DATE'], num, df, True), axis=1)
        df["AVG WEEKLY RETURN " + str(num)] = df.apply(lambda row: weekly_return(row['DATE'], num, df, True), axis=1)
        df["AVG MONTHLY RETURN " + str(num)] = df.apply(lambda row: monthly_return(row['DATE'], num, df, True), axis=1)
        df["RATIO " + str(num)] = df.apply(lambda row: ratio_avg_to_close(row['DATE'], num, df, True), axis=1)
    df["MACD 10_30"] = df.apply(lambda row: MACD(row['DATE'], 10, 30, df, True), axis=1)
    df["MACD 5_10"] = df.apply(lambda row: MACD(row['DATE'], 5, 10, df, True), axis=1)
    df["MACD 2_10"] = df.apply(lambda row: MACD(row['DATE'], 2, 10, df, True), axis=1)
    df['MACD 10_30 DIFF'] = df['MACD 10_30'].diff()
    df['MACD 5_10 DIFF'] = df['MACD 5_10'].diff()
    df['MACD 2_10 DIFF'] = df['MACD 2_10'].diff()
    df["TARGET"] = df.apply(lambda row: target(row['DATE'], 10, df, 5, True), axis=1)
    df_list.append(df)
stock_data = pd.concat(df_list)
stock_data['TARGET'].value_counts(normalize=True) * 100
stock_data.to_excel(input("BIG STONK")+".xlsx")

BIG STONKBIG STONK


In [28]:
print(stock_data['TARGET'].value_counts(normalize=True) * 100)

import copy
stock_data_backup = copy.deepcopy(stock_data)

stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
stock_data = stock_data.dropna()

input_patterns = stock_data.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:len(stock_data)-10]
input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int)
print(stock_data)

In [29]:

accuracy_trials = []
recall_trials = []
precision_trials = []
average_precision_trials = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33)
    model = XGBClassifier(eval_metric="logloss")
    model.fit(X_train, y_train)
    accuracy_trials.append(accuracy_score(y_test, model.predict(X_test)) * 100.0)
    precision_trials.append(precision_score(y_test, model.predict(X_test)) * 100.0)
    recall_trials.append(recall_score(y_test, model.predict(X_test)) * 100.0)
    average_precision_trials.append(average_precision_score(y_test, model.predict(X_test)))
    
mean_accuracy = sum(accuracy_trials) / len(accuracy_trials)
mean_precision = sum(precision_trials) / len(precision_trials)
mean_recall= sum(recall_trials) / len(recall_trials)
mean_avg_precision= sum(average_precision_trials) / len(average_precision_trials)

print("Mean Accuracy: {}%".format(mean_accuracy))
print("Mean Precision: {}%".format(mean_precision))
print("Mean Recall: {}%".format(mean_recall))
print(model)

''' #FIX
print("Mean Avg. Precision: {}%".format(mean_avg_precision))
disp = precision_recall_curve(X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve: '
                   'AP={0:0.2f}'.format(mean_avg_precision))\
disp.plot()
'''



Mean Accuracy: 78.0747918243755%
Mean Precision: 62.87183633810198%
Mean Recall: 26.501081675123004%
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)


' #FIX\nprint("Mean Avg. Precision: {}%".format(mean_avg_precision))\ndisp = precision_recall_curve(X_test, y_test)\ndisp.ax_.set_title(\'2-class Precision-Recall curve: \'\n                   \'AP={0:0.2f}\'.format(mean_avg_precision))disp.plot()\n'

In [30]:
'''
#Probabilities
X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33)
model = XGBClassifier(objective="binary:logistic", eval_metric="logloss")
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)
print(y_pred)

#Predictions
X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33)
model = XGBClassifier(objective="binary:logistic", eval_metric="logloss")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(y_pred)
'''

'\n#Probabilities\nX_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33)\nmodel = XGBClassifier(objective="binary:logistic", eval_metric="logloss")\nmodel.fit(X_train, y_train)\ny_pred = model.predict_proba(X_test)\nprint(y_pred)\n\n#Predictions\nX_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33)\nmodel = XGBClassifier(objective="binary:logistic", eval_metric="logloss")\nmodel.fit(X_train, y_train)\ny_pred = model.predict(X_test)\nprint(y_pred)\n'

In [31]:
stock_data

Unnamed: 0,TICKER,DATE,OPEN,HIGH,LOW,CLOSE,RSI 5,AVG DAILY RETURN 5,AVG WEEKLY RETURN 5,RATIO 5,...,AVG WEEKLY RETURN 60,AVG MONTHLY RETURN 60,RATIO 60,MACD 10_30,MACD 5_10,MACD 2_10,MACD 10_30 DIFF,MACD 5_10 DIFF,MACD 2_10 DIFF,TARGET
60,AAPL,2016-06-15,22.884297,23.022324,22.699482,22.725216,42.646710,-0.088003,-1.908914,1.013424,...,-1.571687,-3.473605,1.038941,-0.690940,-1.211967,-2.155069,-0.088142,-0.015823,-0.046157,False
61,AAPL,2016-06-16,22.563796,22.867922,22.474898,22.821135,34.303334,0.115543,-2.065985,1.005474,...,-1.589437,-3.583898,1.033185,-0.719664,-1.224458,-2.084743,-0.028724,-0.012491,0.070326,False
62,AAPL,2016-06-17,22.603571,22.610589,22.294767,22.301785,33.924306,-0.301564,-2.776096,1.024483,...,-1.686424,-4.116329,1.055755,-0.817365,-1.370338,-2.337924,-0.097701,-0.145880,-0.253181,False
63,AAPL,2016-06-20,22.458527,22.591875,22.231602,22.247978,36.180652,-0.508560,-2.134847,1.019600,...,-1.664452,-4.064700,1.056528,-0.862200,-1.411335,-2.340943,-0.044835,-0.040996,-0.003019,False
64,AAPL,2016-06-21,22.210543,22.540401,22.149717,22.437468,39.362519,-0.110289,-1.796334,1.006318,...,-1.586261,-3.603432,1.045880,-0.922177,-1.382156,-2.272464,-0.059977,0.029179,0.068479,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1242,NVDA,2021-02-25,561.820007,574.989990,528.650024,532.299988,42.182176,-0.820871,-6.164980,1.093402,...,-0.892103,-0.862682,1.019859,-3.232089,-45.461495,-84.432159,-6.636667,-10.899062,-24.136072,False
1243,NVDA,2021-02-26,550.099976,554.179993,534.440002,548.580017,35.000256,-0.980077,-5.033411,1.038766,...,-0.723018,-0.211081,0.989654,-3.455584,-43.886727,-73.290717,-0.223495,1.574767,11.141442,False
1244,NVDA,2021-03-01,555.000000,557.000000,542.130005,553.669983,36.006720,-0.911307,-2.739454,1.011704,...,-0.675840,0.116048,0.980938,-8.938541,-45.390458,-72.406463,-5.482957,-1.503730,0.884254,False
1245,NVDA,2021-03-02,556.000000,556.820007,535.840027,536.250000,44.928704,-1.030680,-3.613630,1.036901,...,-0.960449,-1.323054,1.013370,-15.703643,-47.369572,-74.058905,-6.765102,-1.979114,-1.652442,False
