In [754]:
import pandas as pd
import pandas_datareader as pdr
import datetime
import yfinance as yf
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import matplotlib.pyplot as plt
import xlrd
from featexp import get_univariate_plots
from joblib import dump, load
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from matplotlib import pyplot
from numpy import sort
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import copy
from sklearn.model_selection import cross_validate

%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4


pd.options.mode.chained_assignment = None
np.set_printoptions(suppress=True)

#FEATURES
#Prediction? Number of days trending upwards? Percentage increase?
#Sell after a 5% increase?
#Dissect what all of this means.
#I want to get a precision / recall curve
#Set rounding threshold lower
#Confidence variables?
#Try other models?
#Try other stocks besides stocks that tend to just go up?
#Predict rise in values rather than just simple classification?
#Set rise rate higher / faster?
#Invest money based on confidence / probability?
#We care more about precision & accuracy than recall
#Remove unnecessary features, prevent overfitting?
#Paper trade over previous data...see how we would have fared?
#Automate all of this?
#NaN values a problem?
#Overfitting based on general bullish tech stocks?
#Rise 5% within 10 days or AT 10 days?
#get model size / kernel size


#TODOS
#check calculations. RSI in particular. 9 or 10 days?
#Understand features, overfitting. early_stopping? logloss vs roc_auc? 
#Normalize MACD? Normalize all values to be below 1?
#Try different model? cat one?
#Change train, test, validate to 2 yrs, 1 yr, 1yr
#AUC vs Precision as metric?

In [755]:
def RSI_calc(start, lookback, df, date_time=True): #Calculate RSI for a given day
    """Return a simplified RSI over the `lookback` rows that precede `start`.

    The per-row change is the intraday move ``(CLOSE - OPEN) / OPEN * 100``.
    The row for `start` itself is NOT part of the window.

    Parameters:
        start: date to evaluate; when `date_time` is true it must provide
            ``strftime`` and is converted to ``"%Y-%m-%d"`` before matching.
        lookback: number of preceding rows in the window.
        df: frame with ``DATE``, ``OPEN`` and ``CLOSE`` columns. The index
            label of the matching row is used as a positional offset, so the
            index is expected to be 0..n-1.
        date_time: whether `start` needs the ``strftime`` conversion.

    Returns:
        None when fewer than `lookback` rows precede `start` (or no usable
        change exists), otherwise the RSI in [0, 100].
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")
    row_pos = df.index[df["DATE"] == start][0]
    if row_pos - lookback < 0:
        return None
    window = df[row_pos - lookback:row_pos]

    # Vectorized: no helper column is written into a slice of the caller's frame.
    change = (window["CLOSE"] - window["OPEN"]) / window["OPEN"] * 100
    gains = change[change >= 0]
    losses = change[change < 0]
    if gains.empty and losses.empty:
        return None

    # An empty side used to yield mean() == NaN and therefore a NaN RSI;
    # treat a missing side as an average of 0 instead.
    avg_gain = gains.mean() if not gains.empty else 0.0
    avg_loss = -losses.mean() if not losses.empty else 0.0
    if avg_loss == 0:
        # No losing days in the window: RS is unbounded, RSI saturates at 100.
        return 100.0
    return 100 - (100 / (1 + (avg_gain / avg_loss)))

In [756]:
def daily_return(start, lookback, df, date_time=True): #Calculate daily return
    """Return the mean intraday percentage move over a window ending at `start`.

    The window holds the `lookback` rows before `start` plus the `start` row
    itself; each row contributes ``(CLOSE - OPEN) / OPEN * 100``.

    Parameters:
        start: date to evaluate; converted with ``strftime("%Y-%m-%d")`` when
            `date_time` is true.
        lookback: number of rows before `start` to include.
        df: frame with ``DATE``, ``OPEN`` and ``CLOSE`` columns; the matching
            row's index label is used as a positional offset.
        date_time: whether `start` needs the ``strftime`` conversion.

    Returns:
        None when fewer than `lookback` rows precede `start`, otherwise the
        mean percentage move.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")
    row_pos = df.index[df["DATE"] == start][0]
    if row_pos - lookback < 0:
        return None
    window = df[row_pos - lookback:row_pos + 1]
    # Column arithmetic replaces the row-wise apply and avoids writing a
    # 'daily_change' column into a slice of the caller's frame.
    daily_change = (window["CLOSE"] - window["OPEN"]) / window["OPEN"] * 100
    return daily_change.mean()

In [757]:
def weekly_return(start, lookback, df, date_time=True): #Calculate weekly return
    """Return the mean percentage change between rows sampled every 5th row.

    The window holds the `lookback` rows before `start` plus `start` itself.
    Every 5th row of that window is kept, and consecutive samples contribute
    ``(CLOSE_i - CLOSE_{i-1}) / OPEN_{i-1} * 100``.

    Parameters:
        start: date to evaluate; converted with ``strftime("%Y-%m-%d")`` when
            `date_time` is true.
        lookback: number of rows before `start` to include.
        df: frame with ``DATE``, ``OPEN`` and ``CLOSE`` columns; the matching
            row's index label is used as a positional offset.
        date_time: whether `start` needs the ``strftime`` conversion.

    Returns:
        None when the window is not available or holds fewer than two
        samples, otherwise the mean change over the sampled intervals.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")
    row_pos = df.index[df["DATE"] == start][0]
    if row_pos - lookback < 0:
        return None
    window = df[row_pos - lookback:row_pos + 1] #Include day?
    samples = window.iloc[::5, :]
    closes = samples["CLOSE"].tolist()
    opens = samples["OPEN"].tolist()

    intervals = len(closes) - 1
    if intervals < 1:
        # A single sample has no change to average.
        return None

    # NOTE(review): the denominator is the previous sample's OPEN, not its
    # CLOSE - kept as written; confirm which base price is intended.
    weekly_change_sum = sum(
        ((closes[i] - closes[i - 1]) / opens[i - 1]) * 100
        for i in range(1, len(closes))
    )
    # Was `sum / len(df)-1`, which evaluates as (sum / len) - 1.
    return weekly_change_sum / intervals

In [758]:
def monthly_return(start, lookback, df, date_time=True): #Calculate monthly return
    """Return the mean percentage change between rows sampled every 30th row.

    The window holds the `lookback` rows before `start` plus `start` itself.
    Every 30th row of that window is kept, and consecutive samples contribute
    ``(CLOSE_i - CLOSE_{i-1}) / OPEN_{i-1} * 100``.

    Parameters:
        start: date to evaluate; converted with ``strftime("%Y-%m-%d")`` when
            `date_time` is true.
        lookback: number of rows before `start` to include; values below 30
            cannot span one sampling step and yield None.
        df: frame with ``DATE``, ``OPEN`` and ``CLOSE`` columns; the matching
            row's index label is used as a positional offset.
        date_time: whether `start` needs the ``strftime`` conversion.

    Returns:
        None when the window is unavailable or too short, otherwise the mean
        change over the sampled intervals.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")
    row_pos = df.index[df["DATE"] == start][0]
    if row_pos - lookback < 0:
        return None
    if lookback < 30:
        return None
    window = df[row_pos - lookback:row_pos + 1] #Include day?
    samples = window.iloc[::30, :]
    closes = samples["CLOSE"].tolist()
    opens = samples["OPEN"].tolist()

    intervals = len(closes) - 1
    if intervals < 1:
        return None

    # NOTE(review): the denominator is the previous sample's OPEN, not its
    # CLOSE - kept as written; confirm which base price is intended.
    monthly_change_sum = sum(
        ((closes[i] - closes[i - 1]) / opens[i - 1]) * 100
        for i in range(1, len(closes))
    )
    # Was `sum / len(df)-1`, which evaluates as (sum / len) - 1.
    return monthly_change_sum / intervals

In [759]:
def EMA(start, lookback, df): #Single-step EMA used by MACD
    """Return a one-step exponential moving average of CLOSE at `start`.

    The simple average of the `lookback` closes before `start` seeds the
    average, and the close on `start` is blended in with the smoothing factor
    ``k = 2 / (lookback + 1)``. This is a single smoothing step, not a
    recursively accumulated EMA.

    Parameters:
        start: value matched against the ``DATE`` column (already formatted by
            the caller).
        lookback: number of rows before `start` used for the seed average.
            No bounds check is done here; MACD guards the window before
            calling.
        df: frame with ``DATE`` and ``CLOSE`` columns; the matching row's
            index label is used as a positional offset.

    Returns:
        A pandas Series holding the blended value for the row(s) whose DATE
        equals `start` (one element when dates are unique).
    """
    row_pos = df.index[df["DATE"] == start][0]
    window = df[row_pos - lookback:row_pos + 1].reset_index()
    # Average all `lookback` prior closes. The previous slice [0:lookback-1]
    # summed one value too few while still dividing by `lookback`.
    SMA = window['CLOSE'].iloc[:lookback].sum() / lookback
    k = 2 / (lookback + 1)
    return (window.loc[window['DATE'] == start]['CLOSE'] * k) + (SMA * (1 - k))
    
def MACD(start, lookback1, lookback2, df, date_time=True):
    """Return the difference between the short and long EMA at `start`.

    `start` is formatted with ``strftime("%Y-%m-%d")`` when `date_time` is
    true. None is returned when fewer than `lookback2` rows precede `start`;
    otherwise the result is ``EMA(lookback1) - EMA(lookback2)`` as a float.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")

    first_needed_row = df.index[df["DATE"] == start][0] - lookback2
    if first_needed_row < 0:
        return None

    short_value = float(EMA(start, lookback1, df))
    long_value = float(EMA(start, lookback2, df))
    return short_value - long_value

In [760]:
def ratio_avg_to_close(start, lookback, df, date_time=True): #Calc ratios
    """Return (average of the prior `lookback` closes) / (close on `start`).

    `start` is formatted with ``strftime("%Y-%m-%d")`` when `date_time` is
    true. Returns None when fewer than `lookback` rows precede `start`.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")

    anchor = df.index[df["DATE"] == start][0]
    window_start = anchor - lookback
    if window_start < 0:
        return None

    # Rows 0..lookback-1 are the history; row `lookback` is the `start` day.
    window = df[window_start:anchor + 1].reset_index()
    history_mean = window[0:lookback]['CLOSE'].sum() / lookback
    anchor_close = window.iloc[lookback]['CLOSE']
    return history_mean / anchor_close

In [761]:
def target(start, look_forward, df, target, date_time=True): #Calculate target @ # of days
    """Label whether CLOSE has risen by at least `target` percent at the end of the window.

    The window is the `look_forward` rows beginning at `start`; the close of
    its last row is compared with the close on `start`. `start` is formatted
    with ``strftime("%Y-%m-%d")`` when `date_time` is true.

    Returns None when the window would run past the end of `df`, otherwise
    True or False.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")

    first_row = df.index[df["DATE"] == start][0]
    window_end = first_row + look_forward
    if window_end > len(df):
        return None

    window = df[first_row:window_end].reset_index() #+1? 9 or 10 days?
    opening_close = window.iloc[0]['CLOSE']
    closing_close = window.iloc[look_forward - 1]['CLOSE']
    percent_increase = (closing_close - opening_close) / opening_close * 100
    return True if percent_increase >= target else False

In [762]:
def calcutron(stock_data, stock_symbols, csv_export, csv_name, percent_increase, days_increase): #All the calculations
    """Add the engineered feature columns and the TARGET label, ticker by ticker.

    For every symbol the rows whose ``TICKER`` matches are copied, then:
      * RSI / average daily, weekly, monthly return / close ratio columns are
        added for the lookbacks 5, 10, 30 and 60;
      * three MACD columns and their first differences are added;
      * ``TARGET`` is filled by `target` ("at N days" labelling).

    Parameters:
        stock_data: frame with at least ``TICKER``, ``DATE``, ``OPEN`` and
            ``CLOSE``; ``DATE`` values must support ``strftime``.
        stock_symbols: tickers to process, in output order.
        csv_export: when truthy the result is written with ``to_excel``.
        csv_name: destination path for the Excel export.
        percent_increase: threshold passed to `target`.
        days_increase: look-forward length passed to `target`.

    Returns:
        The per-ticker frames concatenated in `stock_symbols` order.
    """
    lookbacks = (5, 10, 30, 60)
    # (column prefix, feature function) - order fixes the column order, which
    # later label-based column slices depend on.
    per_lookback_features = (
        ("RSI ", RSI_calc),
        ("AVG DAILY RETURN ", daily_return),
        ("AVG WEEKLY RETURN ", weekly_return),
        ("AVG MONTHLY RETURN ", monthly_return),
        ("RATIO ", ratio_avg_to_close),
    )
    macd_pairs = ((10, 30), (5, 10), (2, 10))

    df_list = []
    for stock in stock_symbols:
        # Explicit copy: the columns below are added to this frame, not to a
        # filtered view of the caller's data.
        df = stock_data.loc[stock_data["TICKER"] == stock].copy()
        for num in lookbacks:
            for prefix, feature_func in per_lookback_features:
                df[prefix + str(num)] = df.apply(
                    lambda row: feature_func(row['DATE'], num, df, True), axis=1)
        for short_span, long_span in macd_pairs:
            df["MACD {}_{}".format(short_span, long_span)] = df.apply(
                lambda row: MACD(row['DATE'], short_span, long_span, df, True), axis=1)
        for short_span, long_span in macd_pairs:
            column = "MACD {}_{}".format(short_span, long_span)
            df[column + " DIFF"] = df[column].diff()
        df["TARGET"] = df.apply(
            lambda row: target(row['DATE'], days_increase, df, percent_increase, True), axis=1)
        df_list.append(df)

    stock_data = pd.concat(df_list)
    # (A bare `value_counts` expression used to sit here; inside a function its
    # result was discarded, so it has been removed.)
    if csv_export:
        stock_data.to_excel(csv_name)
    return stock_data

In [763]:
def just_stock_data_within(train_stocks, export_csv, csv_name, days, percent_increase, days_increase, features=None):
    """Download price history, build features with "within N days" targets, and slice model inputs.

    Parameters:
        train_stocks: ticker symbols to download through yfinance.
        export_csv: forwarded to `calcutron_within`; truthy writes an Excel file.
        csv_name: Excel path forwarded to `calcutron_within`.
        days: how many calendar days back from today the history starts.
        percent_increase: target threshold forwarded to `calcutron_within`.
        days_increase: look-forward length forwarded to `calcutron_within`.
        features: optional list of feature columns. When omitted, a
            notebook-level ``features`` variable is used if one exists
            (the previous implicit behaviour); otherwise every column from
            'RSI 5' through 'MACD 2_10 DIFF' is used.

    Returns:
        (stock_data, input_patterns, input_label): the cleaned frame, the
        feature rows, and the integer TARGET labels for the same rows.
    """
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")

    # Collect per-ticker frames and concatenate once instead of growing a
    # frame (seeded with an empty one) inside the loop.
    histories = []
    for stock in train_stocks:
        y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
        y_finance_data.insert(0, "Ticker", stock)
        histories.append(y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']])
    stock_data = pd.concat(histories)
    stock_data = stock_data.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
    stock_data = calcutron_within(stock_data, train_stocks, export_csv, csv_name, percent_increase, days_increase)

    # These two columns are dropped before dropna so they do not wipe out
    # every row (monthly_return returns None for lookbacks below 30).
    stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
    stock_data = stock_data.dropna()

    if features is None:
        # Previously `features` was read as an undeclared global, which raised
        # NameError on a fresh kernel. Honour it if present, else fall back.
        features = globals().get("features")
    if features is None:
        input_patterns = stock_data.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:len(stock_data)-10]
    else:
        input_patterns = stock_data[features][60:len(stock_data)-10]
    input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int) #Why are these floats? It bothers me.
    return stock_data, input_patterns, input_label

In [764]:
def just_stock_data(train_stocks, export_csv, csv_name, days, percent_increase, days_increase, features=None):
    """Download price history, build features with "at N days" targets, and slice model inputs.

    Parameters:
        train_stocks: ticker symbols to download through yfinance.
        export_csv: forwarded to `calcutron`; truthy writes an Excel file.
        csv_name: Excel path forwarded to `calcutron`.
        days: how many calendar days back from today the history starts.
        percent_increase: target threshold forwarded to `calcutron`.
        days_increase: look-forward length forwarded to `calcutron`.
        features: optional list of feature columns. When omitted, a
            notebook-level ``features`` variable is used if one exists
            (the previous implicit behaviour); otherwise every column from
            'RSI 5' through 'MACD 2_10 DIFF' is used.

    Returns:
        (stock_data, input_patterns, input_label): the cleaned frame, the
        feature rows, and the integer TARGET labels for the same rows.
    """
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")

    # Collect per-ticker frames and concatenate once instead of growing a
    # frame (seeded with an empty one) inside the loop.
    histories = []
    for stock in train_stocks:
        y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
        y_finance_data.insert(0, "Ticker", stock)
        histories.append(y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']])
    stock_data = pd.concat(histories)
    stock_data = stock_data.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
    stock_data = calcutron(stock_data, train_stocks, export_csv, csv_name, percent_increase, days_increase)

    # These two columns are dropped before dropna so they do not wipe out
    # every row (monthly_return returns None for lookbacks below 30).
    stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
    stock_data = stock_data.dropna()

    if features is None:
        # Previously `features` was read as an undeclared global, which raised
        # NameError on a fresh kernel. Honour it if present, else fall back.
        features = globals().get("features")
    if features is None:
        input_patterns = stock_data.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:len(stock_data)-10]
    else:
        input_patterns = stock_data[features][60:len(stock_data)-10]
    input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int) #Why are these floats? It bothers me.
    return stock_data, input_patterns, input_label

In [765]:
#Create base model, read from the Excel file if we have one, perform 100 metric trials
def model_creation(train_stocks, test_size, read_csv, export_csv, csv_name, days, percent_increase, days_increase, features=None, prints=False):
    """Build the "at N days" data set and train/evaluate an XGBClassifier over 100 random splits.

    Parameters:
        train_stocks: ticker symbols to download when `read_csv` is false.
        test_size: forwarded to ``train_test_split``.
        read_csv: when truthy, load the prepared data from the Excel file
            `csv_name` instead of downloading and recomputing features.
        export_csv: forwarded to `calcutron` (writes the Excel file).
        csv_name: Excel path used for reading or exporting.
        days: how many calendar days back from today the history starts.
        percent_increase: target threshold forwarded to `calcutron`.
        days_increase: look-forward length forwarded to `calcutron`.
        features: optional list of feature columns; None selects every column
            from 'RSI 5' through 'MACD 2_10 DIFF'.
        prints: when truthy, print the inputs, class balance and mean metrics.

    Returns:
        (model, stock_data, X_train, X_test, y_train, y_test) where the model
        and the split are those of the LAST of the 100 trials.
    """
    if read_csv:
        stock_data = pd.read_excel(csv_name, engine='openpyxl')
    else:
        start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
        # Concatenate once instead of growing a frame inside the loop.
        histories = []
        for stock in train_stocks:
            y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
            y_finance_data.insert(0, "Ticker", stock)
            histories.append(y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']])
        stock_data = pd.concat(histories)
        stock_data = stock_data.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
        stock_data = calcutron(stock_data, train_stocks, export_csv, csv_name, percent_increase, days_increase)

    stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
    stock_data = stock_data.dropna()
    print(stock_data)
    if features is None:
        input_patterns = stock_data.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:len(stock_data)-10]
    else:
        input_patterns = stock_data[features][60:len(stock_data)-10]

    input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int) #Why are these floats? It bothers me.
    if prints:
        print(input_patterns)
        print(stock_data['TARGET'].value_counts(normalize=True) * 100)

    accuracy_trials = []
    recall_trials = []
    precision_trials = []
    average_precision_trials = []  # was appended to without ever being created
    auc_trials = []

    for _ in range(100):
        X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = test_size)
        model = XGBClassifier(eval_metric="auc", use_label_encoder=False)
        model.fit(X_train, y_train)

        # Predict once per trial; ranking metrics use the positive-class
        # probability rather than the already-thresholded labels.
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)[:, 1]

        accuracy_trials.append(accuracy_score(y_test, y_pred) * 100.0)
        precision_trials.append(precision_score(y_test, y_pred) * 100.0)
        recall_trials.append(recall_score(y_test, y_pred) * 100.0)
        average_precision_trials.append(average_precision_score(y_test, y_score) * 100.0)
        # Scaled by 100 so the "%" in the report below is accurate.
        auc_trials.append(roc_auc_score(y_test, y_score) * 100.0)

    mean_accuracy = sum(accuracy_trials) / len(accuracy_trials)
    mean_precision = sum(precision_trials) / len(precision_trials)
    mean_recall= sum(recall_trials) / len(recall_trials)
    mean_auc = sum(auc_trials) / len(auc_trials)

    if prints:
        print("Validation Trial")
        print("Mean Accuracy: {}%".format(mean_accuracy))
        print("Mean Precision: {}%".format(mean_precision))
        print("Mean Recall: {}%".format(mean_recall))
        print("Mean AUC: {}%".format(mean_auc))
        print(model)
    return model, stock_data, X_train, X_test, y_train, y_test

In [766]:
#Perform testing trials on given stock
def trials(model, threshold, test_stocks, days, export_csv, csv_name, percent_increase, days_increase, features, prints=False):
    """Evaluate a fitted model on freshly downloaded data for each test ticker.

    For every ticker the history is downloaded, features and "at N days"
    targets are built with `calcutron`, and the model is scored twice: with
    its default decision rule and with the positive class assigned only when
    its predicted probability exceeds `threshold`.

    Parameters:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        threshold: probability above which the positive class is assigned in
            the second report.
        test_stocks: ticker symbols to evaluate.
        days: how many calendar days back from today the history starts.
        export_csv, csv_name: accepted for signature compatibility; not used.
        percent_increase: target threshold forwarded to `calcutron`.
        days_increase: look-forward length forwarded to `calcutron`.
        features: list of feature columns, or None for every column from
            'RSI 5' through 'MACD 2_10 DIFF'.
        prints: when truthy, print both metric reports per ticker.

    Returns:
        The cleaned per-ticker frames concatenated in `test_stocks` order
        (an empty DataFrame when `test_stocks` is empty).
    """
    #Add ability to read from CSV here?
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
    evaluated_frames = []
    for stock in test_stocks:
        print(stock)
        y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
        y_finance_data.insert(0, "Ticker", stock)
        stock_data_testing = y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']]
        stock_data_testing = stock_data_testing.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
        stock_data_testing = calcutron(stock_data_testing, [stock], False, None, percent_increase, days_increase)
        stock_data_testing = stock_data_testing.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
        stock_data_testing = stock_data_testing.dropna()

        # The slice bound used to be len(stock_data), a name that does not
        # exist in this function; it must be this ticker's own frame.
        last_row = len(stock_data_testing) - 10
        if features is None:
            input_patterns_testing = stock_data_testing.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:last_row]
        else:
            input_patterns_testing = stock_data_testing[features][60:last_row]
        input_label_testing = stock_data_testing['TARGET'][60:last_row].astype(int)
        print(input_patterns_testing)

        y_score = model.predict_proba(input_patterns_testing)[:, 1]
        y_pred = model.predict(input_patterns_testing)
        y_prob_round = (y_score > threshold).astype(int)

        # AUC ranks by probability, so it is the same for both reports.
        auc_testing = roc_auc_score(input_label_testing, y_score) * 100

        accuracy_testing = accuracy_score(input_label_testing, y_pred) * 100.0
        precision_testing = precision_score(input_label_testing, y_pred) * 100.0
        recall_testing = recall_score(input_label_testing, y_pred) * 100.0

        accuracy_rounded = accuracy_score(input_label_testing, y_prob_round) * 100.0
        precision_rounded = precision_score(input_label_testing, y_prob_round) * 100.0
        recall_rounded = recall_score(input_label_testing, y_prob_round) * 100.0

        if prints:
            print(stock)
            print("Standard Threshold")
            print("Accuracy {}: {}%".format(stock, accuracy_testing))
            print("Precision {}: {}%".format(stock, precision_testing))
            print("Recall {}: {}%".format(stock, recall_testing))
            print("AUC {}: {}%".format(stock, auc_testing))
            print()
            print("Rounding Threshold: {}".format(threshold))
            print("Accuracy {}: {}%".format(stock, accuracy_rounded))
            print("Precision {}: {}%".format(stock, precision_rounded))
            print("Recall {}: {}%".format(stock, recall_rounded))
            print("AUC {}: {}%".format(stock, auc_testing))

        evaluated_frames.append(stock_data_testing)

    if not evaluated_frames:
        return pd.DataFrame()
    return pd.concat(evaluated_frames)

In [767]:
def target_within(start, look_forward, df, target, date_time=True): #New target function for within # of days instead of @
    """Label whether CLOSE rises by at least `target` percent on ANY of the next `look_forward` rows.

    Parameters:
        start: date to evaluate; converted with ``strftime("%Y-%m-%d")`` when
            `date_time` is true.
        look_forward: number of rows after `start` to inspect.
        df: frame with ``DATE`` and ``CLOSE`` columns; the matching row's
            index label is used as a positional offset.
        target: minimum percentage increase relative to the close on `start`.
        date_time: whether `start` needs the ``strftime`` conversion.

    Returns:
        None when fewer than `look_forward` rows follow `start`, otherwise
        True or False.
    """
    if date_time:
        start = start.strftime("%Y-%m-%d")
    row_pos = df.index[df["DATE"] == start][0]
    # The window needs rows row_pos .. row_pos + look_forward inclusive. The
    # old `>` check let the row at len(df) - look_forward through with one
    # future day missing, labelling it from an incomplete window.
    if row_pos + look_forward >= len(df):
        return None
    window = df[row_pos:row_pos + look_forward + 1]
    base_close = window['CLOSE'].iloc[0]
    future_change = (window['CLOSE'].iloc[1:] - base_close) / base_close * 100
    return bool((future_change >= target).any())

def calcutron_within(stock_data, stock_symbols, csv_export, csv_name, percent_increase, days_increase): #Ditto as above
    """Add the engineered feature columns and a "within N days" TARGET, ticker by ticker.

    For every symbol the rows whose ``TICKER`` matches are copied, then:
      * RSI / average daily, weekly, monthly return / close ratio columns are
        added for the lookbacks 5, 10, 30 and 60;
      * three MACD columns and their first differences are added;
      * ``TARGET`` is filled by `target_within`.

    Parameters:
        stock_data: frame with at least ``TICKER``, ``DATE``, ``OPEN`` and
            ``CLOSE``; ``DATE`` values must support ``strftime``.
        stock_symbols: tickers to process, in output order.
        csv_export: when truthy the result is written with ``to_excel``.
        csv_name: destination path for the Excel export.
        percent_increase: threshold passed to `target_within`.
        days_increase: look-forward length passed to `target_within`.

    Returns:
        The per-ticker frames concatenated in `stock_symbols` order.
    """
    lookbacks = (5, 10, 30, 60)
    # (column prefix, feature function) - order fixes the column order, which
    # later label-based column slices depend on.
    per_lookback_features = (
        ("RSI ", RSI_calc),
        ("AVG DAILY RETURN ", daily_return),
        ("AVG WEEKLY RETURN ", weekly_return),
        ("AVG MONTHLY RETURN ", monthly_return),
        ("RATIO ", ratio_avg_to_close),
    )
    macd_pairs = ((10, 30), (5, 10), (2, 10))

    df_list = []
    for stock in stock_symbols:
        # Explicit copy: the columns below are added to this frame, not to a
        # filtered view of the caller's data.
        df = stock_data.loc[stock_data["TICKER"] == stock].copy()
        for num in lookbacks:
            for prefix, feature_func in per_lookback_features:
                df[prefix + str(num)] = df.apply(
                    lambda row: feature_func(row['DATE'], num, df, True), axis=1)
        for short_span, long_span in macd_pairs:
            df["MACD {}_{}".format(short_span, long_span)] = df.apply(
                lambda row: MACD(row['DATE'], short_span, long_span, df, True), axis=1)
        for short_span, long_span in macd_pairs:
            column = "MACD {}_{}".format(short_span, long_span)
            df[column + " DIFF"] = df[column].diff()
        df["TARGET"] = df.apply(
            lambda row: target_within(row['DATE'], days_increase, df, percent_increase, True), axis=1)
        df_list.append(df)

    stock_data = pd.concat(df_list)
    # (A bare `value_counts` expression used to sit here; inside a function its
    # result was discarded, so it has been removed.)
    if csv_export:
        stock_data.to_excel(csv_name)
    return stock_data


def model_creation_within(train_stocks, test_size, read_csv, export_csv, csv_name, days, percent_increase, days_increase, features=None, prints=False):
    """Build the "within N days" data set and train/evaluate an XGBClassifier over 100 random splits.

    Parameters:
        train_stocks: ticker symbols to download when `read_csv` is false.
        test_size: forwarded to ``train_test_split``.
        read_csv: when truthy, load the prepared data from the Excel file
            `csv_name` instead of downloading and recomputing features.
        export_csv: forwarded to `calcutron_within` (writes the Excel file).
        csv_name: Excel path used for reading or exporting.
        days: how many calendar days back from today the history starts.
        percent_increase: target threshold forwarded to `calcutron_within`.
        days_increase: look-forward length forwarded to `calcutron_within`.
        features: optional list of feature columns; None selects every column
            from 'RSI 5' through 'MACD 2_10 DIFF'.
        prints: when truthy, print the inputs, class balance and mean metrics.

    Returns:
        (model, stock_data, X_train, X_test, y_train, y_test) where the model
        and the split are those of the LAST of the 100 trials.
    """
    if read_csv:
        stock_data = pd.read_excel(csv_name, engine='openpyxl')
    else:
        start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
        # Concatenate once instead of growing a frame inside the loop.
        histories = []
        for stock in train_stocks:
            y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
            y_finance_data.insert(0, "Ticker", stock)
            histories.append(y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']])
        stock_data = pd.concat(histories)
        stock_data = stock_data.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
        stock_data = calcutron_within(stock_data, train_stocks, export_csv, csv_name, percent_increase, days_increase)

    stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
    stock_data = stock_data.dropna()
    print(stock_data)
    if features is None:
        input_patterns = stock_data.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:len(stock_data)-10]
    else:
        input_patterns = stock_data[features][60:len(stock_data)-10]

    input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int) #Why are these floats? It bothers me.
    if prints:
        print(input_patterns)
        print(stock_data['TARGET'].value_counts(normalize=True) * 100)

    accuracy_trials = []
    recall_trials = []
    precision_trials = []
    auc_trials = []

    for _ in range(100):
        X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = test_size)
        model = XGBClassifier(eval_metric="auc", use_label_encoder=False)
        model.fit(X_train, y_train)

        # Predict once per trial; AUC uses the positive-class probability
        # rather than the already-thresholded labels.
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)[:, 1]

        accuracy_trials.append(accuracy_score(y_test, y_pred) * 100.0)
        precision_trials.append(precision_score(y_test, y_pred) * 100.0)
        recall_trials.append(recall_score(y_test, y_pred) * 100.0)
        auc_trials.append(roc_auc_score(y_test, y_score) * 100)

    mean_accuracy = sum(accuracy_trials) / len(accuracy_trials)
    mean_precision = sum(precision_trials) / len(precision_trials)
    mean_recall= sum(recall_trials) / len(recall_trials)
    mean_auc = sum(auc_trials) / len(auc_trials)

    if prints:
        print("Validation Trial")
        print("Mean Accuracy: {}%".format(mean_accuracy))
        print("Mean Precision: {}%".format(mean_precision))
        print("Mean Recall: {}%".format(mean_recall))
        print("Mean AUC: {}%".format(mean_auc))
        print(model)
    return model, stock_data, X_train, X_test, y_train, y_test

def trials_within(model, threshold, test_stocks, days, export_csv, csv_name, percent_increase, days_increase, features, prints=False):
    """Evaluate a fitted model on freshly downloaded data for each test ticker ("within N days" targets).

    For every ticker the history is downloaded, features and targets are built
    with `calcutron_within`, and the model is scored twice: with its default
    decision rule and with the positive class assigned only when its
    predicted probability exceeds `threshold`.

    Parameters:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
        threshold: probability above which the positive class is assigned in
            the second report.
        test_stocks: ticker symbols to evaluate.
        days: how many calendar days back from today the history starts.
        export_csv, csv_name: accepted for signature compatibility; not used.
        percent_increase: target threshold forwarded to `calcutron_within`.
        days_increase: look-forward length forwarded to `calcutron_within`.
        features: list of feature columns, or None for every column from
            'RSI 5' through 'MACD 2_10 DIFF'.
        prints: when truthy, print both metric reports per ticker.

    Returns:
        The cleaned per-ticker frames concatenated in `test_stocks` order
        (an empty DataFrame when `test_stocks` is empty).
    """
    #Add ability to read from CSV here?
    start_date = (datetime.date.today() - datetime.timedelta(days=days)).strftime("%Y-%m-%d")
    evaluated_frames = []
    for stock in test_stocks:
        print(stock)
        y_finance_data = yf.Ticker(stock).history(start=start_date).reset_index()
        y_finance_data.insert(0, "Ticker", stock)
        stock_data_testing = y_finance_data[['Ticker', 'Date', 'Open', 'High', 'Low', 'Close']]
        stock_data_testing = stock_data_testing.rename(columns={"Ticker":"TICKER", "Date":"DATE", "Open":"OPEN", "High":"HIGH", "Low":"LOW", "Close":"CLOSE"})
        stock_data_testing = calcutron_within(stock_data_testing, [stock], False, None, percent_increase, days_increase)
        stock_data_testing = stock_data_testing.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
        stock_data_testing = stock_data_testing.dropna()

        # The slice bound used to be len(stock_data), a name that does not
        # exist in this function; it must be this ticker's own frame.
        last_row = len(stock_data_testing) - 10
        if features is None:
            input_patterns_testing = stock_data_testing.loc[:,'RSI 5':'MACD 2_10 DIFF'][60:last_row]
        else:
            input_patterns_testing = stock_data_testing[features][60:last_row]
        input_label_testing = stock_data_testing['TARGET'][60:last_row].astype(int)
        print(input_patterns_testing)

        y_score = model.predict_proba(input_patterns_testing)[:, 1]
        y_pred = model.predict(input_patterns_testing)
        y_prob_round = (y_score > threshold).astype(int)

        # AUC ranks by probability, so it is the same for both reports.
        auc_testing = roc_auc_score(input_label_testing, y_score) * 100

        accuracy_testing = accuracy_score(input_label_testing, y_pred) * 100.0
        precision_testing = precision_score(input_label_testing, y_pred) * 100.0
        recall_testing = recall_score(input_label_testing, y_pred) * 100.0

        accuracy_rounded = accuracy_score(input_label_testing, y_prob_round) * 100.0
        precision_rounded = precision_score(input_label_testing, y_prob_round) * 100.0
        recall_rounded = recall_score(input_label_testing, y_prob_round) * 100.0

        if prints:
            print(stock)
            print("Standard Threshold")
            print("Accuracy {}: {}%".format(stock, accuracy_testing))
            print("Precision {}: {}%".format(stock, precision_testing))
            print("Recall {}: {}%".format(stock, recall_testing))
            print("AUC {}: {}%".format(stock, auc_testing))
            print()
            print("Rounding Threshold: {}".format(threshold))
            print("Accuracy {}: {}%".format(stock, accuracy_rounded))
            print("Precision {}: {}%".format(stock, precision_rounded))
            print("Recall {}: {}%".format(stock, recall_rounded))
            print("AUC {}: {}%".format(stock, auc_testing))

        evaluated_frames.append(stock_data_testing)

    if not evaluated_frames:
        return pd.DataFrame()
    return pd.concat(evaluated_frames)

In [768]:
def relevant_features(model, X_train, y_train, X_test, y_test, threshold):
    """Refit `model`, plot feature importances, and report AUC/precision per importance cut-off.

    Each sorted importance value is used in turn as a ``SelectFromModel``
    threshold; a fresh XGBClassifier is trained on the surviving columns and
    scored on the test split. The sweep runs twice: first with ``predict``,
    then with the positive class assigned only when its probability exceeds
    `threshold`. Results are printed; nothing is returned.
    """
    model.fit(X_train, y_train)
    plot_importance(model)
    pyplot.show()

    def _sweep(round_probabilities):
        # One pass over every importance value, lowest first.
        for cutoff in sort(model.feature_importances_):
            selector = SelectFromModel(model, threshold=cutoff, prefit=True)
            reduced_train = selector.transform(X_train)
            candidate = XGBClassifier(use_label_encoder=False, eval_metric="auc")
            candidate.fit(reduced_train, y_train)
            reduced_test = selector.transform(X_test)

            if round_probabilities:
                labels = []
                for class_probs in candidate.predict_proba(reduced_test):
                    labels.append(1 if class_probs[1] > threshold else 0)
            else:
                labels = candidate.predict(reduced_test)

            auc_value = roc_auc_score(y_test, labels)
            precision_value = precision_score(y_test, labels)
            kept = reduced_train.shape[1]
            print("Thresh=%.3f, n=%d, AUC: %.2f%%" % (cutoff, kept, auc_value*100.0))
            print("Thresh=%.3f, n=%d, Precision: %.2f%%" % (cutoff, kept, precision_value*100.0))

    _sweep(False)
    print("-----------------ROUNDING THRESHOLD OF {}---------------------".format(threshold))
    _sweep(True)

In [769]:
def hyper_parameter(model, X_train, y_train):
    """Grid-search max_depth, min_child_weight and gamma for `model` with 5-fold CV.

    Candidates are scored by ROC AUC. The full CV results, the best parameter
    set and the best score are printed.

    Parameters:
        model: estimator passed to ``GridSearchCV``.
        X_train, y_train: data the search is fitted on.

    Returns:
        The best parameter combination found (``best_params_``).
    """
    #Hypertuning of parameters ala https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    param_test1 = {
    'max_depth':range(9,11,1), #Room for tuning here
    'min_child_weight':range(1,3,1),
    'gamma':[i/10. for i in range(0,5)]
    }

    # `iid` is no longer passed: the argument was removed from GridSearchCV
    # and supplying it raises TypeError on current scikit-learn.
    gsearch1 = GridSearchCV(estimator = model, param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
    gsearch1.fit(X_train,y_train)
    print("TEST EVERYTHING")
    print("CV Results: ", gsearch1.cv_results_)
    print("Best Params: ", gsearch1.best_params_)
    print("Best Score: ", gsearch1.best_score_)
    return gsearch1.best_params_

In [770]:
def n_estimator(model, X_train, y_train):
    """Pick ``n_estimators`` for `model` from a 5-fold ``xgb.cv`` run.

    The CV run uses the model's own booster parameters, AUC as the metric and
    early stopping after 50 rounds. The number of rows in the CV result is
    printed and written back onto `model`, which is modified in place and
    returned.
    """
    booster_params = model.get_xgb_params()
    training_matrix = xgb.DMatrix(X_train, label=y_train)
    max_rounds = model.get_params()['n_estimators']

    cv_history = xgb.cv(
        booster_params,
        training_matrix,
        num_boost_round=max_rounds,
        nfold=5,
        metrics='auc',
        early_stopping_rounds=50,
    )
    print(cv_history)

    best_rounds = cv_history.shape[0]
    print("Ideal n_estimators: ", best_rounds)
    model.set_params(n_estimators=best_rounds)
    return model

In [None]:
#'Within' testing w/ random sampling

#Getting our data, outputting to excel, getting X_train, X_test, y_train, y_test, etc.
stock_symbols = ["KR", "WMT", "KO", "NVDA", "F", "XOM", "AMD"]
stock_data, input_patterns, input_label = just_stock_data_within(stock_symbols, True, "WITHIN_STOCK.xlsx", 1800, 5, 10)
threshold=.85 #Rounding threshold

#If we have an excel file, use this:
#stock_data = pd.read_excel(csv_name, engine='openpyxl')
#stock_data = stock_data.drop(['AVG MONTHLY RETURN 5','AVG MONTHLY RETURN 10'], axis=1)
#stock_data = stock_data.dropna()
#input_patterns = stock_data[features][60:len(stock_data)-10]
#input_label = stock_data['TARGET'][60:len(stock_data)-10].astype(int) #Why are these floats? It bothers me.

X_train, X_test, y_train, y_test = train_test_split(input_patterns, input_label, test_size = .33) #Currently random splitting, look into 2yr, 1yr, 1yr of SPY
#####################

#Baseline testing
# Fit a default-parameter XGBClassifier 100 times on the training split and average
# its test-split metrics, using the labels returned by model.predict.
# NOTE(review): nothing visible here varies between trials (same split, same
# constructor arguments) — confirm the repeats actually add information.
accuracy_trials = []
recall_trials = []
precision_trials = []
auc_trials = []

for i in range(100):
    model = XGBClassifier(eval_metric="auc", use_label_encoder=False)
    model.fit(X_train, y_train)

    # Predict once per trial and reuse the result for every label-based metric.
    y_pred = model.predict(X_test)
    # ROC AUC is a ranking metric, so score it on the positive-class probability
    # instead of on hard 0/1 labels (which collapses the curve to a single point).
    y_score = model.predict_proba(X_test)[:, 1]

    accuracy_trials.append(accuracy_score(y_test, y_pred) * 100.0)
    precision_trials.append(precision_score(y_test, y_pred) * 100.0)
    recall_trials.append(recall_score(y_test, y_pred) * 100.0)
    auc_trials.append(roc_auc_score(y_test, y_score) * 100)

mean_accuracy = sum(accuracy_trials) / len(accuracy_trials)
mean_precision = sum(precision_trials) / len(precision_trials)
mean_recall= sum(recall_trials) / len(recall_trials)
mean_auc = sum(auc_trials) / len(auc_trials)

print("BASELINE TRIAL")
print("Mean Accuracy: {}%".format(mean_accuracy))
print("Mean Precision: {}%".format(mean_precision))
print("Mean Recall: {}%".format(mean_recall))
print("Mean AUC: {}%".format(mean_auc))

# Same baseline as above, but a sample is labelled positive only when the predicted
# positive-class probability exceeds `threshold`.
accuracy_trials = []
recall_trials = []
precision_trials = []
auc_trials = []

for i in range(100):
    model = XGBClassifier(eval_metric="auc", use_label_encoder=False)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    y_score = model.predict_proba(X_test)[:, 1]
    # Vectorised form of "1 if probability > threshold else 0" for every row.
    y_pred = (y_score > threshold).astype(int)

    accuracy_trials.append(accuracy_score(y_test, y_pred) * 100.0)
    precision_trials.append(precision_score(y_test, y_pred) * 100.0)
    recall_trials.append(recall_score(y_test, y_pred) * 100.0)
    # ROC AUC is computed from the probabilities, not the thresholded labels; it is
    # a ranking metric and does not depend on the chosen cutoff.
    auc_trials.append(roc_auc_score(y_test, y_score) * 100)

mean_accuracy = sum(accuracy_trials) / len(accuracy_trials)
mean_precision = sum(precision_trials) / len(precision_trials)
mean_recall= sum(recall_trials) / len(recall_trials)
mean_auc = sum(auc_trials) / len(auc_trials)

print("BASELINE TRIAL w/ THRESHOLDING")
print("Mean Accuracy: {}%".format(mean_accuracy))
print("Mean Precision: {}%".format(mean_precision))
print("Mean Recall: {}%".format(mean_recall))
print("Mean AUC: {}%".format(mean_auc))

print(model)
#####################

#Univariate plots
# Build independent copies of the train/test feature frames with the label attached
# as a 'TARGET' column; X_train / X_test themselves are left unmodified.
X_train_full = X_train.assign(TARGET=y_train)
X_test_full = X_test.assign(TARGET=y_test)
get_univariate_plots(
    data=X_train_full,
    target_col="TARGET",
    data_test=X_test_full,
)
##################

#Creating new default model
# Starting configuration for the classifier that is tuned below; n_estimators,
# gamma, max_depth and min_child_weight are later overwritten by the tuning steps.
model = XGBClassifier(**{
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
    'eval_metric': "auc",
    'use_label_encoder': False,
})
##################

#Extract relevant features, obtain new data w/ selected features
# NOTE(review): relevant_features is defined elsewhere in the notebook. SelectFromModel
# below is constructed with prefit=True, so `model` must already be fitted at that
# point — presumably relevant_features fits it in place; confirm against its definition.
relevant_features(model, X_train, y_train, X_test, y_test, threshold)
# .038 is a hard-coded feature-importance cutoff passed to SelectFromModel.
selection = SelectFromModel(model, threshold=.038, prefit=True) 
print(selection)
# Apply the same selector to both splits so train and test keep matching columns.
select_X_train = selection.transform(X_train)
select_X_test = selection.transform(X_test)
##################

#N_estimator -> hyper_parameterization -> N_estimator again
# Pass 1: choose the number of boosting rounds for the starting parameters.
model = n_estimator(model, select_X_train, y_train)

# Grid-search the tree parameters, then copy the three tuned values onto the model.
best_params = hyper_parameter(model, select_X_train, y_train)
tuned_values = {
    name: best_params[name]
    for name in ('gamma', 'max_depth', 'min_child_weight')
}
model.set_params(**tuned_values)

# Pass 2: re-run the round selection now that the tree parameters have changed.
model = n_estimator(model, select_X_train, y_train)

# Persist the tuned estimator to disk and show its final configuration.
dump(model, 'trained_model_all_features.joblib')
print(model)
##################

#Test model on selected test data
# Labels from the model's own predict() call.
y_pred = model.predict(select_X_test)
# Positive-class probability (column 1 of predict_proba): used as the ranking score
# for ROC AUC and as the input to the custom rounding threshold below.
y_score = model.predict_proba(select_X_test)[:, 1]

# ROC AUC is scored on probabilities, not on hard 0/1 labels, so that it matches the
# 'roc_auc' / 'auc' metric used during cross-validation.
auc = roc_auc_score(y_test, y_score)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("AUC: {}".format(auc*100.0))
print("Precision: {}".format(precision*100.0))
print("Accuracy: {}".format(accuracy*100.0))

print("ROUNDING THRESHOLD OF {}".format(threshold))
# Vectorised form of "1 if probability > threshold else 0" for every row.
y_pred = (y_score > threshold).astype(int)
# AUC is threshold-independent, so this repeats the value printed above; precision
# and accuracy are the metrics that change with the cutoff.
auc = roc_auc_score(y_test, y_score)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("AUC: {}".format(auc*100.0))
print("Precision: {}".format(precision*100.0))
print("Accuracy: {}".format(accuracy*100.0))
##################

#Test model on validation data
#test_stocks = ["AAPL", "X", "TGT", "INTC", "JNJ"] 
#stock_data_test = trials_within(model, .85, test_stocks, 1800, False, None, 5, 10, features=None, prints=True)
#dump(model, 'trained_model_all_features.joblib')