# Run XGBoost on all companies

In [2]:
import requests
from pprint import pprint
import mplfinance
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm
import calendar
import os
import logging
colors = sns.color_palette()
sns.set()
pd.set_option('display.max_colwidth',1000)

### Load all stocks datasets

In [60]:
stock_dfs = {}
for file in os.listdir('../daily_prices_dataset/stock_data/'):
    # read the csv file and store the stock name in a dictionary
    stock_df = pd.read_csv('../daily_prices_dataset/stock_data/'+file)
    stock_df.index = pd.to_datetime(stock_df['date'])
    stock_dfs[file[:-4]] = stock_df.drop(columns=['date', 'stock'])
stock_dfs['NFLX']

Unnamed: 0_level_0,open,high,low,close,adj_close,volume,dividend,split,30_day_MA,50_day_MA,100_day_MA,200_day_MA,4_week_high,4_week_low,10_week_high,10_week_low,52_week_high,52_week_low
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2023-02-10,359.160,362.1400,347.1400,347.36,347.36,7291096,0.0,1.0,339.236000,339.236000,293.74165,249.26820,367.96,294.95,367.96,254.66,691.69,166.37
2023-02-09,372.410,373.8300,361.7444,362.50,362.50,6901100,0.0,1.0,336.886667,336.886667,292.70435,248.52340,367.96,294.88,367.96,254.66,691.69,166.37
2023-02-08,360.020,368.1930,358.3100,366.83,366.83,6253179,0.0,1.0,334.275667,334.275667,291.48065,247.76045,367.96,291.12,367.96,254.66,691.69,166.37
2023-02-07,358.510,364.1799,354.1800,362.95,362.95,6289368,0.0,1.0,331.880167,331.880167,290.16615,247.00390,367.96,276.88,367.96,254.66,691.69,166.37
2023-02-06,363.642,368.4500,360.6800,361.48,361.48,4994942,0.0,1.0,329.706833,329.706833,288.77785,246.28025,367.96,276.88,367.96,254.66,691.69,166.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-22,534.990,541.7900,530.7900,533.78,533.78,3061747,0.0,1.0,540.053667,540.053667,516.94680,495.87430,586.34,494.25,586.34,476.62,586.34,254.59
2021-02-19,548.000,548.9900,538.8132,540.22,540.22,2841457,0.0,1.0,539.224000,539.224000,516.51550,495.37670,586.34,494.25,586.34,470.50,586.34,254.59
2021-02-18,549.000,550.0000,538.2265,548.22,548.22,2457604,0.0,1.0,537.899667,537.899667,515.94210,494.79900,586.34,494.25,586.34,470.50,586.34,254.59
2021-02-17,550.990,555.2500,543.0300,551.34,551.34,2057853,0.0,1.0,536.985667,536.985667,515.19070,494.19865,586.34,494.25,586.34,470.50,586.34,254.59


## Run XGBoost on a specific stock

In [4]:
from copy import deepcopy
from typing import Dict
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.metrics import r2_score

In [85]:
def calculate_close_price(stock: str, stock_df: pd.DataFrame) -> Dict:
    df = deepcopy(stock_df)
    # sort the dataframe by date
    df.sort_index(ascending=False, inplace=True)
    # get the first row of the dataframe
    first_row = df.iloc[0]
    # raise the all the columns up by one day so that the model only gets the daily open price and the rest of the data from yesterday
    for col in set(df.columns) - {'close', 'open'}: df[col] = df[col][1:].shift(-1)
    # shift the close column by 1
    df['close'] = df['close'][:-1]
    df = df.dropna(axis=0)

    df.index = pd.to_datetime(df.index)
    df.sort_index(ascending=False, inplace=True)
    # convert all columns to numeric except for the index
    df = df.apply(pd.to_numeric, errors='ignore')
    # drop any column that contains a date time value
    # df = df.drop(columns=[col for col in df.columns if df[col].dtype != 'float64']).dropna()
    df = df.dropna(axis=0)
    # get all times before the last month
    train_df, test_df = df[df.index <= df.index.max() - pd.DateOffset(months=2)], df[df.index >= df.index.max() - pd.DateOffset(months=2)]
    reg = xgb.XGBRegressor(n_estimators=1000, early_stopping_rounds=200, eval_metric='rmse', learning_rate=0.1, max_depth=10, subsample=0.8, colsample_bytree=0.8, random_state=42)
    # split into train and test sets
    X_train, y_train = train_df.drop(columns=['close']), train_df['close']
    X_test, y_test = test_df.drop(columns=['close']), test_df['close']
    reg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)
    # validate the model
    X_test['prediction'] = reg.predict(X_test)
    # rmse score as a percentage of the mean of the close prices
    score = np.sqrt(mean_squared_error(y_test, X_test['prediction'])) / y_test.mean()
    # R2 score
    r2 = r2_score(y_test, X_test['prediction'])
    n = len(X_test)
    p = len(X_test.columns)
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    # the predicted close price
    cur_price, prediction = first_row['close'], reg.predict(first_row.drop('close').to_numpy().reshape(1, -1))[0]
    return {'stock': stock, 'rmse': score, 'adj_r2': adj_r2, 'r2': r2, 'prediction': prediction, 'cur_price': cur_price}

calculate_close_price('AAPL', stock_dfs['AAPL'])
calculate_close_price('MSFT', stock_dfs['MSFT'])
calculate_close_price('NFLX', stock_dfs['NFLX'])

{'stock': 'NFLX',
 'rmse': 0.05779151920112351,
 'adj_r2': 0.24425138376545952,
 'r2': 0.5760434591855017,
 'prediction': 320.59186,
 'cur_price': 347.36}

In [96]:
from collections import defaultdict
daily_stock_reccomendations = defaultdict(list)

for stock, stock_df in tqdm(stock_dfs.items()):
    for key, val in calculate_close_price(stock, stock_df).items():
        daily_stock_reccomendations[key].append(val)

# sort the dataframe by the rmse score to grab the stocks with most accurate predictions
daily_stock_reccomendations = pd.DataFrame(daily_stock_reccomendations).set_index('stock').sort_values('rmse')

100%|██████████| 56/56 [00:14<00:00,  3.94it/s]


### From the top 10 best ROIs, choose the most accurately predicted stocks

### <span style="color:red">TODO Reverse the order for more conservative investing. take only stocks with positive r2 and sort them by return ratios </span>

In [100]:
# Create a new column with the ratio of the predicted price to the current price
daily_stock_reccomendations['ratio'] = daily_stock_reccomendations['prediction'] / daily_stock_reccomendations['cur_price']
# remove rows with adj_r2 more than 2
daily_stock_reccomendations = daily_stock_reccomendations[(daily_stock_reccomendations['adj_r2'] < 2) & (daily_stock_reccomendations['r2'] > 0)]
# sort the dataframe by the ratio and then by the rmse score to grab the stocks with most accurate predictions
stock_recs = daily_stock_reccomendations.sort_values('ratio', ascending=False).head(10).sort_values('rmse').head(5)
stock_recs

Unnamed: 0_level_0,rmse,adj_r2,r2,prediction,cur_price,ratio
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
INFY,0.008573,0.805801,0.891059,19.29759,19.16,1.007181
BCS,0.014227,0.93247,0.962117,9.116353,9.05,1.007332
GE,0.021875,0.543106,0.743693,83.254402,81.29,1.024165
APG,0.02426,0.78639,0.88017,21.583002,21.33,1.011861
GOOG,0.024847,0.713297,0.839167,95.884811,94.86,1.010803


## Repeat for weekly and monthly datasets

In [104]:
weekly_stock_reccomendations = defaultdict(list)

for stock, stock_df in tqdm(stock_dfs.items()):
    weekly_df = stock_df.resample('W').agg({'open': 'first', 'high': 'first', 'low': 'first', 'close': 'last', 'adj_close': 'last', 'volume': 'first', '30_day_MA': 'first', '50_day_MA': 'first', '100_day_MA': 'first', '200_day_MA': 'first', '4_week_high': 'first', '4_week_low': 'first', '10_week_high': 'first', '10_week_low': 'first', '52_week_high': 'first', '52_week_low': 'first'})
    for key, val in calculate_close_price(stock, weekly_df).items():
        weekly_stock_reccomendations[key].append(val)

# sort the dataframe by the rmse score to grab the stocks with most accurate predictions
weekly_stock_reccomendations = pd.DataFrame(weekly_stock_reccomendations).set_index('stock').sort_values('rmse')
# remove rows with adj_r2 more than 2
weekly_stock_reccomendations = weekly_stock_reccomendations[(weekly_stock_reccomendations['adj_r2'] < 2)]
# Create a new column with the ratio of the predicted price to the current price
weekly_stock_reccomendations['ratio'] = weekly_stock_reccomendations['prediction'] / weekly_stock_reccomendations['cur_price']
weekly_stock_recs = weekly_stock_reccomendations.sort_values('ratio', ascending=False).head(10).sort_values('rmse').head(5)
weekly_stock_recs

100%|██████████| 56/56 [00:10<00:00,  5.32it/s]


Unnamed: 0_level_0,rmse,adj_r2,r2,prediction,cur_price,ratio
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PFE,0.026888,1.159115,0.840885,46.06316,43.88,1.049753
HCSG,0.043855,1.853695,0.146305,13.533175,12.51,1.081789
MORN,0.046607,1.716185,0.283815,240.430603,235.74,1.019897
APG,0.049315,1.547601,0.452399,21.935059,21.33,1.028367
INTC,0.050743,1.990317,0.009683,29.592604,27.8,1.064482


In [105]:
monthly_stock_reccomendations = defaultdict(list)

for stock, stock_df in tqdm(stock_dfs.items()):
    monthly_df = stock_df.resample('M').agg({'open': 'first', 'high': 'first', 'low': 'first', 'close': 'last', 'adj_close': 'last', 'volume': 'first', '30_day_MA': 'first', '50_day_MA': 'first', '100_day_MA': 'first', '200_day_MA': 'first', '4_week_high': 'first', '4_week_low': 'first', '10_week_high': 'first', '10_week_low': 'first', '52_week_high': 'first', '52_week_low': 'first'})
    for key, val in calculate_close_price(stock, monthly_df).items():
        monthly_stock_reccomendations[key].append(val)

# sort the dataframe by the rmse score to grab the stocks with most accurate predictions
monthly_stock_reccomendations = pd.DataFrame(monthly_stock_reccomendations).set_index('stock').sort_values('rmse')
# Create a new column with the ratio of the predicted price to the current price
monthly_stock_reccomendations['ratio'] = monthly_stock_reccomendations['prediction'] / monthly_stock_reccomendations['cur_price']
# remove rows with adj_r2 more than 2
monthly_stock_reccomendations = monthly_stock_reccomendations[(monthly_stock_reccomendations['adj_r2'] < 2)]
monthly_stock_recs = monthly_stock_reccomendations.sort_values('ratio', ascending=False).head(10).sort_values('rmse').head(5)
monthly_stock_recs

100%|██████████| 56/56 [00:09<00:00,  6.05it/s]


Unnamed: 0_level_0,rmse,adj_r2,r2,prediction,cur_price,ratio
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KO,0.03309,1.530167,-2.71117,62.318073,59.62,1.045254
JNJ,0.034687,1.11846,0.170778,171.368423,162.15,1.056851
HCSG,0.064253,1.147968,-0.035773,13.071095,12.51,1.044852
PFE,0.066526,1.153913,-0.077389,45.870243,43.88,1.045356
CRWD,0.074561,1.294915,-1.064408,117.5942,108.96,1.079242


### Get all the stocks that are common for the weekly and monthly analysis

In [106]:
weekly_stock_recs = weekly_stock_reccomendations.sort_values('ratio', ascending=False).head(10).sort_values('rmse')
monthly_stock_recs = monthly_stock_reccomendations.sort_values('ratio', ascending=False).head(10).sort_values('rmse')
# get the intersection of the two dataframes
weekly_stock_recs.merge(monthly_stock_recs, how='inner', on='stock')



Unnamed: 0_level_0,rmse_x,adj_r2_x,r2_x,prediction_x,cur_price_x,ratio_x,rmse_y,adj_r2_y,r2_y,prediction_y,cur_price_y,ratio_y
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
PFE,0.026888,1.159115,0.840885,46.06316,43.88,1.049753,0.066526,1.153913,-0.077389,45.870243,43.88,1.045356
HCSG,0.043855,1.853695,0.146305,13.533175,12.51,1.081789,0.064253,1.147968,-0.035773,13.071095,12.51,1.044852
NIO,0.072563,1.810695,0.189305,11.612803,10.31,1.126363,0.082652,1.07754,0.457222,10.984979,10.31,1.065468
HBI,0.084539,1.464386,0.535614,7.692443,5.61,1.371202,0.118872,1.126088,0.117386,7.755347,5.61,1.382415
