In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import pandas as pd
import numpy as np
import os
import xgboost as xgb
import matplotlib.pyplot as plt

from ta import add_all_ta_features
from ta.utils import dropna
from pathlib import Path
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

from scipy.stats import uniform, randint

In [None]:
# Root folder of the hackathon datasets, and its "featured" sub-folder that the
# later cells join with a market name (e.g. "jse") to read and write CSVs.
turbo_path = "C:/_repos/hackaton-turbo/datasets/"
featured_path = turbo_path + "featured/"

In [None]:
def percentages_moves(df, column_old, column_new):
    """Add a column holding each row's move relative to the previous row.

    For every row after the first the value is
    ``100 - current * 100 / previous``. For positive prices this is the
    percentage change with an inverted sign: a rise gives a negative number
    and a fall a positive one. The first row has no predecessor and is set
    to 0.

    The computation is positional (``shift``), so it works for any index,
    not only the default 0..n-1 one, and it avoids a Python-level loop over
    the rows.

    Args:
        df: DataFrame to modify in place.
        column_old: Name of the existing numeric column to read.
        column_new: Name of the column to create (or overwrite).

    Returns:
        None. ``df`` gains/updates ``column_new``.
    """
    prices = df[column_old]
    # Same arithmetic order as the scalar formula so results are bit-identical.
    moves = 100 - prices * 100 / prices.shift(1)
    if len(moves) > 0:
        # Only the first row is forced to 0; NaNs produced by missing prices
        # further down are deliberately left as NaN.
        moves.iloc[0] = 0
    df[column_new] = moves
            
            
def future_price(df, column_old, column_new):
    """Add a column holding the next row's value of ``column_old``.

    Each row receives the value found one row below it. The last row has no
    successor, so it receives its own value.

    The lookup is positional (``shift``), so it works for any index, not
    only the default 0..n-1 one, and it avoids a Python-level loop over the
    rows.

    Args:
        df: DataFrame to modify in place.
        column_old: Name of the existing column to read.
        column_new: Name of the column to create (or overwrite).

    Returns:
        None. ``df`` gains/updates ``column_new``.
    """
    prices = df[column_old]
    future = prices.shift(-1)
    if len(future) > 0:
        # No "next" row exists for the last one: fall back to its own value.
        future.iloc[-1] = prices.iloc[-1]
    df[column_new] = future

In [None]:
# Correlation table used by the feature-merging cell, which reads its
# 'JSE_STOCK', 'TARGET_MARKET' and 'TARGET_STOCK' columns.
top_corr_csv = "C:/_repos/hackaton-turbo/datasets/jse-percent-correlation/adjusted_price_percent_correlation_top_10.csv"
top_df = pd.read_csv(filepath_or_buffer=top_corr_csv)
#top_df

In [None]:
# create percentage moves and future prices
# Every per-stock CSV of every market is rewritten in place with three derived
# columns: 'Adj Close Percent', 'Future Price' and 'DayOfWeek'.
stocks_markets = ['jse', 'asx', 'jpx', 'ssx']
for sm in stocks_markets:
    print(sm)
    sm_path = os.path.join(featured_path, sm)
    # sorted() makes the processing order reproducible across machines.
    for f in sorted(os.listdir(sm_path)):
        f_path = os.path.join(sm_path, f)
        # os.listdir returns every directory entry; skip sub-directories and
        # stray files (e.g. Jupyter's .ipynb_checkpoints) that pd.read_csv
        # cannot parse.
        if not (f.lower().endswith('.csv') and os.path.isfile(f_path)):
            continue
        print(f)
        df = pd.read_csv(f_path)
        percentages_moves(df, 'Adj Close', 'Adj Close Percent')
        future_price(df, 'Adj Close', 'Future Price')
        # Monday=0 ... Sunday=6 (pandas dt.dayofweek convention).
        df['DayOfWeek'] = pd.to_datetime(df['Date']).dt.dayofweek
        # Overwrites the source file; index=False keeps the column layout stable.
        df.to_csv(f_path, index=False)

In [None]:
# add foreign markets features
def add_features(row_tuple, js_df):
    """Merge one foreign stock's 'Adj Close Percent' series into ``js_df``.

    Args:
        row_tuple: ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
            ``row`` must provide 'TARGET_MARKET' and 'TARGET_STOCK'.
        js_df: DataFrame of the JSE stock; must contain a 'Date' column.

    Returns:
        A DataFrame with the extra column
        ``'<TARGET_MARKET>_<TARGET_STOCK>_Close_percent'``. The merge is an
        inner join on 'Date', so dates missing from the foreign file are
        dropped. If the column is already present, ``js_df`` is returned
        unchanged so that re-running the cell does not create '_x'/'_y'
        duplicates.
    """
    row = row_tuple[1]
    sm = row['TARGET_MARKET']
    tst = row['TARGET_STOCK']
    feature_col = f'{sm}_{tst}_Close_percent'
    if feature_col in js_df.columns:
        # Already merged by a previous run (or a duplicate correlation row).
        return js_df
    f_sm_path = os.path.join(featured_path, sm, f'{tst}.csv')
    f_sm_df = pd.read_csv(f_sm_path, usecols=['Date', 'Adj Close Percent'])
    f_sm_df = f_sm_df.rename(columns={'Adj Close Percent': feature_col})
    return js_df.merge(f_sm_df, on='Date', how='inner')


# `sm_path` is kept as an alias of the same JSE folder.
jse_market_path = sm_path = os.path.join(featured_path, 'jse')

# sorted() gives a reproducible processing order.
for jse in sorted(os.listdir(jse_market_path)):
    jse_path = os.path.join(jse_market_path, jse)
    # Skip sub-directories and non-CSV entries (e.g. .ipynb_checkpoints).
    if not (jse.lower().endswith('.csv') and os.path.isfile(jse_path)):
        continue
    # The file name without extension is matched against 'JSE_STOCK'.
    jse_name = Path(jse_path).stem

    js_df = pd.read_csv(jse_path)

    jse_top_corr_df = top_df[top_df['JSE_STOCK'] == jse_name]

    for row in jse_top_corr_df.iterrows():
        js_df = add_features(row, js_df)

    # Overwrites the JSE file with the enriched frame.
    js_df.to_csv(jse_path, index=False)

In [None]:
# TODO: add sentiment analysis features (not implemented; the commented-out loop below only re-saves each JSE file)
#jse_market_path = sm_path = os.path.join(featured_path, 'jse')

#for jse in os.listdir(jse_market_path):
    #jse_path = os.path.join(jse_market_path, jse)
    #js_df = pd.read_csv(jse_path)
    #js_df.to_csv(jse_path, index=False) 


In [None]:
# create and save model
# Fit an XGBoost regressor that predicts 'Future Price' for one JSE stock and
# report its RMSE on the last time-ordered hold-out split.
df = pd.read_csv("C:/_repos/hackaton-turbo/datasets/featured/jse/DCP.JO_2021-09-15_2023-09-15.csv")

# Everything except the date, the raw OHLCV columns and the target is a feature.
non_feature_cols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Future Price']
X = df.drop(columns=non_feature_cols)
y = df['Future Price']

tss = TimeSeriesSplit(n_splits=3)

# Consume the splitter and keep only its final (train, test) index pair.
*_, (train_index, test_index) = tss.split(X)
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Hyper-parameter set kept for reference; it is not used by the model below.
params = {
    'colsample_bytree': 0.8585110363891627,
    'gamma': 0.218167915803278,
    'learning_rate': 0.2706327747080202,
    'max_depth': 2,
    'n_estimators': 131,
    'subsample': 0.8224020154570338,
    'objective': "reg:squarederror",
    'random_state': 42,
    'seed': 233,
}
#xgb_model = xgb.XGBRegressor(**params)

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, seed=233)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % rmse)

# Tall, narrow figure for the importance plot (applies to later plots too,
# since rcParams is global).
plt.rcParams['figure.figsize'] = (5, 50)

xgb.plot_importance(xgb_model)
#xgb_model.save_model("model.json")

In [None]:
# tuning hyperparameters
def report_best_scores(results, n_top=3):
    """Print the candidates ranked 1..``n_top`` from a CV results mapping.

    Args:
        results: Mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params' (the call site passes
            ``search.cv_results_``). 'rank_test_score' must support
            element-wise ``==`` (e.g. a NumPy array).
        n_top: Highest rank to report; ties at a rank are all printed.

    Returns:
        None. For each reported candidate four lines are printed: rank,
        mean/std validation score, parameters, and an empty line.
    """
    for rank in range(1, n_top + 1):
        tied_positions = np.flatnonzero(results['rank_test_score'] == rank)
        for pos in tied_positions:
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][pos]}")
            print("")

# Randomised hyper-parameter search for the 'Future Price' regressor of one
# JSE stock, run on the training part of the last time-ordered split.
df = pd.read_csv("C:/_repos/hackaton-turbo/datasets/featured/jse/DCP.JO_2021-09-15_2023-09-15.csv")
X = df.drop(labels=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Future Price'], axis=1)

y = df['Future Price']
tss = TimeSeriesSplit(n_splits = 3)

# The loop only serves to keep the last (largest-train) split.
for train_index, test_index in tss.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Starting configuration of the estimator. It must be unpacked with ** so each
# entry becomes a keyword argument; passing the dict positionally does not set
# these hyper-parameters.
base_params = {'colsample_bytree': 0.8585110363891627, 'gamma': 0.218167915803278, 'learning_rate': 0.2706327747080202, 'max_depth': 2, 'n_estimators': 131, 'subsample': 0.8224020154570338, 'objective':"reg:squarederror", 'random_state': 42}
xgb_model = xgb.XGBRegressor(**base_params)

# Search space; scipy's uniform(loc, scale) samples from [loc, loc + scale].
params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

# cv=tss keeps every validation fold strictly after its training fold. An
# integer cv would use unshuffled K-fold, which trains on observations that
# come later in time than the ones it is validated on.
search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=tss, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train, y_train)

report_best_scores(search.cv_results_, 1)