In [1]:
!pip install xgboost
!pip install colorama==0.4.4
!pip install bayesian-optimization==1.4.0
import os
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
from bayes_opt import BayesianOptimization
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score



In [2]:
# ---------------------------------------------------------------------------
# Configuration for the XGBoost testing notebook
# ---------------------------------------------------------------------------

# Root folder that holds the stored result files.
Folder_Path = "C:\\Users\\anson\\OneDrive\\桌面\\CUQTS\\2023_03_08_FX_Parity\\CUQTS-FX-Parity-Research-Source-Code\\CUQTS_FXParity-main\\Result"

# Time frames and currencies included in the test.
tf_list = [
    "1M",
    "3M",
]
curr_list = [
    "USD", "JPY", "GBP", "CHF", "AUD",
    "CAD", "NZD", "NOK", "SEK", "EUR",
]

# An expected return above this threshold is classified as 1.
exp_return_threshold = 0

# Keyword arguments used to initialise the XGBoost classifier.
model_training_parameters = {
    'learning_rate': 0.2,
    'gamma': 0.2,
    'reg_alpha': 0.1,
}

# --- Additional choices -----------------------------------------------------
default_threshold = 0
activate_search_threshold = False

# Which parity relation to study: "PPP" or "IRP".
predict_direction = "PPP"

# Columns that may be dropped from the feature frame (initial spots, realized
# spots, ...); list as many as needed.
drop_fe_list = [
    "start_date",
    "end_date",
    "realized_spot",
]

# Maximum lag per time frame; shifts 1, 2, ..., time_lag are generated.
time_lag_dict = {
    '1M': 5,
    '3M': 1,
}

# Use the latest data to predict whether the label k periods ahead will rise.
predict_period_after = 1

In [3]:
# Read the treasury-rate CSV for every (currency, maturity) combination.
#
# `treasury_dict` is keyed "<currency>_<maturity>" (e.g. "USD_1M"). Each key
# maps to the loaded DataFrame, or to None when the file could not be loaded.

treasury_dict = {}
G_10 = curr_list
Maturity = tf_list

# Directory that holds one CSV per ticker (trailing separator included).
treasury_dir = 'C:\\Users\\anson\\OneDrive\\桌面\\CUQTS\\2023_03_08_FX_Parity\\CUQTS-FX-Parity-Research-Source-Code\\CUQTS_FXParity-main\\Treasury Rate\\'

for currency in G_10:
    for maturity in Maturity:

        # USD files are named "USGG<maturity> Index"; every other currency
        # uses "GT<currency><maturity> Govt".
        if currency == 'USD':
            ticker = 'USGG' + maturity + ' Index'
        else:
            ticker = 'GT' + currency + maturity + ' Govt'

        try:
            treasury_data_df = pd.read_csv(treasury_dir + ticker + '.csv', header=1, index_col=0)
            treasury_dict[currency + '_' + maturity] = treasury_data_df
        except Exception:
            # Best effort: a missing or unreadable file is recorded as None.
            # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate so the cell can still be interrupted.
            treasury_dict[currency + '_' + maturity] = None


In [4]:
# Variable Adjustment

# Every ordered pair of two different currencies from the currency list.
# `gen` is a generator expression, so it is exhausted after one iteration.
curr_pair_1 = curr_list
curr_pair_2 = curr_list
gen = (
    (first, second)
    for first in curr_pair_1
    for second in curr_pair_2
    if first != second
)

In [5]:
def feature_engine(result_df, time_lag):
    """Append lagged copies of every existing column to ``result_df``.

    For each column ``c`` present on entry and each lag ``k`` in
    ``1..time_lag`` a column named ``"{c}_shift{k}"`` holding
    ``result_df[c].shift(k)`` is added.

    The frame is modified in place and the same object is returned.
    """
    # Snapshot the column labels first so the newly added lag columns are not
    # lagged themselves.
    base_columns = result_df.columns
    lags = range(1, time_lag + 1)

    for name in base_columns:
        for lag in lags:
            lagged = result_df[name].shift(lag)
            result_df[f"{name}_shift{lag}"] = lagged

    return result_df

def label_result(result_df, threshold = default_threshold):
    """Add a binary ``label`` column derived from ``realized_ret``.

    A row is marked 1 when its ``realized_ret`` is strictly greater than
    ``threshold`` and 0 otherwise. The column is then shifted by
    ``-predict_period_after`` rows, so each row carries the label of a later
    row; the trailing rows are left as NaN by the shift.

    The frame is modified in place and returned.
    """
    # Other labelling schemes can be added here.
    above_threshold = result_df["realized_ret"] > threshold
    result_df["label"] = np.where(above_threshold, 1, 0)

    shifted_label = result_df["label"].shift(-predict_period_after)
    result_df["label"] = shifted_label

    return result_df

def get_best_params(result_df):
    """Tune ``learning_rate`` and ``max_depth`` of an XGBoost classifier.

    Rows containing NaN are dropped, the ``label`` column is used as the
    target and the remaining columns as features. A Bayesian optimiser
    (5 initial points, 10 iterations) maximises the F1 score measured on the
    held-out split.

    Returns the best parameter dict with ``max_depth`` cast to ``int``.
    """
    clean_df = result_df.dropna()

    # Separate features from the target and hold out a test split.
    features = clean_df.drop(columns = ["label"])
    target = clean_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(features, target)

    def xgb_cv(learning_rate, max_depth):
        # The optimiser proposes floats; tree depth must be an integer.
        classifier = xgb.XGBClassifier(
            learning_rate=learning_rate,
            max_depth=int(max_depth),
            objective='binary:logistic',
        )
        classifier.fit(X_train, y_train)
        positive_proba = classifier.predict_proba(X_test)[:, 1]
        predicted = (positive_proba > 0.5).astype(int)
        return f1_score(y_test, predicted)

    # Search space for the Bayesian optimiser.
    search_space = {'learning_rate': (0.01, 1.0), 'max_depth': (1, 10)}
    optimizer = BayesianOptimization(f=xgb_cv, pbounds=search_space, random_state=42)

    # Run the optimisation loop.
    optimizer.maximize(init_points=5, n_iter=10)

    best_params = optimizer.max['params']
    best_params["max_depth"] = int(best_params["max_depth"])
    return best_params

def get_best_classification(result_df):
    """Search for a classification threshold on ``realized_ret``.

    Rows containing NaN are dropped. Reference labels are 1 where
    ``realized_ret > default_threshold`` and 0 otherwise. A Bayesian optimiser
    (5 initial points, 10 iterations) probes thresholds in
    ``[-0.005, 0.005]`` and scores each one by the F1 between the reference
    labels and ``realized_ret > threshold`` on the training split.

    Probes that score exactly 1.0 are ignored when picking the winner; if
    every probe scored 1.0 the optimiser's overall best threshold is returned
    instead.

    Returns:
        float: the selected threshold.
    """
    result_df = result_df.dropna()

    X = result_df
    Y = np.where(result_df["realized_ret"] > default_threshold, 1, 0)
    X_train, _, y_train, _ = train_test_split(X, Y)

    def optimize_cf(threshold):
        # Score the probed threshold itself. Comparing against a constant 0
        # made every probe return the same value, so the search was a no-op.
        predicted = (X_train["realized_ret"] > threshold).astype(int)
        return f1_score(y_train, predicted)

    pbounds = {'threshold': (-0.005, 0.005)}

    optimizer = BayesianOptimization(f=optimize_cf, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=5, n_iter=10)

    # Prefer the best probe whose score is not a perfect 1.0.
    filtered_results = [result for result in optimizer.res if result['target'] != 1.0]
    if not filtered_results:
        # Nothing left after filtering: max() on an empty list would raise
        # ValueError, so fall back to the optimiser's overall best.
        return optimizer.max['params']["threshold"]

    best_result = max(filtered_results, key=lambda result: result['target'])
    return best_result['params']["threshold"]

def trainXGModel(result_df, best_params):
    """Fit an XGBoost classifier on ``result_df`` and score it on a hold-out split.

    Args:
        result_df: frame with a ``label`` column (the target); every other
            column is used as a feature. Rows containing NaN are dropped.
        best_params: keyword arguments for ``xgb.XGBClassifier``. When empty
            or ``None`` the notebook-level ``model_training_parameters`` are
            used instead.

    Returns:
        tuple: ``(accuracy, feature_importances, y_pred)`` where
        ``feature_importances`` maps each feature column to its importance and
        ``y_pred`` is a DataFrame of predictions indexed like the test labels.
    """
    result_df = result_df.dropna()

    # Separate features from the target and hold out 20% for testing.
    # NOTE(review): no random_state is passed, so the split (and therefore the
    # reported accuracy) is presumably not reproducible between runs.
    X = result_df.drop(columns = ["label"])
    Y = result_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    # Honour the caller-supplied parameters; previously this argument was
    # ignored and the global defaults were always used.
    params = best_params if best_params else model_training_parameters
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Feature importances as a {column: importance} dict.
    tmp_df = pd.DataFrame(model.feature_importances_.reshape(1, -1), columns=X.columns)
    obj = tmp_df.to_dict(orient='records')[0]

    # Predict on the hold-out split and score it.
    y_pred = pd.DataFrame(model.predict(X_test), index=y_test.index)
    accuracy = accuracy_score(y_test, y_pred)

    return (accuracy, obj, y_pred)

In [14]:
def feature_importance_study(feature_set, result_df, time_lag):
    """Measure how well each single (lagged) feature predicts the label.

    The columns in ``feature_set`` are taken from ``result_df``, re-indexed by
    ``end_date`` and expanded with lags ``1..time_lag`` via ``feature_engine``.
    The label is 1 where ``realized_ret > default_threshold`` (0 otherwise),
    shifted by ``-predict_period_after`` rows. One classifier is trained per
    feature column via ``trainXGModel``.

    The un-lagged ``realized_ret`` column is skipped; its lagged versions are
    still evaluated.

    Returns:
        dict: feature column name -> accuracy returned by ``trainXGModel``.
    """
    result_truncated_df = pd.DataFrame(result_df[feature_set])
    result_truncated_df.index = result_df["end_date"]
    result_truncated_df = feature_engine(result_truncated_df, time_lag)

    # Capture the candidate features before the label column is added.
    possible_feature_set = result_truncated_df.columns

    # Use the shared notebook threshold rather than a hard-coded 0 so this
    # study labels rows the same way as label_result().
    result_truncated_df['label'] = np.where(result_df["realized_ret"] > default_threshold, 1, 0)
    result_truncated_df["label"] = result_truncated_df["label"].shift(-predict_period_after)
    result_truncated_df = result_truncated_df.dropna()

    accuracy_dict = {}

    for feature in possible_feature_set:
        if feature == 'realized_ret':
            continue
        selected_feature_df = pd.DataFrame(result_truncated_df[[feature, 'label']])
        # Only the accuracy is needed; importances and predictions are discarded.
        accuracy, _, _ = trainXGModel(selected_feature_df, model_training_parameters)
        accuracy_dict[feature] = accuracy

    return accuracy_dict


# Feature Selection

In [15]:
# Base columns to study; each one is evaluated on its own.
feature_set = [
    'spread',
    'base_Econ',
    'pricing_Econ',
    'realized_ret',
]

In [19]:
# Run the single-feature accuracy study for every ordered currency pair and
# every time frame, then average the accuracies per feature.
gen = ((x, y) for x in curr_pair_1 for y in curr_pair_2 if x != y)

# feature name -> list of accuracies, one entry per (pair, time frame) that ran.
accuracy_1M_dict = {}
accuracy_3M_dict = {}

# (c1, c2, tf, error) for every combination that could not be processed.
skipped_runs = []

# Folder with one CPI CSV per currency (used for the PPP direction).
cpi_dir = 'C:\\Users\\anson\\OneDrive\\桌面\\CUQTS\\2023_03_08_FX_Parity\\CUQTS-FX-Parity-Research-Source-Code\\CUQTS_FXParity-main\\Data\\CPI'

for c1, c2 in gen:
    for tf in tf_list:
        time_lag = time_lag_dict[tf]
        try:
            # 1. Load the stored result file for this pair and time frame.
            result_df = pd.read_csv(f'{Folder_Path}\\{predict_direction}\\{c1 + c2}\\Data Details\\{tf}.csv', index_col=0)

            # 2. Attach the economic series of both currencies
            #    (c1 -> base_Econ, c2 -> pricing_Econ).
            econ_columns = (('base_Econ', c1), ('pricing_Econ', c2))

            if predict_direction == "PPP":
                result_df.index = pd.to_datetime(result_df.index)

                for column, currency in econ_columns:
                    cpi_df = pd.read_csv(f'{cpi_dir}\\{currency}.csv', index_col=0, header = 1)
                    cpi_df.index = pd.to_datetime(cpi_df.index)
                    result_df[column] = cpi_df["Last_Price"].resample(tf).last()

            else:
                result_df.index = pd.to_datetime(result_df['start_date'])

                for column, currency in econ_columns:
                    # The entry is None when the treasury file failed to load;
                    # subscripting it then raises and the run is skipped below.
                    rates_df = treasury_dict[currency + '_' + tf]
                    result_df[column] = rates_df["Last_Price"].loc[result_df['start_date']]

            result_df.index = result_df["end_date"]

            # 3. Evaluate each base feature on its own and collect the
            #    accuracies; anything that is not '1M' goes to the 3M bucket.
            target_dict = accuracy_1M_dict if tf == '1M' else accuracy_3M_dict

            for feature in feature_set:
                curr_accuracy_dict = feature_importance_study([feature], result_df, time_lag)

                for key, accuracy in curr_accuracy_dict.items():
                    target_dict.setdefault(key, []).append(accuracy)

        except Exception as exc:
            # Best effort: skip combinations that fail, but keep a record of
            # them instead of hiding the error. Catching `Exception` (not a
            # bare `except`) keeps the cell interruptible.
            skipped_runs.append((c1, c2, tf, repr(exc)))
            continue

if skipped_runs:
    print(f"Skipped {len(skipped_runs)} (pair, time frame) combinations; see `skipped_runs` for details.")

# Mean accuracy per feature, one frame per time frame.
accuracy_1M_df = pd.DataFrame(
    {'accuracy': [np.mean(scores) for scores in accuracy_1M_dict.values()]},
    index=list(accuracy_1M_dict.keys()),
)
accuracy_3M_df = pd.DataFrame(
    {'accuracy': [np.mean(scores) for scores in accuracy_3M_dict.values()]},
    index=list(accuracy_3M_dict.keys()),
)
                

In [20]:
# Features ranked by mean accuracy, best first.
accuracy_1M_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy
base_Econ_shift2,0.518902
pricing_Econ_shift1,0.517223
pricing_Econ_shift2,0.517003
base_Econ_shift4,0.51301
base_Econ,0.510666
realized_ret_shift3,0.510346
pricing_Econ_shift4,0.509306
spread_shift2,0.507615
realized_ret_shift1,0.507262
spread_shift3,0.502346


In [21]:
# Features ranked by mean accuracy, best first.
accuracy_3M_df.sort_values(by='accuracy', ascending=False)

Unnamed: 0,accuracy
spread_shift1,0.531607
pricing_Econ_shift1,0.525926
base_Econ,0.505905
base_Econ_shift1,0.502236
spread,0.502181
pricing_Econ,0.496296
realized_ret_shift1,0.491358
