## Import Libraries 


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# import plotly.express as px
# from plotly.subplots import make_subplots
from Standardization import metric_normalizer
from sklearn.model_selection import train_test_split
from IPython.display import display, HTML
pd.set_option('display.max_columns', None)
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline 




from sklearn.metrics import mean_absolute_error


from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor


import ipywidgets as wid
import random

import numpy as np
from IPython.display import display


In [2]:
# Print the installed scikit-learn version so the environment used for this run
# is recorded in the notebook output.
import sklearn
print(sklearn.__version__)

1.4.2


## Read Data 




In [3]:
# Location of the raw model-data export (absolute path on the author's machine).
MODEL_DATA_FILE = '/home/clivence/base_jupyter/Datadump/Model_Data/GU_Model_Data_V1_16k'

# Load the export into the working frame that every later cell builds on.
df = pd.read_csv(filepath_or_buffer=MODEL_DATA_FILE)

# df.head(50)

## Data Cleaning 

In [4]:
# Keep only rows whose '2D Low in Pips' is strictly positive.
# NOTE(review): the original filter OR-ed this test with an identical copy of
# itself. The second clause was presumably meant to test another column
# (e.g. '2D High in Pips') - confirm the intent. Dropping the duplicate clause
# selects exactly the same rows as before.
# TODO: research why non-positive values occur in the first place.
df = df[df['2D Low in Pips'] > 0]

# Restrict the data to the single action / ticker being modelled.
df = df[df['Action'] == 'Ultimate Action']
df = df[df['Ticker'] == 'GBP/USD']

# Replace missing values with 0. The non in-place form avoids mutating a
# filtered slice (chained-assignment warnings) and keeps the cell re-runnable.
df = df.fillna(0)

# Standardize the dataset with the project helper from Standardization.py.
df = metric_normalizer(df)

# Take an explicit copy before adding columns so the assignments below never
# write into a view of an earlier frame.
df = df.copy()

# Parse the trade date so the .dt accessors below are available.
df['Date'] = pd.to_datetime(df['Date'])

# Date-derived features.
df['Trade_Week_Year'] = df['Date'].dt.isocalendar().week    # ISO week number of the year
df['Trade_Week_Month'] = (df['Date'].dt.day - 1) // 7 + 1   # days 1-7 -> 1, days 8-14 -> 2, ...
df['Trade_Day_Week'] = df['Date'].dt.weekday + 1            # Monday=1 ... Sunday=7

# Status of the previous row's trade; the first row has no predecessor and gets 0.
df['Previous_Trade_Status'] = df['2D Trade Status'].shift(fill_value=0)

### Unwanted for now 

## Preprocessing 


In [21]:
# Feature subset used by every model in this notebook.
X= df[['open', 'high', 'low', 'close', 'volume','Trade_Week_Year','Trade_Week_Month','Trade_Day_Week',
       'Day','Month','25EMA', '50EMA', '75EMA','100EMA', '125EMA',
        'Trend Status', 'Spread','5075 Trend Status', '75100 Trend Status',
       '100125 Trend Status', 'Order Type', 
        '%K', '%D', 'k_group', 'ADX', 'ADXR', 'slowk','slowd','CDL2CROWS', 'CDL3BLACKCROWS',
        'CDL3INSIDE', 'CDL3LINESTRIKE', 'CDL3OUTSIDE',
        'CDL3STARSINSOUTH', 'CDL3WHITESOLDIERS', 'CDLABANDONEDBABY', 'CDLADVANCEBLOCK',
        'CDLBELTHOLD', 'CDLBREAKAWAY', 'CDLCLOSINGMARUBOZU', 'CDLCONCEALBABYSWALL',
        'CDLCOUNTERATTACK', 'CDLDARKCLOUDCOVER', 'CDLDOJI', 'CDLDOJISTAR', 'CDLDRAGONFLYDOJI',
        'CDLENGULFING', 'CDLEVENINGDOJISTAR', 'CDLEVENINGSTAR', 'CDLGAPSIDESIDEWHITE',
        'CDLGRAVESTONEDOJI', 'CDLHAMMER', 'CDLHANGINGMAN', 'CDLHARAMI', 'CDLHARAMICROSS',
        'CDLHIGHWAVE', 'CDLHIKKAKE', 'CDLHIKKAKEMOD', 'CDLHOMINGPIGEON', 'CDLIDENTICAL3CROWS',
        'CDLINNECK', 'CDLINVERTEDHAMMER', 'CDLKICKING', 'CDLKICKINGBYLENGTH', 'CDLLADDERBOTTOM',
        'CDLLONGLEGGEDDOJI', 'CDLLONGLINE', 'CDLMARUBOZU', 'CDLMATCHINGLOW', 'CDLMATHOLD',
        'CDLMORNINGDOJISTAR', 'CDLMORNINGSTAR', 'CDLONNECK', 'CDLPIERCING', 'CDLRICKSHAWMAN',
        'CDLRISEFALL3METHODS', 'CDLSEPARATINGLINES', 'CDLSHOOTINGSTAR', 'CDLSHORTLINE',
        'CDLSPINNINGTOP', 'CDLSTALLEDPATTERN', 'CDLSTICKSANDWICH', 'CDLTAKURI', 'CDLTASUKIGAP',
        'CDLTHRUSTING', 'CDLTRISTAR', 'CDLUNIQUE3RIVER', 'CDLUPSIDEGAP2CROWS', 'CDLXSIDEGAP3METHODS',
        'candle_bullish_score','candle_bearish_score','Previous_Trade_Status']]

# Target kept as a one-column DataFrame: later cells index it by column name
# (y_train['2D High in Pips']).
Y = df[['2D High in Pips']]


# Columns to one-hot encode
cat_cols_to_encode = ['Order Type']
# Columns to ordinal encode
cat_cols_for_ordinal_encoding = ['Trend Status','5075 Trend Status', '75100 Trend Status',
                    '100125 Trend Status','k_group']

# Columns to standard-scale
num_cols_to_scale = ['volume']

# #View row, cols count
# df.shape ,  X.shape, Y.shape

# Split into training and test sets, stratified on 'Order Type' so both splits
# keep the same mix of order types; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=.204, random_state=42, stratify=X['Order Type'])

print(f"Training Set: {X_train.shape}\
        \nTest Set: {X_test.shape}")

# Column transformer shared by the model pipelines. Every entry is a
# (transformer, columns) pair. The original one-hot entry carried a third element,
# {'prefix':'encoded'}, which make_column_transformer does not use: the fitted
# feature names shown later in this notebook are 'onehotencoder__Order Type_SELL',
# with no 'encoded' prefix. Removing it does not change the fitted transformer.
Preprocessor = make_column_transformer(
                                (StandardScaler(), num_cols_to_scale),
                                (OneHotEncoder(drop='first',sparse_output=False),cat_cols_to_encode),
                                (OrdinalEncoder(), cat_cols_for_ordinal_encoding),
                                remainder='passthrough')

# Emit pandas DataFrames (with column names) from transform().
Preprocessor.set_output(transform='pandas')

# X_train_transform = Preprocessor.fit_transform(X_train)

# X_train_transform

Training Set: (384, 92)        
Test Set: (99, 92)


##  Baseline Model 

In [None]:
# Baseline "model": predict the training-set mean of the target for every test row.
average_2d_high_pips = y_train['2D High in Pips'].mean()

# One copy of the mean per test row.
baseline_model_test_predictions = [average_2d_high_pips] * len(y_test)

# MAE of the baseline on the test set. Arguments are passed as (y_true, y_pred),
# the order used by every model cell below; MAE is symmetric, so the value is
# the same as with the original reversed order.
test_set_mae = mean_absolute_error(y_test, baseline_model_test_predictions)

# Flatten actuals, predictions and the order type of each test row into 1-D
# arrays; the plotting cells index all three with the same boolean masks.
actual = np.array(y_test).reshape(-1)
predicted = np.array(baseline_model_test_predictions).reshape(-1)
order_type = np.array(X_test['Order Type']).reshape(-1)

# Residuals: actual minus baseline prediction.
errors = actual - predicted

print(f'Base Model Prediction: {average_2d_high_pips} \nBase Model MAE: {test_set_mae}')

In [None]:
# Give every distinct order type its own colour, evenly spaced over the 'jet' colormap.
unique_order_types = np.unique(order_type)
colors = plt.cm.jet(np.linspace(0, 1, len(unique_order_types)))
color_dict = {order: colour for order, colour in zip(unique_order_types, colors)}

def plot_filtered_data(selected_order_type='All'):
    """Scatter baseline predictions against actuals, optionally for one order type.

    Reads the notebook-level arrays ``actual``, ``predicted``, ``errors`` and
    ``order_type`` plus ``unique_order_types`` / ``color_dict``.

    Args:
        selected_order_type: ``'All'`` to draw every order type in its own
            colour, or a single order type to draw only its rows. For a single
            order type the residuals are also scattered on a secondary y-axis.
    """
    show_all = selected_order_type == 'All'
    fig, axs = plt.subplots(figsize=(16, 9))

    # Either every known order type, or just the requested one.
    orders_to_draw = unique_order_types if show_all else [selected_order_type]
    for order in orders_to_draw:
        mask = order_type == order
        colour = color_dict[order]
        axs.scatter(actual[mask], predicted[mask], alpha=0.5, color=colour, label=order)
        # Vertical segment from the predicted value to the actual value at x = actual.
        axs.vlines(actual[mask], predicted[mask], actual[mask], color=colour, alpha=0.7, linewidth=0.5)

    axs.set_xlabel('Actual Values', fontsize=14, color='blue')
    axs.set_ylabel('Predicted Values', fontsize=14, color='blue')
    axs.tick_params(axis='both', which='major', labelsize=12, colors='green')
    axs.set_title('Model Prediction vs Actuals with Error Visualization', fontsize=16, color='purple')

    # Secondary y-axis for the residuals; only populated for a single order type.
    ax3 = axs.twinx()
    ax3.set_ylabel('Error', color='red', fontsize=14)
    ax3.tick_params(axis='y', labelcolor='red', labelsize=12)
    if not show_all:
        # `mask` is the selection built in the (single-iteration) loop above.
        ax3.scatter(actual[mask], errors[mask], color='red', alpha=0.5, label='Error')
    axs.legend(loc='upper left', fontsize=12)

    fig.patch.set_facecolor('white')
    plt.tight_layout()
    plt.show()

# Widget setup: a dropdown offering 'All' plus every order type found in the test
# set. Changing the selection re-runs plot_filtered_data with the chosen value.
dropdown_order_type = wid.Dropdown(options=['All'] + list(unique_order_types), value='All', description='Order Type:')
wid.interactive(plot_filtered_data, selected_order_type=dropdown_order_type)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Collect the baseline results (one row per test-set sample) in a single frame
# that the later model cells extend with their own predictions.
Model_Results = pd.DataFrame({'Actuals': actual,
                   'order_type': order_type,
                   'Base_Model_Predictions': predicted,
                   'error': errors,
                   })

# Preferred display order of the order types.
order_type_order = ['BUY', 'SELL']  # Add more order types as needed

# Known order types come first in the preferred order; any type missing from the
# list is placed after them (alphabetically) instead of raising ValueError.
unique_order_types = sorted(
    Model_Results['order_type'].unique(),
    key=lambda name: (order_type_order.index(name) if name in order_type_order else len(order_type_order),
                      str(name)))

# One subplot per order type. squeeze=False always returns a 2-D array of Axes,
# so indexing also works when there is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable must NOT be called `order_type`: that name is the notebook-level
# array of test-set order types read by plot_filtered_data and by the DataFrame
# construction above, and overwriting it with a string breaks both on re-run.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals',linewidth=2,marker='o')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs Baseline Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Baseline Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
# Per order type, average every numeric column of the results frame.
group_df = Model_Results.groupby('order_type').mean().reset_index()

# Two panels side by side.
fig, axs = plt.subplots(1,2, figsize=(20,10))

# Left panel: mean actual value and mean baseline prediction per order type.
for value_column in ('Actuals', 'Base_Model_Predictions'):
    axs[0].plot(group_df['order_type'], group_df[value_column])
axs[0].set_xlabel('Order Type')
axs[0].set_ylabel('2D High In Pips')
axs[0].set_title('Order Type by Actuals and Predictions')

# Right panel: mean baseline error per order type.
axs[1].bar(group_df['order_type'], group_df['error'])
axs[1].set_xlabel('Order Type')
axs[1].set_ylabel('2D High In Pips')
axs[1].set_title('Error by Order Type')

# Write each bar's value just above its anchor point.
for bar_label, bar_value in zip(group_df['order_type'], group_df['error']):
    axs[1].annotate(f' {bar_value}', (bar_label, bar_value), textcoords='offset points', xytext=(0,10),ha='center')

## Base Linear Regression Model

In [22]:
# Chain the shared preprocessing step with an ordinary least-squares regressor.
Base_LinReg_Model_Pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('LinReg_Model', LinearRegression()),
])

# View Pipeline
# Base_RF_Model_Pipeline

# Fit on the training split, then predict the held-out test split.
Base_LinReg_Model_Predictions = Base_LinReg_Model_Pipeline.fit(X_train, y_train).predict(X_test)

# Mean absolute error of the linear regression model on the test split.
LinReg_Model_MAE = mean_absolute_error(y_true=y_test, y_pred=Base_LinReg_Model_Predictions)

#### Linear Regression Feature Importance 

In [53]:
# Names of the columns produced by the preprocessing part of the pipeline
# (every step except the final estimator).
Features_Names = Base_LinReg_Model_Pipeline[:-1].get_feature_names_out()

# Read the coefficients from the fitted estimator inside the pipeline. The
# original referenced `lin_reg_model`, which is never defined in this notebook
# and only worked through leftover kernel state.
Coefficients = Base_LinReg_Model_Pipeline.named_steps['LinReg_Model'].coef_

# coef_ is 2-D (1, n_features) when the target was passed as a one-column frame
# and 1-D otherwise; np.ravel gives one coefficient per feature in both cases.
Coefficients_DF = pd.DataFrame({'Feature': Features_Names, 'Coefficient': np.ravel(Coefficients)})
# Set display options
pd.set_option('display.float_format', '{:.2f}'.format)  # Format to 2 decimal places
Coefficients_DF = Coefficients_DF.sort_values(by='Coefficient', ascending=False)

# NOTE(review): the recorded output shows coefficients around 1e13, which looks
# like a numerically unstable fit (possibly collinear or constant columns) - confirm
# before reading these values as feature importance.
Coefficients_DF.head(5)

Unnamed: 0,Feature,Coefficient
5,ordinalencoder__100125 Trend Status,13162531387529.45
1,onehotencoder__Order Type_SELL,9916017659509.9
43,remainder__CDLDOJI,7369446232371.96
70,remainder__CDLMORNINGDOJISTAR,3645260902277.39
80,remainder__CDLSTALLEDPATTERN,3645260902276.63


In [None]:
# Features whose coefficient lies strictly between -5 and 5.
# Fixes two errors in the original line: the frame is named `Coefficients_DF`
# (not `Coefficients_Df`), and boolean Series cannot be combined with Python
# `and`, which raises "truth value of a Series is ambiguous".
potential_noisy_features = Coefficients_DF[Coefficients_DF['Coefficient'].abs() < 5]


In [None]:
# MAE of every model trained so far, for side-by-side comparison.
# The original carried a third placeholder entry ('' with round(float()) == 0)
# that was plotted as if it were a model with an MAE of 0; it is removed.
# All values are rounded to 2 decimals (the LinReg value was rounded to an integer).
MAE_Dict = {
    'Metrics' : ['Base Model Mae','LinReg Base Model MAE'],
    'Value':[round(float(test_set_mae),2), round(float(LinReg_Model_MAE),2)]
}

MAE_DF = pd.DataFrame(MAE_Dict)

# Worst (largest MAE) first.
MAE_DF = MAE_DF.sort_values('Value', ascending=False)

# MAE_DF.plot()
fig, ax = plt.subplots(figsize=(12,6))
# Draw on the explicit axes created above.
MAE_DF.plot(kind='line', x='Metrics', y='Value', ax=ax,marker='o')
ax.set_title('Linear Regression Model MAE Comparison')
ax.set_ylabel('MAE Value')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Add the linear regression predictions and its overall MAE to the results frame.
# np.ravel flattens the (n, 1) array LinearRegression returns for a one-column
# target, so the new column is always built from a plain 1-D vector.
Model_Results = Model_Results.assign(Base_LinReg_Model_Predictions=np.ravel(Base_LinReg_Model_Predictions),
                                     LinReg_Model_MAE=LinReg_Model_MAE)

unique_order_types = Model_Results['order_type'].unique()

# One subplot per order type; squeeze=False keeps `ax` indexable even when there
# is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable is deliberately not named `order_type`: that name is the
# notebook-level array of test-set order types used by earlier cells.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals')
    ax[i].plot(df_subset['Base_LinReg_Model_Predictions'], label='Base_LinReg_Model_Predictions')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    # Title previously said "Random Forest" (copy-paste); this cell plots LinReg.
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs Linear Regression Model Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Linear Regression and Baseline Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
# Per order type, average every numeric column of the results frame.
group_df = Model_Results.groupby('order_type').mean().reset_index()

# Mean signed error (actual - predicted) of the linear regression model per
# order type. The mean of the differences equals the difference of the means,
# so it can be taken from the grouped frame. The original plotted the `error`
# column here, which holds the BASELINE model's residuals.
linreg_mean_error = group_df['Actuals'] - group_df['Base_LinReg_Model_Predictions']

# Two panels side by side.
fig, axs = plt.subplots(1,2, figsize=(20,10))

# Left panel: mean actual value and mean LinReg prediction per order type.
axs[0].plot(group_df['order_type'], group_df['Actuals'], label='Actuals')
axs[0].plot(group_df['order_type'], group_df['Base_LinReg_Model_Predictions'], label='Base_LinReg_Model_Predictions')
axs[0].set_xlabel('Order Type')
axs[0].set_ylabel('2D High In Pips')
axs[0].set_title('Order Type by Actuals and Predictions')
axs[0].legend()

# Right panel: mean LinReg error per order type.
axs[1].bar(group_df['order_type'], linreg_mean_error)
axs[1].set_xlabel('Order Type')
axs[1].set_ylabel('2D High In Pips')
axs[1].set_title('Error by Order Type')

# Label every bar with the value it actually shows.
for bar_label, bar_value in zip(group_df['order_type'], linreg_mean_error):
    axs[1].annotate(f' {bar_value:.2f}', (bar_label, bar_value), textcoords='offset points', xytext=(0,10),ha='center')

## Base Random Forest Model 

In [59]:
# Rebuild the shared preprocessing step with explicitly named transformers
# ('scaler', 'onehot', 'ordinal'); these names become the prefixes of the output
# feature names. This replaces the Preprocessor defined in the Preprocessing
# section for every pipeline created after this cell.
# ColumnTransformer comes from sklearn.compose and must be imported in the
# imports cell: the original notebook never imported it, so this cell only ran
# through leftover kernel state.
Preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), num_cols_to_scale),                                    # standard-scale numeric columns
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), cat_cols_to_encode),   # one-hot, first level dropped
        ('ordinal', OrdinalEncoder(), cat_cols_for_ordinal_encoding),                       # categories -> integer codes
    ],
    remainder='passthrough',   # every other column is passed through unchanged
)


In [60]:
# Pipeline: shared preprocessing followed by a random forest regressor.
# random_state pins the bootstrap / feature sampling so Restart & Run All
# reproduces the same MAE and feature importances (42 matches the split seed).
Base_RF_Model_Pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('RF_Model',RandomForestRegressor(random_state=42))])

# View Pipeline
# Base_RF_Model_Pipeline

# Fit on the training split. y_train is a one-column DataFrame; it is flattened
# to 1-D because the forest expects a 1-D target. The stray
# "return fit_method(estimator, *args, **kwargs)" output recorded under this cell
# looks like the tail of the resulting sklearn warning.
Base_RF_Model_Pipeline.fit(X_train, np.ravel(y_train))

# Predict the held-out test split.
Base_RF_Model_Predictions = Base_RF_Model_Pipeline.predict(X_test)

# Mean absolute error of the random forest model on the test split.
RF_Model_MAE = mean_absolute_error(y_test, Base_RF_Model_Predictions)

  return fit_method(estimator, *args, **kwargs)


In [70]:
# Feature names must come from the SAME pipeline as the importances. The original
# took them from the linear regression pipeline, whose preprocessor can differ
# from the one used by the random forest.
features = Base_RF_Model_Pipeline[:-1].get_feature_names_out()

# Importances of the fitted forest, one value per transformed feature.
importances = Base_RF_Model_Pipeline.named_steps['RF_Model'].feature_importances_

feature_importance = pd.DataFrame({'Feature Name': features, 'Importance': importances})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Least important features (the frame is sorted in descending order).
feature_importance.tail(10)


Unnamed: 0,Feature Name,Importance
60,remainder__CDLINNECK,0.0
61,remainder__CDLINVERTEDHAMMER,0.0
62,remainder__CDLKICKING,0.0
63,remainder__CDLKICKINGBYLENGTH,0.0
38,remainder__CDLBREAKAWAY,0.0
68,remainder__CDLMATCHINGLOW,0.0
47,remainder__CDLEVENINGDOJISTAR,0.0
29,remainder__CDL3BLACKCROWS,0.0
42,remainder__CDLDARKCLOUDCOVER,0.0
64,remainder__CDLLADDERBOTTOM,0.0


In [None]:
# MAE of every model trained so far, for side-by-side comparison.
# The 'LinReg Base Model MAE' entry previously reported RF_Model_MAE (copy-paste
# error); it now uses LinReg_Model_MAE. All values are rounded to 2 decimals.
MAE_Dict = {
    'Metrics' : ['Base Model Mae','LinReg Base Model MAE','RF Base Model MAE'],
    'Value':[round(float(test_set_mae),2), round(float(LinReg_Model_MAE),2), round(float(RF_Model_MAE),2)]
}

MAE_DF = pd.DataFrame(MAE_Dict)

# Worst (largest MAE) first.
MAE_DF = MAE_DF.sort_values('Value', ascending=False)

# MAE_DF.plot()
fig, ax = plt.subplots(figsize=(12,6))
# Draw on the explicit axes created above.
MAE_DF.plot(kind='line', x='Metrics', y='Value', ax=ax,marker='o')
ax.set_title('Model MAE Comparison')
ax.set_ylabel('MAE Value')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-row residual of the random forest model (actual - predicted). The original
# cell used `Base_RF_Model_Error` without ever defining it, which raises
# NameError on a fresh kernel. np.ravel guarantees a 1-D prediction vector.
Base_RF_Model_Error = Model_Results['Actuals'] - np.ravel(Base_RF_Model_Predictions)

# Add the random forest predictions and residuals to the results frame.
Model_Results = Model_Results.assign(Base_RF_Model_Predictions=np.ravel(Base_RF_Model_Predictions),
                                     Base_RF_Model_Error=Base_RF_Model_Error)

unique_order_types = Model_Results['order_type'].unique()

# One subplot per order type; squeeze=False keeps `ax` indexable even when there
# is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable is deliberately not named `order_type`: that name is the
# notebook-level array of test-set order types used by earlier cells.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals')
    ax[i].plot(df_subset['Base_RF_Model_Predictions'], label='Base_RF_Model_Predictions')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs Random Forest Model Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Random Forest and Baseline Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
# Per order type, average every numeric column of the results frame.
group_df = Model_Results.groupby('order_type').mean().reset_index()

# Two panels side by side.
fig, axs = plt.subplots(1,2, figsize=(20,10))

# Left panel: mean actual value and mean random forest prediction per order type.
axs[0].plot(group_df['order_type'], group_df['Actuals'], label='Actuals')
axs[0].plot(group_df['order_type'], group_df['Base_RF_Model_Predictions'], label='Base_RF_Model_Predictions')
axs[0].set_xlabel('Order Type')
axs[0].set_ylabel('2D High In Pips')
axs[0].set_title('Order Type by Actuals and Predictions')
axs[0].legend()

# Right panel: mean random forest error per order type.
axs[1].bar(group_df['order_type'], group_df['Base_RF_Model_Error'])
axs[1].set_xlabel('Order Type')
axs[1].set_ylabel('2D High In Pips')
axs[1].set_title('Error by Order Type')

# Label every bar with the value it shows. The original took label text and
# position from the baseline `error` column, so they did not match the bars.
for bar_label, bar_value in zip(group_df['order_type'], group_df['Base_RF_Model_Error']):
    axs[1].annotate(f' {bar_value:.2f}', (bar_label, bar_value), textcoords='offset points', xytext=(0,10),ha='center')

## Base Gradient Boosting Model

In [None]:
# Pipeline: shared preprocessing followed by a gradient boosting regressor.
# random_state makes the fit reproducible across Restart & Run All
# (42 matches the seed used for the train/test split).
Base_GBR_Model_Pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('GBR_Model',GradientBoostingRegressor(learning_rate=0.01,
    n_estimators=100,max_depth=5,random_state=42))])

# View Pipeline
# Base_RF_Model_Pipeline

# Fit on the training split. y_train is a one-column DataFrame; it is flattened
# to 1-D because the estimator expects a 1-D target.
Base_GBR_Model_Pipeline.fit(X_train, np.ravel(y_train))

# Predict the held-out test split.
Base_GBR_Model_Predictions = Base_GBR_Model_Pipeline.predict(X_test)

# Mean absolute error of the gradient boosting model on the test split.
GBR_Model_MAE = mean_absolute_error(y_test, Base_GBR_Model_Predictions)

In [None]:
# Leftover IPython introspection (`??GradientBoostingRegressor` shows the class
# source). It is not part of the analysis and is not valid Python outside
# IPython, so it is disabled here; uncomment for interactive use.
# ??GradientBoostingRegressor

In [None]:
# MAE of every model trained so far, for side-by-side comparison.
# The 'LinReg Base Model MAE' entry previously reported RF_Model_MAE (copy-paste
# error); it now uses LinReg_Model_MAE. All values are rounded to 2 decimals.
MAE_Dict = {
    'Metrics' : ['Base Model Mae','LinReg Base Model MAE','RF Base Model MAE','GBR Base Model MAE'],
    'Value':[round(float(test_set_mae),2), round(float(LinReg_Model_MAE),2), round(float(RF_Model_MAE),2),
             round(float(GBR_Model_MAE),2)]
}

MAE_DF = pd.DataFrame(MAE_Dict)

# Worst (largest MAE) first.
MAE_DF = MAE_DF.sort_values('Value', ascending=False)

# MAE_DF.plot()
fig, ax = plt.subplots(figsize=(12,6))
# Draw on the explicit axes created above.
MAE_DF.plot(kind='line', x='Metrics', y='Value', ax=ax,marker='o')
ax.set_title('Base Model MAE Comparison')
ax.set_ylabel('MAE Value')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Add the gradient boosting predictions and its overall MAE to the results frame.
# np.ravel guarantees a 1-D prediction vector whatever shape predict() returned.
Model_Results = Model_Results.assign(Base_GBR_Model_Predictions=np.ravel(Base_GBR_Model_Predictions),
                                     GBR_Model_MAE=GBR_Model_MAE)

unique_order_types = Model_Results['order_type'].unique()

# One subplot per order type; squeeze=False keeps `ax` indexable even when there
# is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable is deliberately not named `order_type`: that name is the
# notebook-level array of test-set order types used by earlier cells.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals')
    ax[i].plot(df_subset['Base_LinReg_Model_Predictions'], label='Base_LinReg_Model_Predictions')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    ax[i].plot(df_subset['Base_GBR_Model_Predictions'], label='Base_GBR_Model_Predictions')
    # Title previously said "Random Forest" (copy-paste); this section is the GBR model.
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs Gradient Boosting Model Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Model Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
# Per order type, average every numeric column of the results frame.
group_df = Model_Results.groupby('order_type').mean().reset_index()

# Mean signed error (actual - predicted) of the gradient boosting model per order
# type; the mean of the differences equals the difference of the means. The
# original bars plotted the GBR_Model_MAE column, a single global value repeated
# on every row (identical bars), and labelled them with the baseline `error`.
gbr_mean_error = group_df['Actuals'] - group_df['Base_GBR_Model_Predictions']

# Two panels side by side.
fig, axs = plt.subplots(1,2, figsize=(20,10))

# Left panel: mean actual value and mean GBR prediction per order type.
axs[0].plot(group_df['order_type'], group_df['Actuals'], label='Actuals')
axs[0].plot(group_df['order_type'], group_df['Base_GBR_Model_Predictions'], label='Base_GBR_Model_Predictions')
axs[0].set_xlabel('Order Type')
axs[0].set_ylabel('2D High In Pips')
axs[0].set_title('Order Type by Actuals and Predictions')
axs[0].legend()

# Right panel: mean GBR error per order type.
axs[1].bar(group_df['order_type'], gbr_mean_error)
axs[1].set_xlabel('Order Type')
axs[1].set_ylabel('2D High In Pips')
axs[1].set_title('Error by Order Type')

# Label every bar with the value it actually shows.
for bar_label, bar_value in zip(group_df['order_type'], gbr_mean_error):
    axs[1].annotate(f' {bar_value:.2f}', (bar_label, bar_value), textcoords='offset points', xytext=(0,10),ha='center')

## Base Decision Tree Model

In [None]:
# Pipeline: shared preprocessing followed by a decision tree regressor.
# - The estimator step was named 'GBR_Model' (copy-paste from the gradient
#   boosting cell); it is now 'DT_Model' so named_steps lookups are not misleading.
# - max_features=10 makes the tree consider a random subset of features at each
#   split, so random_state is required for a reproducible fit.
Base_DT_Model_Pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('DT_Model',DecisionTreeRegressor(max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,max_features=10,random_state=42))])

# View Pipeline
# Base_RF_Model_Pipeline

# Fit on the training split.
Base_DT_Model_Pipeline.fit(X_train, y_train)

# Predict the held-out test split.
Base_DT_Model_Predictions = Base_DT_Model_Pipeline.predict(X_test)

# Mean absolute error of the decision tree model on the test split.
DT_Model_MAE = mean_absolute_error(y_test, Base_DT_Model_Predictions)

In [None]:
# MAE of every model trained so far, for side-by-side comparison.
# The 'LinReg Base Model MAE' entry previously reported RF_Model_MAE (copy-paste
# error); it now uses LinReg_Model_MAE. All values are rounded to 2 decimals.
MAE_Dict = {
    'Metrics' : ['Base Model Mae','LinReg Base Model MAE','RF Base Model MAE','GBR Base Model MAE','DT Base Model MAE'],
    'Value':[round(float(test_set_mae),2), round(float(LinReg_Model_MAE),2), round(float(RF_Model_MAE),2),
             round(float(GBR_Model_MAE),2), round(float(DT_Model_MAE),2)]
}

MAE_DF = pd.DataFrame(MAE_Dict)

# Worst (largest MAE) first.
MAE_DF = MAE_DF.sort_values('Value', ascending=False)

# MAE_DF.plot()
fig, ax = plt.subplots(figsize=(12,6))
# Draw on the explicit axes created above.
MAE_DF.plot(kind='line', x='Metrics', y='Value', ax=ax,marker='o')
ax.set_title('Base Model MAE Comparison')
ax.set_ylabel('MAE Value')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Add the decision tree predictions and its overall MAE to the results frame.
# np.ravel guarantees a 1-D prediction vector whatever shape predict() returned.
Model_Results = Model_Results.assign(Base_DT_Model_Predictions=np.ravel(Base_DT_Model_Predictions),
                                     DT_Model_MAE=DT_Model_MAE)

unique_order_types = Model_Results['order_type'].unique()

# One subplot per order type; squeeze=False keeps `ax` indexable even when there
# is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable is deliberately not named `order_type`: that name is the
# notebook-level array of test-set order types used by earlier cells.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals')
    ax[i].plot(df_subset['Base_LinReg_Model_Predictions'], label='Base_LinReg_Model_Predictions')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    ax[i].plot(df_subset['Base_GBR_Model_Predictions'], label='Base_GBR_Model_Predictions')
    ax[i].plot(df_subset['Base_DT_Model_Predictions'], label='Base_DT_Model_Predictions')
    # Title previously said "Random Forest" (copy-paste); this section is the DT model.
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs Decision Tree Model Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Model Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
# Per order type, average every numeric column of the results frame.
group_df = Model_Results.groupby('order_type').mean().reset_index()

# Mean signed error (actual - predicted) of the decision tree model per order
# type; the mean of the differences equals the difference of the means. The
# original bars plotted the DT_Model_MAE column, a single global value repeated
# on every row (identical bars), and labelled them with the baseline `error`.
dt_mean_error = group_df['Actuals'] - group_df['Base_DT_Model_Predictions']

# Two panels side by side.
fig, axs = plt.subplots(1,2, figsize=(20,10))

# Left panel: mean actual value and mean decision tree prediction per order type.
axs[0].plot(group_df['order_type'], group_df['Actuals'], label='Actuals')
axs[0].plot(group_df['order_type'], group_df['Base_DT_Model_Predictions'], label='Base_DT_Model_Predictions')
axs[0].set_xlabel('Order Type')
axs[0].set_ylabel('2D High In Pips')
axs[0].set_title('Order Type by Actuals and Predictions')
axs[0].legend()

# Right panel: mean decision tree error per order type.
axs[1].bar(group_df['order_type'], dt_mean_error)
axs[1].set_xlabel('Order Type')
axs[1].set_ylabel('2D High In Pips')
axs[1].set_title('Error by Order Type')

# Label every bar with the value it actually shows.
for bar_label, bar_value in zip(group_df['order_type'], dt_mean_error):
    axs[1].annotate(f' {bar_value:.2f}', (bar_label, bar_value), textcoords='offset points', xytext=(0,10),ha='center')

## Base KNN Regression Model

In [None]:
# Chain the shared preprocessing step with a 10-nearest-neighbours regressor.
Base_KNNR_Model_Pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('KNNR_Model', KNeighborsRegressor(n_neighbors=10)),
])

# View Pipeline
# Base_RF_Model_Pipeline

# Fit on the training split, then predict the held-out test split.
Base_KNNR_Model_Predictions = Base_KNNR_Model_Pipeline.fit(X_train, y_train).predict(X_test)

# Mean absolute error of the KNN regression model on the test split.
KNNR_Model_MAE = mean_absolute_error(y_true=y_test, y_pred=Base_KNNR_Model_Predictions)

In [None]:
# MAE of every model trained so far, for side-by-side comparison.
# The 'LinReg Base Model MAE' entry previously reported RF_Model_MAE (copy-paste
# error); it now uses LinReg_Model_MAE. All values are rounded to 2 decimals.
MAE_Dict = {
    'Metrics' : ['Base Model Mae','LinReg Base Model MAE','RF Base Model MAE','GBR Base Model MAE',
                 'DT Base Model MAE','KNNR Base Model MAE'],
    'Value':[round(float(test_set_mae),2), round(float(LinReg_Model_MAE),2), round(float(RF_Model_MAE),2),
             round(float(GBR_Model_MAE),2), round(float(DT_Model_MAE),2), round(float(KNNR_Model_MAE),2)]
}

MAE_DF = pd.DataFrame(MAE_Dict)

# Worst (largest MAE) first.
MAE_DF = MAE_DF.sort_values('Value', ascending=False)

# MAE_DF.plot()
fig, ax = plt.subplots(figsize=(12,6))
# Draw on the explicit axes created above.
MAE_DF.plot(kind='line', x='Metrics', y='Value', ax=ax,marker='o')
ax.set_title('Base Model MAE Comparison')
ax.set_ylabel('MAE Value')
plt.show()

In [None]:
# Add the KNN regression predictions and its overall MAE to the results frame.
# np.ravel flattens the (n, 1) array KNeighborsRegressor returns for a one-column
# target, so the new column is always built from a plain 1-D vector.
Model_Results = Model_Results.assign(Base_KNNR_Model_Predictions=np.ravel(Base_KNNR_Model_Predictions),
                                     KNNR_Model_MAE=KNNR_Model_MAE)

unique_order_types = Model_Results['order_type'].unique()

# One subplot per order type; squeeze=False keeps `ax` indexable even when there
# is only a single order type.
num_order_types = len(unique_order_types)
fig, ax = plt.subplots(num_order_types, 1, figsize=(20, 6*num_order_types), squeeze=False)
ax = ax[:, 0]

# The loop variable is deliberately not named `order_type`: that name is the
# notebook-level array of test-set order types used by earlier cells.
for i, current_order_type in enumerate(unique_order_types):
    df_subset = Model_Results[Model_Results['order_type'] == current_order_type]
    ax[i].plot(df_subset['Actuals'], label='Actuals',marker='o')
    ax[i].plot(df_subset['Base_LinReg_Model_Predictions'], label='Base_LinReg_Model_Predictions')
    ax[i].plot(df_subset['Base_Model_Predictions'], label='Base_Model_Predictions')
    ax[i].plot(df_subset['Base_GBR_Model_Predictions'], label='Base_GBR_Model_Predictions',marker='*')
    ax[i].plot(df_subset['Base_DT_Model_Predictions'], label='Base_DT_Model_Predictions')
    ax[i].plot(df_subset['Base_KNNR_Model_Predictions'], label='Base_KNNR_Model_Predictions')

    # Title previously said "Random Forest" (copy-paste); this section is the KNN model.
    ax[i].set_title(f'{current_order_type} Trades: 2D High in Pips vs KNN Regression Model Predictions')
    # The series are plotted against the frame's row index, not against pips.
    ax[i].set_xlabel('Test Set Row Index')
    ax[i].set_ylabel('2D High in Pips')
    ax[i].legend()

# Add overall title
fig.suptitle('Order Types: 2D High in Pips vs Model Predictions', fontsize=18)

plt.tight_layout()
plt.show()


In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Leftover IPython introspection (`??plt` shows the module source). It is not part
# of the analysis and is not valid Python outside IPython, so it is disabled here;
# uncomment for interactive use.
# ??plt

# Follow Everything From Tutorial

In [None]:
# Shuffle every row of df_model; random_state fixes the permutation.
df_model = df_model.sample(frac=1, random_state=42)

# The first 386 shuffled rows form the training set and the remainder the test
# set; both are re-indexed from 0.
train_df = df_model[:386].reset_index(drop=True)
test_df = df_model[386:].reset_index(drop=True)

In [None]:
# Pull the target column out of each split as a NumPy array.
y_train, y_test = (
    split['remainder__2D High in Pips'].to_numpy()
    for split in (train_df, test_df)
)

# Show the two shapes as the cell output.
(y_train.shape, y_test.shape)

In [None]:

from sklearn.metrics import mean_absolute_error

# Constant baseline: the mean of the training target.
# NOTE(review): the variable name mentions "house value", but the column read
# here is 'remainder__2D High in Pips' - the name looks inherited from elsewhere.
average_median_house_value = train_df['remainder__2D High in Pips'].mean()

# Repeat that mean once per test row.
baseline_model_test_predictions = [average_median_house_value for _ in range(len(test_df))]

# MAE between the constant predictions and the test targets (cell output).
mean_absolute_error(baseline_model_test_predictions, y_test)

In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Compare the mean-value baseline predictions with the held-out targets.
actual = np.array(y_test)
predicted = np.array(baseline_model_test_predictions)
errors = actual - predicted  # signed residual (actual - predicted) per test row

fig, ax1 = plt.subplots()

# Primary (left) axis: predicted vs. actual scatter
ax1.scatter(actual, predicted, alpha=0.5, color='blue', label='Predicted vs. Actual')
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Secondary (right) axis: residuals on their own scale
ax2 = ax1.twinx()
ax2.set_ylabel('Error', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# One vectorised call draws a segment from 0 to each residual, explicitly on ax2,
# so the right-hand 'Error' ticks describe exactly what is drawn.
ax2.vlines(actual, 0, errors, color='red', alpha=0.7, linewidth=0.5)

# Optional: Add legend
ax1.legend()

ax1.set_title('Base Model Predictions vs Actuals with Error Visualization')
plt.show()


### PCA

In [None]:
train_df[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score',
'remainder__Previous_Trade_Status']].corr()

In [None]:
X_train_34 = train_df[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score',
 'remainder__Previous_Trade_Status']].to_numpy()

X_train_34.shape

In [None]:
X_test_34 = test_df[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score',
'remainder__Previous_Trade_Status']].to_numpy()

X_test_34.shape

In [None]:

# Random forest on the 34 raw features. random_state is fixed so that the MAE
# reported below is reproducible after Restart & Run All.
forest_base = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42).fit(X_train_34, y_train)
forest_base_test_predictions = forest_base.predict(X_test_34)
mean_absolute_error(y_test, forest_base_test_predictions)

In [None]:
# Plotting scatter plot
plt.scatter(y_test, forest_base_test_predictions, alpha=0.5)
plt.title('Scatter Plot of Predictions vs Actuals')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

In [None]:
# Compare the base random-forest predictions with the held-out targets.
actual = np.array(y_test)
predicted = np.array(forest_base_test_predictions)
errors = actual - predicted  # signed residual (actual - predicted) per test row

fig, ax1 = plt.subplots()

# Primary (left) axis: predicted vs. actual scatter
ax1.scatter(actual, predicted, alpha=0.5, color='blue', label='Predicted vs. Actual')
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Secondary (right) axis: residuals on their own scale
ax2 = ax1.twinx()
ax2.set_ylabel('Error', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# One vectorised call draws a segment from 0 to each residual, explicitly on ax2,
# so the right-hand 'Error' ticks describe exactly what is drawn.
ax2.vlines(actual, 0, errors, color='red', alpha=0.7, linewidth=0.5)

# Optional: Add legend
ax1.legend()

ax1.set_title('Random Forest Model Predictions vs Actuals with Error Visualization')
plt.show()

In [None]:
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline 


In [None]:
pca = PCA(n_components=32)
pca = pca.fit(X_train_34)

pca

In [None]:
X_train_pca = pca.transform(X_train_34)
X_test_pca = pca.transform(X_test_34)

X_train_pca.shape, X_test_pca.shape

In [None]:
# Get error of Random Forest training on (X_train_pca, y_train) and testing on (X_test_pca, y_test)
# random_state is fixed so this MAE is reproducible between runs.

forest_pca = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42).fit(X_train_pca, y_train)
forest_pca_test_predictions = forest_pca.predict(X_test_pca)

mean_absolute_error(y_test, forest_pca_test_predictions)

In [None]:
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

In [None]:
# Try a preprocessing transformer on the 34-feature training matrix.
# NOTE(review): sklearn's Normalizer rescales each sample (row) to unit norm rather than
# standardising each feature (column) - confirm this is the intended transform.
Scaler = Normalizer().fit(X_train_34)

X_train_34_scaled = Scaler.transform(X_train_34)

# Column 8 is 'onehotencoder__k_group_slightly_oversold' in the feature list used to build X_train_34.
plt.hist(X_train_34_scaled[:,8])

In [None]:
#Preprocess X_test 
X_test_34_scaled = Scaler.transform(X_test_34)

X_test_34_scaled.shape


In [None]:
#Get error for RF Model training with scaled data and testing on scaled data
# random_state is fixed so this MAE is reproducible between runs.

RF_Model_Scaled = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42).fit(X_train_34_scaled, y_train)

RF_Model_Scaled_Predictions =  RF_Model_Scaled.predict(X_test_34_scaled)

mean_absolute_error(y_test, RF_Model_Scaled_Predictions)

In [None]:
#Create a Pipeline of Normalizer scaling, PCA, Random Forest
# Step order: scale -> project onto 32 principal components -> regress.
# The forest's random_state is fixed so fitting the pipeline gives reproducible predictions.

Scaled_PCA_Pipeline_RF = Pipeline(
                                steps=[('Scaler', Normalizer()),
                                       ('PCA', PCA(n_components=32)),
                                       ('RF', RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))])

Scaled_PCA_Pipeline_RF

In [None]:
Scaled_PCA_Pipeline_RF.fit(X_train_34, y_train)

Scaled_PCA_Pipeline_RF_Predictions = Scaled_PCA_Pipeline_RF.predict(X_test_34)

mean_absolute_error(y_test, Scaled_PCA_Pipeline_RF_Predictions)

In [None]:
#Create a Pipeline of Encoder (placeholder), Normalizer scaling, PCA, Random Forest

from sklearn.pipeline import Pipeline 

# The 'Encoder' step was left unfinished: a one-element tuple with no trailing comma,
# which Python evaluates as a call on a tuple and raises TypeError.
# 'passthrough' keeps the named slot as a no-op until a real encoder is chosen.
# TODO: replace 'passthrough' with the intended encoder/column transformer.
Scaled_PCA_Pipeline_RF = Pipeline(
                                steps=[('Encoder', 'passthrough'),
                                       ('Scaler', Normalizer()),
                                       ('PCA', PCA(n_components=32)),
                                       ('RF', RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42))])

Scaled_PCA_Pipeline_RF

# Build a Model Based on the Encoded Features  

In [None]:
X_train_Dummies = train_df[['onehotencoder__Time_02:00:00', 'onehotencoder__Time_05:00:00',
       'onehotencoder__Time_06:00:00', 'onehotencoder__Time_09:00:00',
       'onehotencoder__Time_10:00:00', 'onehotencoder__Time_13:00:00',
       'onehotencoder__Time_14:00:00', 'onehotencoder__Time_17:00:00',
       'onehotencoder__Time_18:00:00', 'onehotencoder__Time_21:00:00',
       'onehotencoder__Time_22:00:00', 'onehotencoder__Trend Status_Bullish',
       'onehotencoder__5075 Trend Status_Bullish',
       'onehotencoder__75100 Trend Status_Bullish',
       'onehotencoder__100125 Trend Status_Bullish',
       'onehotencoder__Opportunity_Unfavorable',
       'onehotencoder__Order Type_SELL', 'onehotencoder__k_group_Oversold',
       'onehotencoder__k_group_Undersold',
       'onehotencoder__k_group_slightly_oversold']]

X_test_Dummies = test_df[['onehotencoder__Time_02:00:00', 'onehotencoder__Time_05:00:00',
       'onehotencoder__Time_06:00:00', 'onehotencoder__Time_09:00:00',
       'onehotencoder__Time_10:00:00', 'onehotencoder__Time_13:00:00',
       'onehotencoder__Time_14:00:00', 'onehotencoder__Time_17:00:00',
       'onehotencoder__Time_18:00:00', 'onehotencoder__Time_21:00:00',
       'onehotencoder__Time_22:00:00', 'onehotencoder__Trend Status_Bullish',
       'onehotencoder__5075 Trend Status_Bullish',
       'onehotencoder__75100 Trend Status_Bullish',
       'onehotencoder__100125 Trend Status_Bullish',
       'onehotencoder__Opportunity_Unfavorable',
       'onehotencoder__Order Type_SELL', 'onehotencoder__k_group_Oversold',
       'onehotencoder__k_group_Undersold',
       'onehotencoder__k_group_slightly_oversold']]




In [None]:
X_train_Dummies.shape, X_test_Dummies.shape

In [None]:
y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#Create Linear Regresion Model
Lin_Reg_Dummy_model = LinearRegression().fit(X_train_Dummies, y_train)
Lin_Reg_Dummy_Prediction = Lin_Reg_Dummy_model.predict(X_test_Dummies)

mean_absolute_error(y_test, Lin_Reg_Dummy_Prediction)

## Binning (Grouping / Aggregating)

In [None]:
train_df['remainder__2D High in Pips'].hist()

### Cluster 

In [None]:
plt.scatter(df['Trade_Week_Year'], df['volume'])

In [None]:
from sklearn.cluster import KMeans

# Two training columns used for clustering: trade week-of-year and volume.
X_Train_Trade_Week_Year_Volume = train_df[['remainder__Trade_Week_Year','remainder__volume']]

# random_state is fixed so centroid initialisation - and therefore the cluster ids
# used as features further down - is identical on every run.
# NOTE: the fitted model is bound to the name `KMeans`, shadowing the imported class.
# The name is kept because later cells read `KMeans.labels_` / `KMeans.predict`.
KMeans = KMeans(n_clusters=7, random_state=42).fit(X_Train_Trade_Week_Year_Volume)

KMeans.labels_

In [None]:
import plotly.express as px

In [None]:

import plotly.io as pio
pio.renderers.default = "notebook_connected"


px.scatter(x=train_df['remainder__Trade_Week_Year'], y=train_df['remainder__volume'], color=KMeans.labels_)

In [None]:
# One-hot encode cluster membership with a fixed width of KMeans.n_clusters columns.
# Indexing an identity matrix by cluster id gives train and test the same column layout
# even if some cluster never occurs in the test rows (separate get_dummies calls would
# then produce arrays of different widths and the model could not predict).
cluster_one_hot = np.eye(KMeans.n_clusters, dtype=int)
X_train_Cluster = cluster_one_hot[KMeans.labels_]


#Predict the clusters for the Test data
cluster_feature_cols = ['remainder__Trade_Week_Year','remainder__volume']
X_Test_Trade_Week_Year_Volume = test_df[cluster_feature_cols].to_numpy()
# Predict from the DataFrame so the column names match those seen when the model was fitted.
X_Test_Cluster = cluster_one_hot[KMeans.predict(test_df[cluster_feature_cols])]


# Linear regression on cluster membership alone.
Lin_Reg_Cluster  = LinearRegression().fit(X_train_Cluster, y_train)
Lin_Reg_Cluster_Predictions = Lin_Reg_Cluster.predict(X_Test_Cluster)

mean_absolute_error(y_test, Lin_Reg_Cluster_Predictions)

In [None]:
import pandas as pd

# Take the cluster count from the fitted model rather than from the largest label seen,
# so the encoding width is right even if the highest cluster id is missing from the data.
n_clusters = KMeans.n_clusters
all_possible_clusters = pd.Series(range(n_clusters))

# For training data: categorical with the full category list -> one column per cluster
X_train_Cluster = pd.get_dummies(pd.Categorical(pd.Series(KMeans.labels_), categories=all_possible_clusters)).astype(int).to_numpy()

# For test data: predict once and encode with the same category list
test_clusters = KMeans.predict(X_Test_Trade_Week_Year_Volume)
X_Test_Cluster = pd.get_dummies(pd.Categorical(pd.Series(test_clusters), categories=all_possible_clusters)).astype(int).to_numpy()


In [None]:
Lin_Reg_Cluster  = LinearRegression().fit(X_train_Cluster, y_train)
Lin_Reg_Cluster_Predictions = Lin_Reg_Cluster.predict(X_Test_Cluster)

mean_absolute_error(y_test, Lin_Reg_Cluster_Predictions)

In [None]:
# Plotting scatter plot
plt.scatter(y_test, Lin_Reg_Cluster_Predictions, alpha=0.5)
plt.title('Scatter Plot of Predictions vs Actuals')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()

In [None]:
# Compare the cluster-based linear-regression predictions with the held-out targets.
actual = np.array(y_test)
predicted = np.array(Lin_Reg_Cluster_Predictions)
errors = actual - predicted  # signed residual (actual - predicted) per test row

fig, ax1 = plt.subplots()

# Primary (left) axis: predicted vs. actual scatter
ax1.scatter(actual, predicted, alpha=0.5, color='blue', label='Predicted vs. Actual')
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Secondary (right) axis: residuals on their own scale
ax2 = ax1.twinx()
ax2.set_ylabel('Error', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# One vectorised call draws a segment from 0 to each residual, explicitly on ax2,
# so the right-hand 'Error' ticks describe exactly what is drawn.
ax2.vlines(actual, 0, errors, color='red', alpha=0.7, linewidth=0.5)

# Optional: Add legend
ax1.legend()

# The plotted predictions come from Lin_Reg_Cluster, not from a random forest.
ax1.set_title('Cluster Linear Regression Predictions vs Actuals with Error Visualization')
plt.show()

In [None]:
# Random forest on the one-hot cluster features; random_state fixed for a reproducible MAE.
RF_Cluster  = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42).fit(X_train_Cluster, y_train)
RF_Cluster_Predictions = RF_Cluster.predict(X_Test_Cluster)

mean_absolute_error(y_test, RF_Cluster_Predictions)

### Feature Selection 

In [None]:
X_train_Cluster.shape, X_train_Dummies.shape

In [None]:
#Concatenante Training Arrays 

X_train_full = np.concatenate([X_train_Cluster, X_train_Dummies], axis=1)

X_train_full.shape

In [None]:
X_Test_Cluster.shape,  X_test_Dummies.shape

In [None]:
#Concatenante Tets Arrays 

X_test_full = np.concatenate([X_Test_Cluster,  X_test_Dummies], axis=1)

X_test_full.shape

In [None]:
# Random forest on the concatenated cluster + dummy features; random_state fixed for a reproducible MAE.
RF_Model_Full = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42).fit(X_train_full, y_train)

RF_Model_Predictions = RF_Model_Full.predict(X_test_full)

mean_absolute_error(y_test, RF_Model_Predictions)

### Test Train 

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
Y = df_model[['remainder__2D High in Pips']]
X = df_model.drop('remainder__2D High in Pips', axis=1)

In [None]:
 #Split the data into training and test set 
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=.204, random_state=42, stratify=X['onehotencoder__Order Type_SELL'])

print(f"Training Set: {X_train.shape}\
        \nTest Set: {X_test.shape}")

# Baseline Model 

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
#Create a New Training DF based on the Split 
# Re-wrap the target as a one-column frame aligned to the feature index, then join features and target.
y_train = pd.DataFrame(y_train, index=X_train.index, columns=['remainder__2D High in Pips'])

full_df_train = pd.concat([X_train,y_train ],axis=1)

#Create a new Test DF based on the Split 
y_test = pd.DataFrame(y_test, index=X_test.index, columns=['remainder__2D High in Pips'])

full_df_test = pd.concat([X_test,y_test ],axis=1)


# Report both shapes (the test label previously carried a stray leading "T").
print(f"Base_Model_Training Set: {full_df_train.shape}\
        \nBase_Model_Test Set: {full_df_test.shape}")


# train_df.fillna(0, inplace=True)
# test_df.fillna(0, inplace=True)


In [None]:
# Pull the regression target out of the split frames as plain NumPy vectors.
train_target = full_df_train['remainder__2D High in Pips']
y_train = train_target.to_numpy()
y_test = full_df_test['remainder__2D High in Pips'].to_numpy()

# The naive baseline predicts the training-set mean for every test row.
average_2d_high_for_all_trades = train_target.mean()
baseline_model_test_predictions = [average_2d_high_for_all_trades for _ in range(len(y_test))]

mean_absolute_error(baseline_model_test_predictions, y_test)

### 1. Dimensionality Reduction (PCA)

In [None]:
#Observe the correlation between 34 features 

full_df_train[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score']].corr()

In [None]:
X_train34 = full_df_train[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score']].to_numpy()
X_train34.shape

In [None]:
X_test34 = full_df_test[['onehotencoder__Trend Status_Bullish',
 'onehotencoder__5075 Trend Status_Bullish',
 'onehotencoder__75100 Trend Status_Bullish',
 'onehotencoder__100125 Trend Status_Bullish',
 'onehotencoder__Opportunity_Unfavorable',
 'onehotencoder__Order Type_SELL',
 'onehotencoder__k_group_Oversold',
 'onehotencoder__k_group_Undersold',
 'onehotencoder__k_group_slightly_oversold',
 'remainder__open',
 'remainder__high',
 'remainder__low',
 'remainder__close',
 'remainder__volume',
 'remainder__Trade_Week_Year',
 'remainder__Trade_Week_Month',
 'remainder__Trade_Day_Week',
 'remainder__Day',
 'remainder__Month',
 'remainder__25EMA',
 'remainder__50EMA',
 'remainder__75EMA',
 'remainder__100EMA',
 'remainder__125EMA',
 'remainder__Spread',
 'remainder__%K',
 'remainder__%D',
 'remainder__ADX',
 'remainder__ADXR',
 'remainder__slowk',
 'remainder__slowd',
'remainder__candle_bullish_score',
 'remainder__candle_bearish_score']].to_numpy()

X_test34.shape

In [None]:
#Get error of RF model trained on (X_train34, y_train) and tested on (X_test34, y_test)

from sklearn.ensemble import RandomForestRegressor

# random_state is fixed so this MAE is reproducible between runs.
RF_model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42).fit(X_train34, y_train)
RF_predictions = RF_model.predict(X_test34)
mean_absolute_error(y_test, RF_predictions)

### 

In [None]:
y_train = train_df['remainder__2D High in Pips'].to_numpy()
y_test = test_df['remainder__2D High in Pips'].to_numpy()

average_2d_high_for_all_trades = train_df['remainder__2D High in Pips'].mean()

baseline_model_test_predictions =  [average_2d_high_for_all_trades] * len(y_test)

mean_absolute_error(baseline_model_test_predictions, y_test)

In [None]:
average_2d_high_for_all_trades = train_df['remainder__2D High in Pips'].mean()

baseline_model_test_predictions =  [average_2d_high_for_all_trades] * len(y_test)

mean_absolute_error(baseline_model_test_predictions, y_test)

In [None]:
X = np.array(ct.fit_transform(X))

In [None]:
??pd.concat

In [None]:
# test = ohe.fit_transform(df[['Trend Status','5075 Trend Status', '75100 Trend Status',
#        '100125 Trend Status', 'Action', 'Opportunity', 'Order Type']]).to_array()

# test.values

In [None]:
df.columns[20:40]

In [None]:
#Subset of features to keep
df= df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA','100EMA', '125EMA',
        'Trend Status', 'Spread','5075 Trend Status', '75100 Trend Status',
       '100125 Trend Status', 'Action', 'Opportunity', 'Order Type','2D Lowest Price',
       '2D Highest Price', '2D Low in Pips', '2D High in Pips',
       '2D Trade Status', '2D Hard Stop Price', '2D Hard Stop Loss',
       '2D Trend Change Stop Price', '2D Trend Change Stop Loss', '%K', '%D', 'k_group', 'ADX', 'ADXR', 'slowk',
       'slowd']]

In [None]:
#Apply one-hot encoding 
df= pd.get_dummies(df,columns=['Trend Status','5075 Trend Status', '75100 Trend Status',
       '100125 Trend Status', 'Action', 'Opportunity', 'Order Type','k_group'])

In [None]:
df.columns

In [None]:
X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)


## Linear Model 

### Linear Model on All Data

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
#Create and fit linear regression model 
lin_model = LinearRegression().fit(X_train,Y_train)

#Make Predictions on test data
y_preds = lin_model.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)

print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)

In [None]:
import matplotlib.pyplot as plt

plt.scatter(Y_test, y_preds)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actuals vs. Predictions")
plt.grid(True)
plt.show()


In [None]:
# Feature Importance Analysis 
coef = lin_model.coef_
print('Feature Importance:', coef)

In [None]:
lin_model.score(X_test,Y_test)

In [None]:
#Perform k-fold cross validation of the linear model on all trades
# scoring returns negated MAE (higher is better), so the sign is flipped for display.
# Named for what it holds: this section uses all trades, not only SELL trades.
all_trade_linreg_scores = cross_val_score(LinearRegression(),X, Y,cv=5, scoring ='neg_mean_absolute_error' )

print('Cross-validation scores:', -all_trade_linreg_scores)
print('Average MAE:', -all_trade_linreg_scores.mean())

### Linear Model on Buy Trades HIGH 

In [None]:
# Keep only the rows flagged as BUY orders by the one-hot 'Order Type_BUY' column.
is_buy_order = df['Order Type_BUY'] == True
buy_df = df.loc[is_buy_order]

buy_df.shape

In [None]:
#Visualize the high of buy trades 
# `go` is not imported anywhere in the visible part of this notebook (only plotly.express is),
# so it is imported here; the import is a no-op if an earlier cell already provides it.
import plotly.graph_objects as go

# Scatter of the close price at trade open (x) against the 2-day high in pips (y).
fig = go.Figure(data=go.Scatter(
    x=buy_df['close'],  # Access the 'x' column
    y=buy_df['2D High in Pips'],  # Access the 'y' column
    mode='markers',
#     marker=dict(
#         size=10,
#         color=colors,  # Map colors from the 'color_column'
#         opacity=0.8
#     )
))

# Customize the layout
fig.update_layout(
    title="Buy Trades 2D High Price",
    xaxis_title="Closed Price at Trade Open",
    yaxis_title="2D High in Pips",
#     grid=True
)
fig.show()

In [None]:
buy_trade_high_mean = buy_df['2D High in Pips'].mean()
buy_trade_high_mean

In [None]:
#Seperate features and label 
#Indepedent Variables 
X = buy_df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA', '100EMA',
       '125EMA', 'Spread','Trend Status_Bearish',
       'Trend Status_Bullish', '5075 Trend Status_Bearish',
       '5075 Trend Status_Bullish', '75100 Trend Status_Bearish',
       '75100 Trend Status_Bullish', '100125 Trend Status_Bearish',
       '100125 Trend Status_Bullish', 'Action_Ultimate Action',
       'Opportunity_Optimal', 'Opportunity_Unfavorable','k_group_Oversold',
       'k_group_Undersold', 'k_group_slightly_oversold','%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd']]
#Dependent Variable
Y = buy_df['2D High in Pips'] 

X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

In [None]:
#Create and fit linear regression model (fitted once; the duplicate fit was redundant)
lin_model = LinearRegression().fit(X_train,Y_train)

#Make Predictions on test data
y_preds = lin_model.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)

# The mean of the BUY target is printed as a scale reference for the errors.
print('Mean', buy_trade_high_mean)
print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)

### Linear Model on SELL Trades HIGH 

In [None]:
# Create a subset of df with Sell trades: rows where the one-hot 'Order Type_SELL' column is True.
sell_df =  df[df['Order Type_SELL'] == True]

sell_df.shape

In [None]:
#Visualize the high of sell trades 
# `go` is not imported anywhere in the visible part of this notebook (only plotly.express is),
# so it is imported here; the import is a no-op if an earlier cell already provides it.
import plotly.graph_objects as go

# Scatter of the close price at trade open (x) against the 2-day high in pips (y).
fig = go.Figure(data=go.Scatter(
    x=sell_df['close'],  # Access the 'x' column
    y=sell_df['2D High in Pips'],  # Access the 'y' column
    mode='markers',
#     marker=dict(
#         size=10,
#         color=colors,  # Map colors from the 'color_column'
#         opacity=0.8
#     )
))

# Customize the layout (the data is sell_df, so the title says Sell, not Buy)
fig.update_layout(
    title="Sell Trades 2D High Price",
    xaxis_title="Closed Price at Trade Open",
    yaxis_title="2D High in Pips",
#     grid=True
)
fig.show()

In [None]:
sell_trade_high_mean = sell_df['2D High in Pips'].mean()
sell_trade_high_mean

In [None]:
#Seperate features and label 
#Indepedent Variables 
X = sell_df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA', '100EMA',
       '125EMA', 'Spread','Trend Status_Bearish',
       'Trend Status_Bullish', '5075 Trend Status_Bearish',
       '5075 Trend Status_Bullish', '75100 Trend Status_Bearish',
       '75100 Trend Status_Bullish', '100125 Trend Status_Bearish',
       '100125 Trend Status_Bullish', 'Action_Ultimate Action',
       'Opportunity_Optimal', 'Opportunity_Unfavorable','k_group_Oversold',
       'k_group_Undersold', 'k_group_slightly_oversold','%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd']]
#Dependent Variable
Y = sell_df['2D High in Pips'] 

X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=42)

In [None]:
#Create and fit linear regression model (fitted once; the duplicate fit was redundant)
lin_model = LinearRegression().fit(X_train,Y_train)

#Make Predictions on test data
y_preds = lin_model.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)

print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)

In [None]:
import matplotlib.pyplot as plt

plt.scatter(Y_test, y_preds)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actuals vs. Predictions")
plt.grid(True)
plt.show()


In [None]:
#Perform k-folk cross validation 
sell_trade_high_scores = cross_val_score(LinearRegression(),X, Y,cv=5, scoring ='neg_mean_absolute_error' )

print('Cross-validation scores:', -sell_trade_high_scores)
print('Average MAE:', -sell_trade_high_scores.mean())

## Gradient Boosting Model 

### Gradient Boost Model on all Data

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
#Indepedent Variables 
X = df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA', '100EMA',
       '125EMA', 'Spread','Trend Status_Bearish',
       'Trend Status_Bullish', '5075 Trend Status_Bearish',
       '5075 Trend Status_Bullish', '75100 Trend Status_Bearish',
       '75100 Trend Status_Bullish', '100125 Trend Status_Bearish',
       '100125 Trend Status_Bullish', 'Action_Ultimate Action',
       'Opportunity_Optimal', 'Opportunity_Unfavorable', 'Order Type_BUY',
       'Order Type_SELL','k_group_Oversold',
       'k_group_Undersold', 'k_group_slightly_oversold','%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd']]
#Dependent Variable
Y = df['2D High in Pips'] 

In [None]:
# 80/20 split of all trades, seeded for reproducibility.
X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

# Small gradient-boosting model (5 trees) fitted on the training split.
gbr = GradientBoostingRegressor(n_estimators=5,learning_rate= 0.1, random_state=0).fit(X_train,Y_train)

y_preds =gbr.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)
gbr_score = gbr.score(X_test, Y_test)  # R^2 on the test split

print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)
print('Score', gbr_score)

In [None]:
#Perform k-folk cross validation 
all_trade_high_scores = cross_val_score(GradientBoostingRegressor(n_estimators=5,learning_rate= 0.1, random_state=0),X, Y,cv=5, scoring ='neg_mean_absolute_error' )

print('Cross-validation scores:', -all_trade_high_scores)
print('Average MAE:', -all_trade_high_scores.mean())

In [None]:
plt.scatter(Y_test, y_preds)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actuals vs. Predictions")
plt.grid(True)
plt.show()

### Gradient Boosting Model on Buy Trades HIGH 

In [None]:
#Seperate features and label 
#Indepedent Variables 
X = buy_df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA', '100EMA',
       '125EMA', 'Spread','Trend Status_Bearish',
       'Trend Status_Bullish', '5075 Trend Status_Bearish',
       '5075 Trend Status_Bullish', '75100 Trend Status_Bearish',
       '75100 Trend Status_Bullish', '100125 Trend Status_Bearish',
       '100125 Trend Status_Bullish', 'Action_Ultimate Action',
       'Opportunity_Optimal', 'Opportunity_Unfavorable','k_group_Oversold',
       'k_group_Undersold', 'k_group_slightly_oversold','%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd']]
#Dependent Variable
Y = buy_df['2D High in Pips'] 

X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

In [None]:
buy_trade_high_mean

In [None]:
# Gradient-boosting model (100 trees) fitted on the BUY-trade training split.
gbr = GradientBoostingRegressor(n_estimators=100,learning_rate= 0.1, random_state=0).fit(X_train,Y_train)

y_preds =gbr.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)
gbr_score = gbr.score(X_test, Y_test)  # R^2 on the test split

# The mean of the BUY target is printed as a scale reference for the errors.
print('Mean', buy_trade_high_mean)
print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)
print('Score', gbr_score)

In [None]:
#Perform k-fold cross validation for the BUY-trade gradient-boosting model
# Uses the same hyper-parameters as the `gbr` fitted in this section (n_estimators=100),
# so the cross-validated MAE describes the model whose test metrics are reported.
# Stored under a BUY-specific name; this section works on buy_df, not on all trades.
buy_trade_high_scores = cross_val_score(GradientBoostingRegressor(n_estimators=100,learning_rate= 0.1, random_state=0),X, Y,cv=5, scoring ='neg_mean_absolute_error' )

# scoring returns negated MAE, so the sign is flipped for display.
print('Cross-validation scores:', -buy_trade_high_scores)
print('Average MAE:', -buy_trade_high_scores.mean())

In [None]:
plt.scatter(Y_test, y_preds)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title("Actuals vs. Predictions")
plt.grid(True)
plt.show()

### Gradient Boosting Model on SELL Trades HIGH 

In [None]:
#Seperate features and label 
#Indepedent Variables 
X = sell_df[['open', 'high', 'low', 'close', '25EMA', '50EMA', '75EMA', '100EMA',
       '125EMA', 'Spread','Trend Status_Bearish',
       'Trend Status_Bullish', '5075 Trend Status_Bearish',
       '5075 Trend Status_Bullish', '75100 Trend Status_Bearish',
       '75100 Trend Status_Bullish', '100125 Trend Status_Bearish',
       '100125 Trend Status_Bullish', 'Action_Ultimate Action',
       'Opportunity_Optimal', 'Opportunity_Unfavorable','k_group_Oversold',
       'k_group_Undersold', 'k_group_slightly_oversold','%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd']]
#Dependent Variable
Y = sell_df['2D High in Pips'] 

X_train,X_test,Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=42)

In [None]:
# Gradient-boosting model (100 trees) fitted on the SELL-trade training split.
gbr = GradientBoostingRegressor(n_estimators=100,learning_rate= 0.1, random_state=0).fit(X_train,Y_train)

y_preds =gbr.predict(X_test)

#Evaluate Model Performance 
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is the square root of the MSE computed above (single target).
# The `squared=False` argument is deprecated as of scikit-learn 1.4, so it is not used.
rmse = np.sqrt(mse)
gbr_score = gbr.score(X_test, Y_test)  # R^2 on the test split

# This section models SELL trades, so the SELL mean is the relevant scale reference.
print('Mean', sell_trade_high_mean)
print( 'MAE:', mae)
print( 'MSE:', mse)
print( 'RMSE:', rmse)
print('Score', gbr_score)

In [None]:
# Perform 5-fold cross-validation for the SELL-trade model.
# The estimator uses the same hyper-parameters as the `gbr` fitted in this
# section (n_estimators=100) so the cross-validated MAE describes that model
# rather than a much smaller 5-tree ensemble.
sell_trade_high_scores = cross_val_score(
    GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0),
    X,
    Y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# The scorer reports negated MAE, so the sign is flipped for display.
print('Cross-validation scores:', -sell_trade_high_scores)
print('Average MAE:', -sell_trade_high_scores.mean())

In [None]:
# Scatter the held-out targets against the model's predictions.
fig, ax = plt.subplots()
ax.scatter(Y_test, y_preds)
ax.set(
    xlabel="Actual Values",
    ylabel="Predicted Values",
    title="Actuals vs. Predictions",
)
ax.grid(True)
plt.show()

## Bagging & Gradient Boosting 

### Bagging & Gradient Boosting All Trades

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Independent variables for the all-trades model (column order is preserved exactly).
all_trades_feature_columns = [
    'open', 'high', 'low', 'close',
    '25EMA', '50EMA', '75EMA', '100EMA', '125EMA',
    'Spread',
    'Trend Status_Bearish', 'Trend Status_Bullish',
    '5075 Trend Status_Bearish', '5075 Trend Status_Bullish',
    '75100 Trend Status_Bearish', '75100 Trend Status_Bullish',
    '100125 Trend Status_Bearish', '100125 Trend Status_Bullish',
    'Action_Ultimate Action',
    'Opportunity_Optimal', 'Opportunity_Unfavorable',
    'Order Type_BUY', 'Order Type_SELL',
    'k_group_Oversold', 'k_group_Undersold', 'k_group_slightly_oversold',
    '%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd',
]
X = df[all_trades_feature_columns]

# Dependent variable
Y = df['2D High in Pips']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Bag 100 gradient-boosting regressors, each fitted on a sample of 80% of the
# training rows; oob_score=True additionally computes an out-of-bag score.
basic_bag_model = BaggingRegressor(
    estimator=GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, random_state=0),
    n_estimators=100,
    max_samples=.8,
    oob_score=True,
    random_state=0
)

basic_bag_model.fit(X_train, Y_train)

# Out-of-bag score estimated on the training data (not a test-set score).
# TODO: rename `score` to something more specific once nothing else reads it.
score = basic_bag_model.oob_score_

y_preds = basic_bag_model.predict(X_test)

# Evaluate model performance on the held-out split.
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is derived from MSE directly: the `squared=False` flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
rmse = np.sqrt(mse)

# Reference value for this all-trades model: the mean of its own target Y
# (the BUY-only mean was printed here before).
# NOTE(review): assumes the *_trade_high_mean variables are plain target means - confirm.
print('Mean', Y.mean())
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('Score', score)

### Bagging & Gradient Boosting BUY Trades


In [None]:
# Separate features and label for the BUY trades.
# Independent variables (column order is preserved exactly).
buy_feature_columns = [
    'open', 'high', 'low', 'close',
    '25EMA', '50EMA', '75EMA', '100EMA', '125EMA',
    'Spread',
    'Trend Status_Bearish', 'Trend Status_Bullish',
    '5075 Trend Status_Bearish', '5075 Trend Status_Bullish',
    '75100 Trend Status_Bearish', '75100 Trend Status_Bullish',
    '100125 Trend Status_Bearish', '100125 Trend Status_Bullish',
    'Action_Ultimate Action',
    'Opportunity_Optimal', 'Opportunity_Unfavorable',
    'k_group_Oversold', 'k_group_Undersold', 'k_group_slightly_oversold',
    '%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd',
]
X = buy_df[buy_feature_columns]

# Dependent variable
Y = buy_df['2D High in Pips']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Bag 100 gradient-boosting regressors, each fitted on a sample of 80% of the
# BUY-trade training rows; oob_score=True additionally computes an out-of-bag score.
basic_bag_model = BaggingRegressor(
    estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0),
    n_estimators=100,
    max_samples=.8,
    oob_score=True,
    random_state=0
)

basic_bag_model.fit(X_train, Y_train)

# Out-of-bag score estimated on the training data (not a test-set score).
# TODO: rename `score` to something more specific once nothing else reads it.
score = basic_bag_model.oob_score_

y_preds = basic_bag_model.predict(X_test)

# Evaluate model performance on the held-out split.
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is derived from MSE directly: the `squared=False` flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
rmse = np.sqrt(mse)

print('Mean', buy_trade_high_mean)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('Score', score)

### Bagging & Gradient Boosting SELL Trades


In [None]:
# Separate features and label for the SELL trades.
# Independent variables (column order is preserved exactly).
sell_feature_columns = [
    'open', 'high', 'low', 'close',
    '25EMA', '50EMA', '75EMA', '100EMA', '125EMA',
    'Spread',
    'Trend Status_Bearish', 'Trend Status_Bullish',
    '5075 Trend Status_Bearish', '5075 Trend Status_Bullish',
    '75100 Trend Status_Bearish', '75100 Trend Status_Bullish',
    '100125 Trend Status_Bearish', '100125 Trend Status_Bullish',
    'Action_Ultimate Action',
    'Opportunity_Optimal', 'Opportunity_Unfavorable',
    'k_group_Oversold', 'k_group_Undersold', 'k_group_slightly_oversold',
    '%K', '%D', 'ADX', 'ADXR', 'slowk', 'slowd',
]
X = sell_df[sell_feature_columns]

# Dependent variable
Y = sell_df['2D High in Pips']

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Bag 100 gradient-boosting regressors, each fitted on a sample of 80% of the
# SELL-trade training rows; oob_score=True additionally computes an out-of-bag score.
basic_bag_model = BaggingRegressor(
    estimator=GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, random_state=0),
    n_estimators=100,
    max_samples=.8,
    oob_score=True,
    random_state=0
)

basic_bag_model.fit(X_train, Y_train)

# Out-of-bag score estimated on the training data (not a test-set score).
# TODO: rename `score` to something more specific once nothing else reads it.
score = basic_bag_model.oob_score_

y_preds = basic_bag_model.predict(X_test)

# Evaluate model performance on the held-out split.
mae = mean_absolute_error(Y_test, y_preds)
mse = mean_squared_error(Y_test, y_preds)
# RMSE is derived from MSE directly: the `squared=False` flag of
# mean_squared_error is deprecated as of scikit-learn 1.4.
rmse = np.sqrt(mse)

print('Mean', sell_trade_high_mean)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)
print('Score', score)