In [16]:
# Re-import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the hourly bars. The first CSV column holds the timestamps and becomes the index.
file_path = "data/gspc_hourly_with_indicators.csv"
df = pd.read_csv(file_path, index_col=0, parse_dates=[0])

# Everything downstream is order-dependent: the 'last' close of each day below,
# and the rolling / diff features computed afterwards. Enforce chronological
# order instead of trusting the row order of the file. mergesort is stable, so
# rows sharing a timestamp keep their original relative order, and an already
# sorted file is left unchanged.
df = df.sort_index(kind="mergesort")

df['Date'] = df.index.date  # calendar date of each bar; used as the daily grouping key

# One row per date (daily high, daily low, last close of the day), shifted down
# by one row so each date carries the values of the preceding date present in
# the data. The earliest date has no predecessor and is left as NaN.
daily_pivot_data = (
    df.groupby('Date')
      .agg({'High': 'max', 'Low': 'min', 'Close': 'last'})
      .shift(1)
)

# Attach the previous-day values to every hourly bar of that date, so the pivot
# levels stay constant within a day. Column names that collide with the hourly
# OHLC columns receive the '_prev_day' suffix on the daily side.
df = df.merge(daily_pivot_data, left_on='Date', right_index=True, suffixes=('', '_prev_day'))

# Pivot levels derived from the previous day's high / low / close:
#   Pivot = (H + L + C) / 3
#   S1 = 2 * Pivot - H        R1 = 2 * Pivot - L
#   S2 = Pivot - (H - L)      R2 = Pivot + (H - L)
df['Pivot'] = (df['High_prev_day'] + df['Low_prev_day'] + df['Close_prev_day']) / 3
df['S1'] = (2 * df['Pivot']) - df['High_prev_day']
df['R1'] = (2 * df['Pivot']) - df['Low_prev_day']
df['S2'] = df['Pivot'] - (df['High_prev_day'] - df['Low_prev_day'])
df['R2'] = df['Pivot'] + (df['High_prev_day'] - df['Low_prev_day'])

# The previous-day helper columns are only needed for the formulas above.
# Reassign rather than mutate in place so the step reads as a plain transformation.
df = df.drop(columns=['High_prev_day', 'Low_prev_day', 'Close_prev_day'])

# Signed distance of each hourly close from the pivot and from the first resistance level
df['Dist_Pivot'] = df['Close'] - df['Pivot']
df['Dist_R1'] = df['Close'] - df['R1']

# Volatility scale used for normalisation: 14-bar rolling mean of the bar range
# (High - Low). The first 13 rows have no full window and come out as NaN.
# NOTE(review): despite the column name this is an average high-low range, not a
# true-range ATR (gaps versus the previous close are ignored) - confirm that is intended.
df['ATR_14'] = (df['High'] - df['Low']).rolling(window=14).mean()

# Express the distances in units of recent volatility
df['Norm_Dist_Pivot'] = df['Dist_Pivot'] / df['ATR_14']
df['Norm_Dist_R1'] = df['Dist_R1'] / df['ATR_14']

# Momentum features on the close: relative change, first difference, second difference
df['Change'] = df['Close'].pct_change()
df['Slope'] = df['Close'].diff()
df['Acceleration'] = df['Slope'].diff()

selected_features = ['Change', 'Slope', 'Acceleration', 'Norm_Dist_Pivot', 'Norm_Dist_R1']

# A zero denominator (14 consecutive bars with High == Low, or a zero previous
# close in pct_change) yields +/-inf. dropna() does not treat inf as missing, and
# the StandardScaler applied to these columns rejects infinite input, so convert
# inf to NaN first and let the same dropna() discard those rows.
df[selected_features] = df[selected_features].replace(
    [float('inf'), float('-inf')], float('nan')
)

# Remove rows where any feature is undefined: the first day (no previous-day
# pivot), the warm-up rows of the rolling window and the diffs, and any row
# neutralised by the guard above.
df = df.dropna(subset=selected_features)

# Put the features on a common scale before clustering, so that no single
# feature dominates the k-means distance computation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[selected_features])

# Label every bar with one of 4 k-means clusters. The original comment attributes
# the choice of 4 to an elbow-method analysis, which is not part of this cell.
# random_state and n_init are pinned so the labels are reproducible across runs.
#
# NOTE(review): the scaler and k-means are both fit on the full history, and
# 'Market_Segment' is later used as a model feature under TimeSeriesSplit. The
# cluster label of a test-fold row is therefore learned with knowledge of later
# data (look-ahead leakage). For a leak-free estimate, fit both on the training
# rows of each fold only.
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42, n_init=10)
df['Market_Segment'] = kmeans.fit_predict(X_scaled)

# Preview the result. As the last expression of the cell the frame is shown with
# the notebook's rich table display; print() falls back to a wrapped,
# backslash-continued text dump for a frame this wide.
df.head()


                             Close     High      Low     Open  Volume  \
Datetime                                                                
2023-02-22 19:00:00+00:00  3989.25  4025.00  3988.50  4016.75  367633   
2023-02-22 20:00:00+00:00  4000.00  4001.25  3983.75  3989.25  314003   
2023-02-23 08:00:00+00:00  4018.25  4023.25  4013.50  4014.50   28416   
2023-02-23 09:00:00+00:00  4015.25  4022.75  4009.25  4018.50   23686   
2023-02-23 10:00:00+00:00  4015.25  4016.50  4011.75  4015.50   13246   

                                 20_MA    20_STD       200_MA        50_MA  \
Datetime                                                                     
2023-02-22 19:00:00+00:00  4008.482143  6.988634  4008.482143  4008.482143   
2023-02-22 20:00:00+00:00  4007.916667  7.081582  4007.916667  4007.916667   
2023-02-23 08:00:00+00:00  4008.562500  7.312945  4008.562500  4008.562500   
2023-02-23 09:00:00+00:00  4008.955882  7.264122  4008.955882  4008.955882   
2023-02-23 10:00:00+

In [43]:
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_sample_weight
from imblearn.over_sampling import SMOTE
from bayes_opt import BayesianOptimization
import numpy as np

# Model inputs: the five engineered price features plus the k-means cluster label.
selected_features = [
    'Change',
    'Slope',
    'Acceleration',
    'Norm_Dist_Pivot',
    'Norm_Dist_R1',
    'Market_Segment',
]
X = df.loc[:, selected_features]

# Class label. NOTE(review): 'Target' is not created in the cells visible here;
# it has to exist in df before this cell runs.
y = df.loc[:, 'Target']

# Forward-chaining cross-validation: 5 splits over the rows in their existing
# order, so rows must be chronological for each test fold to follow its training
# rows in time.
# NOTE(review): this only orders the rows. 'Market_Segment' comes from a scaler
# and k-means fit on the whole dataset, so that feature still carries
# information from later rows into earlier folds.
tscv = TimeSeriesSplit(n_splits=5)

# 🚀 Step 1: Bayesian Optimization Function
def xgb_evaluate(n_estimators, learning_rate, max_depth, gamma):
    """Objective for the Bayesian search: mean macro-F1 over the time-series folds.

    Reads the notebook-level ``X``, ``y`` and ``tscv``. For every fold the
    training rows are oversampled with SMOTE, an XGBoost classifier is fit on
    the resampled rows, and macro-averaged F1 is measured on the untouched
    test rows.

    Args:
        n_estimators: Number of boosting rounds. Proposed as a float by the
            optimiser and truncated with ``int()``.
        learning_rate: Boosting learning rate, passed through unchanged.
        max_depth: Maximum tree depth. Proposed as a float by the optimiser
            and truncated with ``int()``.
        gamma: XGBoost ``gamma`` parameter, passed through unchanged.

    Returns:
        Mean of the per-fold macro-F1 scores (the value the optimiser maximises).
    """
    model_params = dict(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        gamma=gamma,
        random_state=42,
        eval_metric='mlogloss',
    )

    fold_f1 = []

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Oversample the training rows only; the test fold keeps its natural
        # class distribution.
        # NOTE(review): SMOTE builds synthetic rows by interpolating feature
        # values, so the integer cluster id 'Market_Segment' can come out
        # fractional in the resampled set. SMOTENC with that column marked as
        # categorical would avoid this - confirm before changing, as it alters results.
        resampler = SMOTE(sampling_strategy='auto', random_state=42)
        X_balanced, y_balanced = resampler.fit_resample(X_train, y_train)

        # No sample_weight is passed: with sampling_strategy='auto' SMOTE brings
        # every class up to the majority count, so 'balanced' weights computed
        # on the resampled labels are all exactly 1.0, equivalent to
        # unweighted training.
        model = XGBClassifier(**model_params)
        model.fit(X_balanced, y_balanced)

        # Only macro-F1 feeds the objective; accuracy was previously computed
        # per fold and then discarded, so it is no longer calculated here.
        y_pred = model.predict(X_test)
        fold_f1.append(f1_score(y_test, y_pred, average='macro'))

    return np.mean(fold_f1)

# 🚀 Step 2: Run Bayesian Optimization
optimizer = BayesianOptimization(
    f=xgb_evaluate,
    # Search box, one (low, high) interval per argument of xgb_evaluate. All
    # values are proposed as floats; xgb_evaluate truncates the integer ones.
    pbounds={
        "n_estimators": (100, 300),    # number of trees
        "learning_rate": (0.01, 0.3),  # learning rate
        "max_depth": (3, 10),          # tree depth
        "gamma": (0, 5),               # regularization
    },
    random_state=42,
)

# Search budget, spelled out: 5 random warm-up probes followed by 10 guided
# steps, i.e. 15 evaluations of xgb_evaluate, each of which refits the model on
# every fold. init_points=5 is the library default that the previous
# `maximize(n_iter=10)` call relied on implicitly.
optimizer.maximize(init_points=5, n_iter=10)

# Best point found. Apply the same int() truncation that xgb_evaluate applied,
# so the final model uses exactly the configuration that was scored.
best_params = optimizer.max['params']
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])

print("🔎 Best Found Hyperparameters:", best_params)

# Re-run the cross-validation with the selected configuration to collect
# accuracy alongside macro-F1.
# NOTE(review): these are the same folds the search was tuned on, so the numbers
# below are optimistic; a held-out final period would give an unbiased estimate.
model = XGBClassifier(**best_params, random_state=42, eval_metric='mlogloss')

accuracies, f1_scores = [], []

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Oversample the training rows only; the test fold is left untouched
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # No sample_weight is passed: with sampling_strategy='auto' SMOTE equalises
    # the class counts, so 'balanced' weights on the resampled labels are all
    # exactly 1.0, equivalent to unweighted training.
    model.fit(X_train_resampled, y_train_resampled)

    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

# Fold-averaged metrics. Leaving the frame as the last expression of the cell
# renders it as a rich table instead of a plain-text print.
model_performance = pd.DataFrame({'Metric': ['Accuracy', 'F1 Score'],
                                  'Mean Value': [np.mean(accuracies), np.mean(f1_scores)]})

model_performance


|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.4734   [39m | [39m1.873    [39m | [39m0.2857   [39m | [39m8.124    [39m | [39m219.7    [39m |
| [39m2        [39m | [39m0.4448   [39m | [39m0.7801   [39m | [39m0.05524  [39m | [39m3.407    [39m | [39m273.2    [39m |
| [39m3        [39m | [39m0.4009   [39m | [39m3.006    [39m | [39m0.2153   [39m | [39m3.144    [39m | [39m294.0    [39m |
| [39m4        [39m | [39m0.405    [39m | [39m4.162    [39m | [39m0.07158  [39m | [39m4.273    [39m | [39m136.7    [39m |
| [39m5        [39m | [39m0.4617   [39m | [39m1.521    [39m | [39m0.1622   [39m | [39m6.024    [39m | [39m158.2    [39m |
| [35m6        [39m | [35m0.492    [39m | [35m0.8442   [39m | [35m0.2483   [39m | [35m8.385    [39m | [35m219.3    [39m |
| [39m7        [39m | [39m0.4869   [39m | [

In [47]:
# Class balance of the label column: number of rows per distinct Target value,
# most frequent class first.
df.loc[:, "Target"].value_counts()

Target
1    4594
0    1088
2    1085
Name: count, dtype: int64