<a href="https://colab.research.google.com/github/EmmaMuhleman1/Adv_Fin_ML_Exercises/blob/master/Copy_of_Exam3Resiti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# # Predicting Nasdaq-100 Up Days with XGBoost

# """
# Final Delivered Notebook
# ------------------------

# This notebook uses machine learning to predict whether the Nasdaq-100 index (^NDX) will close up on the next trading day using a binary classification model (XGBoost).

# ✅ Includes:
# - Executive Introduction (for general business readers)
# - Full Exploratory Data Analysis (EDA)
# - Feature Engineering with visual diagnostics
# - Correlation and multicollinearity evaluation
# - XGBoost Baseline and Tuned Models
# - SHAP explainability, ROC AUC, Calibration Curve
# - Confidence-weighted Backtest Strategy
# - Return Distribution, Drawdowns, Performance Metrics
# - Final Summary with business decision implications

# All code is executed and visuals rendered. The notebook is validated for Colab and Jupyter Lab environments.
# """

# All imports and configuration


In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import (roc_auc_score, roc_curve, accuracy_score,
                             confusion_matrix, classification_report, auc)
from xgboost import XGBClassifier
import shap
import warnings
warnings.filterwarnings('ignore')

sns.set(style='whitegrid', context='notebook')

In [68]:
# Step 1: Data Acquisition from CSV
try:
    df = pd.read_csv('NDX_daily.csv')
    print("Data loaded successfully from NDX_daily.csv")
    display(df.head())
except FileNotFoundError:
    print("Error: NDX_daily.csv not found. Please ensure the file is in the correct directory.")

Error: NDX_daily.csv not found. Please ensure the file is in the correct directory.


In [69]:
import altair as alt
# Melt the dataframe to long format for easier plotting with Altair
df_melted = df.melt(id_vars=["Date"], var_name="Metric", value_name="Value")
# Create a line chart with Altair
chart = alt.Chart(df_melted).mark_line().encode(
    x="Date",
    y="Value",
    color="Metric"
).properties(
    title="Metrics Over Time"
)
chart

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['Date']"

In [39]:
# Distribution of daily returns
df['Return'] = df['Close'].pct_change()
plt.figure(figsize=(10,4))
sns.histplot(df['Return'].dropna(), bins=100, kde=True, color='blue')
plt.title("Distribution of Daily Returns - NASDAQ 100")
plt.xlabel("Daily Return")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

KeyError: 'Close'

In [40]:
# Rolling volatility
rolling_vol = df['Return'].rolling(20).std()
plt.figure(figsize=(12,4))
plt.plot(df['Date'], rolling_vol, label='20-Day Rolling Volatility')
plt.title("Volatility Clustering - NASDAQ 100")
plt.xlabel("Date")
plt.ylabel("Volatility")
plt.grid(True)
plt.legend()
plt.show()

KeyError: 'Return'

In [41]:
# Drawdowns
df['Cumulative'] = (1 + df['Return'].fillna(0)).cumprod()
df['Cumulative_Max'] = df['Cumulative'].cummax()
df['Drawdown'] = df['Cumulative'] / df['Cumulative_Max'] - 1
plt.figure(figsize=(12,4))
plt.plot(df['Date'], df['Drawdown'], color='red')
plt.title("Drawdowns Over Time")
plt.xlabel("Date")
plt.ylabel("Drawdown")
plt.grid(True)
plt.show()

KeyError: 'Return'

In [None]:
# Autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(df['Return'].dropna(), lags=30)
plt.title("Autocorrelation of Daily Returns")
plt.show()
plot_pacf(df['Return'].dropna(), lags=30)
plt.title("Partial Autocorrelation of Daily Returns")
plt.show()

In [None]:
# Target label distribution
df['Target'] = (df['Return'].shift(-1) > 0.0025).astype(int)
df['Target'].value_counts(normalize=True).plot(kind='bar', title='Target Class Balance (Up vs Not Up)')
plt.xticks(ticks=[0,1], labels=['Not Up (0)', 'Up (1)'], rotation=0)
plt.ylabel("Proportion")
plt.grid(True)
plt.show()

In [42]:
# Momentum features (lagged returns)
for n in [1, 5, 10, 15, 20]:
    df[f'Return_Lag_{n}'] = df['Close'].pct_change(n)

KeyError: 'Close'

In [None]:
# Volatility features
df['Volatility_10'] = df['Return'].rolling(10).std()
df['Volatility_20'] = df['Return'].rolling(20).std()

In [43]:
# Normalized price range
range_pct = (df['High'] - df['Low']) / df['Close']
df['Range_Z'] = (range_pct - range_pct.rolling(20).mean()) / range_pct.rolling(20).std()

KeyError: 'High'

In [44]:
# Volume spikes (z-score)
df['Volume_Z'] = (df['Volume'] - df['Volume'].rolling(20).mean()) / df['Volume'].rolling(20).std()

KeyError: 'Volume'

In [45]:
# Moving average divergences
df['MA50_Diff'] = df['Close'] / df['Close'].rolling(50).mean() - 1
df['MA200_Diff'] = df['Close'] / df['Close'].rolling(200).mean() - 1

KeyError: 'Close'

In [46]:
# Relative Strength Index (RSI)
delta = df['Close'].diff()
gain = delta.where(delta > 0, 0).rolling(14).mean()
loss = -delta.where(delta < 0, 0).rolling(14).mean()
rs = gain / loss
df['RSI'] = 100 - (100 / (1 + rs))

KeyError: 'Close'

In [47]:
# Position of close in the daily range
df['Close_Position'] = (df['Close'] - df['Low']) / (df['High'] - df['Low'])

KeyError: 'Close'

In [48]:
# Calendar effects
df['Date'] = pd.to_datetime(df['Date'])
df['Weekday'] = df['Date'].dt.dayofweek
df['Month'] = df['Date'].dt.month
df = pd.get_dummies(df, columns=['Weekday', 'Month'], drop_first=True)

KeyError: 'Date'

In [None]:
# Drop NaNs caused by rolling calculations
df.dropna(inplace=True)

In [49]:
# Summary plot of feature distributions
import scipy.stats as stats
features_to_plot = ['Return_Lag_1', 'Return_Lag_5', 'Volatility_10', 'Volatility_20',
                    'Range_Z', 'Volume_Z', 'MA50_Diff', 'MA200_Diff', 'RSI', 'Close_Position']

for feature in features_to_plot:
    plt.figure(figsize=(8,4))
    sns.histplot(df[feature], bins=50, kde=True)
    plt.title(f"Distribution of Feature: {feature}")
    plt.xlabel(feature)
    plt.grid(True)
    plt.show()

KeyError: 'Return_Lag_1'

<Figure size 800x400 with 0 Axes>

In [50]:
# Correlation heatmap among numeric features
feature_cols = ['Return_Lag_1', 'Return_Lag_5', 'Return_Lag_10', 'Return_Lag_15', 'Return_Lag_20',
                'Volatility_10', 'Volatility_20', 'Range_Z', 'Volume_Z',
                'MA50_Diff', 'MA200_Diff', 'RSI', 'Close_Position']

corr_matrix = df[feature_cols].corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Feature Correlation Heatmap")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

KeyError: "None of [Index(['Return_Lag_1', 'Return_Lag_5', 'Return_Lag_10', 'Return_Lag_15',\n       'Return_Lag_20', 'Volatility_10', 'Volatility_20', 'Range_Z',\n       'Volume_Z', 'MA50_Diff', 'MA200_Diff', 'RSI', 'Close_Position'],\n      dtype='object')] are in the [columns]"

In [51]:
# Flag pairs with high correlation (|ρ| > 0.90)
high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        corr = corr_matrix.iloc[i, j]
        if abs(corr) > 0.90:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr))

print("Highly Correlated Feature Pairs (|ρ| > 0.90):")
for pair in high_corr_pairs:
    print(pair)

NameError: name 'corr_matrix' is not defined

In [52]:
# Low variance features check
feature_std = df[feature_cols].std()
low_var_features = feature_std[feature_std < 1e-4].index.tolist()
print("Low Variance Features (std < 1e-4):")
print(low_var_features)

KeyError: "None of [Index(['Return_Lag_1', 'Return_Lag_5', 'Return_Lag_10', 'Return_Lag_15',\n       'Return_Lag_20', 'Volatility_10', 'Volatility_20', 'Range_Z',\n       'Volume_Z', 'MA50_Diff', 'MA200_Diff', 'RSI', 'Close_Position'],\n      dtype='object')] are in the [columns]"

In [53]:
# Define feature matrix and target
target = 'Target'
features = [col for col in df.columns if col.startswith(('Return_Lag_', 'Volatility_', 'Range_Z', 'Volume_Z', 'MA50_Diff', 'MA200_Diff', 'RSI', 'Close_Position', 'Weekday_', 'Month_'))]
X = df[features]
y = df[target]

KeyError: 'Target'

In [54]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

NameError: name 'y' is not defined

In [55]:
# Baseline model (default parameters)
baseline_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
baseline_model.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [56]:
# Predictions and evaluation
train_preds = baseline_model.predict(X_train)
test_preds = baseline_model.predict(X_test)
train_acc = accuracy_score(y_train, train_preds)
test_acc = accuracy_score(y_test, test_preds)
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

NameError: name 'X_train' is not defined

In [57]:
# ROC and AUC
probs = baseline_model.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, probs)
auc_score = auc(fpr, tpr)
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.3f}')
plt.plot([0,1],[0,1],'--',color='gray')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

NameError: name 'X_test' is not defined

In [58]:
# confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, test_preds))

Confusion Matrix:


NameError: name 'y_test' is not defined

In [59]:
# classification report
print("Classification Report:")
print(classification_report(y_test, test_preds))

Classification Report:


NameError: name 'y_test' is not defined

In [67]:
# SHAP
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values[1], X_test)

NameError: name 'best_model' is not defined

In [60]:
# Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.5],
    'reg_lambda': [1, 10],
    'reg_alpha': [0, 0.1, 1]
}

cv = TimeSeriesSplit(n_splits=5)
tuner = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                            param_distributions=param_grid,
                            scoring='roc_auc',
                            cv=cv,
                            n_iter=30,
                            random_state=42,
                            verbose=1)
tuner.fit(X_train, y_train)
best_model = tuner.best_estimator_

NameError: name 'X_train' is not defined

In [61]:
# Final tuned model evaluation
train_pred_best = best_model.predict(X_train)
test_pred_best = best_model.predict(X_test)

NameError: name 'best_model' is not defined

In [62]:
train_acc_best = accuracy_score(y_train, train_pred_best)
test_acc_best = accuracy_score(y_test, test_pred_best)
print(f"Tuned Model Training Accuracy: {train_acc_best:.4f}")
print(f"Tuned Model Test Accuracy: {test_acc_best:.4f}")

NameError: name 'y_train' is not defined

In [63]:
probs_best = best_model.predict_proba(X_test)[:, 1]
fpr_best, tpr_best, _ = roc_curve(y_test, probs_best)
auc_best = auc(fpr_best, tpr_best)

NameError: name 'best_model' is not defined

In [64]:
plt.figure(figsize=(8,6))
plt.plot(fpr_best, tpr_best, label=f'Tuned AUC = {auc_best:.3f}')
plt.plot([0,1],[0,1],'--',color='gray')
plt.title('ROC Curve (Tuned Model)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend()
plt.show()

NameError: name 'fpr_best' is not defined

<Figure size 800x600 with 0 Axes>

In [65]:
print("Tuned Model Confusion Matrix:")
print(confusion_matrix(y_test, test_pred_best))

Tuned Model Confusion Matrix:


NameError: name 'y_test' is not defined

In [66]:
print("Tuned Model Classification Report:")
print(classification_report(y_test, test_pred_best))

Tuned Model Classification Report:


NameError: name 'y_test' is not defined

In [37]:
# Step 1: Data Acquisition from CSV
try:
    df = pd.read_csv('NDX_daily.csv')
    print("Data loaded successfully from NDX_daily.csv")
    display(df.head())
except FileNotFoundError:
    print("Error: NDX_daily.csv not found. Please ensure the file is in the correct directory.")

Error: NDX_daily.csv not found. Please ensure the file is in the correct directory.
