# 3. Modeling, evaluation and MVP dashboard

### Objectives
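Create, train and evaluate an XGBoost classifier that predicts the weekly sentiment label (bearish/bullish) from the engineered features.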

### Inputs
`output/nb2_market_index_with_historical_and_features.feather`

### Outputs
`output/scaler.pkl`
`output/xgb_model.pkl`

### Notes/comments

---

In [None]:
# Import modules
import numpy as np
import pandas as pd
import panel as pn
import xgboost as xgb
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import hvplot.pandas


In [None]:
# Read the input feather file (see "Inputs" above) into a DataFrame.
preprocessed_df = pd.read_feather(
    "../output/nb2_market_index_with_historical_and_features.feather"
)

In [None]:
# Build the modeling table: discard every row that has a missing value, then
# move the index into a regular column so the rows are renumbered 0..n-1.
# dropna() already returns a new object, so the source frame is left untouched.
model_df = preprocessed_df.dropna().reset_index()
# An unnamed index comes out of reset_index() as a column called "index".
model_df = model_df.rename(columns={"index": "Date"})
model_df["Date"] = pd.to_datetime(model_df["Date"])
# Display the last remaining row as the cell output.
model_df.tail(1)

In [None]:
# Columns fed to the model, and the label column it learns to predict.
feature_cols = [
    "Has_Event",
    "SMA_4",
    "RSI_4",
    "MACD",
    "MACD_Signal",
    "BB_Width",
]
X = model_df.loc[:, feature_cols]
y = model_df.loc[:, "Sentiment_Label"]

In [None]:
# Hold-out split by row position: the first 80% of rows are used for training
# and the remaining rows for testing. No shuffling is done, so the split is
# chronological only if the rows are stored in date order — TODO confirm.
split_index = int(0.8 * len(model_df))
train_df, test_df = model_df.iloc[:split_index], model_df.iloc[split_index:]

X_train, y_train = train_df[feature_cols], train_df["Sentiment_Label"]
X_test, y_test = test_df[feature_cols], test_df["Sentiment_Label"]


In [None]:
# Standardize the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Persist the fitted scaler to the output folder.
joblib.dump(value=scaler, filename="../output/scaler.pkl")

In [None]:
# Configure the XGBoost classifier and fit it on the scaled training data.
xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,  # fixed seed
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train_scaled, y_train)


In [None]:
# Persist the trained classifier to the output folder.
joblib.dump(value=model, filename="../output/xgb_model.pkl")

In [None]:
# Predict on Test Set Only
# Class predictions for the held-out rows; the training rows are not scored.
# The input is the scaled test matrix, matching how the model was fitted.
y_pred = model.predict(X_test_scaled)


In [None]:
# Attach the predictions to model_df. The column starts as all-NaN and only
# the held-out rows (position split_index onward) receive a predicted label,
# so training rows stay NaN.
test_rows = model_df.index[split_index:]
model_df["Predicted_Label"] = np.nan
model_df.loc[test_rows, "Predicted_Label"] = y_pred

In [None]:
# Per-class probabilities for the held-out rows; keep only column 1, which is
# the probability of class 1 (bullish).
test_probabilities = model.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]


In [None]:
# Store in the test portion of model_df
# "Confidence_Score" is the class-1 (bullish) probability for each held-out
# row, not the probability of whichever class was predicted. The column is
# created by this assignment, so rows before split_index are left as NaN.
model_df.loc[split_index:, "Confidence_Score"] = y_proba


In [None]:
# Mark correctness
# Flag each held-out row as correctly (True) or incorrectly (False)
# classified; training rows keep NaN.
#
# The column is created with object dtype on purpose: initialising it as a
# plain float64 NaN column and then writing booleans into it is an
# incompatible-dtype assignment, which pandas 2.1+ reports with a
# FutureWarning and later versions reject. The resulting values
# (NaN / True / False) are the same as before.
model_df["Correct"] = pd.Series(np.nan, index=model_df.index, dtype=object)
model_df.loc[split_index:, "Correct"] = (
    model_df.loc[split_index:, "Predicted_Label"] == model_df.loc[split_index:, "Sentiment_Label"]
)


In [None]:
# Accuracy & Classification Report
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
# HTML snippet carrying the formatted accuracy figure.
accuracy_text = f"""<span style="color:green; font-weight:bold; font-size:16px;">
✅ Model Accuracy (on unseen data): {accuracy:.2%}
</span>"""
# Pin the label set to 0 (bearish) and 1 (bullish). Without it, a test window
# in which only one class occurs makes the two target_names mismatch the
# detected classes and classification_report raises ValueError.
# zero_division=0 reports undefined precision/recall as 0 without a warning.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["Bearish", "Bullish"],
    zero_division=0,
)


In [None]:
# Predict next week
# Get the most recent week of features
# Check the date range set in notebook 1
#
# Only the feature columns have to be complete for inference. Dropping rows
# with a NaN in *any* column would also discard the latest row whenever a
# non-feature column (for example the label) is still missing there, and the
# prediction would then silently be based on an older week.
X_next = preprocessed_df.dropna(subset=feature_cols).tail(1)[feature_cols]
# Reuse the scaler fitted on the training data so the inputs match the model.
X_next_scaled = scaler.transform(X_next)

# Predict next week
next_label = model.predict(X_next_scaled)[0]
next_proba = model.predict_proba(X_next_scaled)[0][1]  # Probability of bullish

# Format
# NOTE: confidence_str is always the bullish-class probability, including
# when the predicted label is bearish.
label_str = "📈 Bullish" if next_label == 1 else "📉 Bearish"
confidence_str = f"{next_proba:.2%}"