# Baseline Model Training

This notebook trains a baseline XGBoost model for predicting stock returns.

In [None]:
import os
import sys
from datetime import datetime, timedelta

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Add src directory to path.
# Guarded so that re-running this cell does not keep appending the same entry.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Import local modules (resolved through the src directory added above)
from models.data_loader import load_training_data, prepare_features, get_feature_label_split, train_test_split
from models.train_baseline import train_xgboost_model, evaluate_model, save_model

# Set up plotting: seaborn grid style and a default figure size for all plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

## 1. Load and Prepare Data

First, let's load the data and prepare it for training.

In [None]:
# Experiment configuration
symbols = ['AAPL', 'MSFT', 'GOOGL']
start_date = '2025-04-01'
end_date = '2025-04-17'
label_horizon = 1  # target is the next period's return
label_type = 'return'  # accepted values: 'return' or 'direction'

# Collect the loader arguments in one place, then fetch the training frame
load_kwargs = dict(
    start_date=start_date,
    end_date=end_date,
    symbols=symbols,
    use_feast=False,  # switch to True when reading from Feast
    label_horizon=label_horizon,
    label_type=label_type,
)
df = load_training_data(**load_kwargs)

In [None]:
# Summarize the loaded frame: size, columns and the symbols it covers
n_records = len(df)
print(f"Loaded {n_records} records")

column_names = df.columns.tolist()
print(f"Columns: {column_names}")

symbol_list = df['symbol'].unique().tolist()
print(f"Symbols: {symbol_list}")

# The time span is only reported when a timestamp column is present
has_timestamp = 'timestamp' in df.columns
if has_timestamp:
    first_ts = df['timestamp'].min()
    last_ts = df['timestamp'].max()
    print(f"Date range: {first_ts} to {last_ts}")

# Preview the first rows (rendered as this cell's output)
df.head()

In [None]:
# Count nulls per column and show only the columns that have any
missing_values = df.isna().sum()
print("Missing values:")
missing_values.loc[missing_values.gt(0)]

In [None]:
# Check label distribution.
# Use the non-null labels throughout so that the histogram, the summary
# statistics and the positive-return share are all computed over the same rows.
labels = df['label'].dropna()

plt.figure(figsize=(10, 6))
plt.hist(labels, bins=50)
plt.title('Label Distribution')
plt.xlabel('Return')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Print label statistics
print("Label statistics:")
print(labels.describe())

# Print percentage of positive returns.
# NaN > 0 evaluates to False, so averaging over the raw column would count
# rows with a missing label as non-positive and understate the share.
positive_returns = (labels > 0).mean() * 100
print(f"Percentage of positive returns: {positive_returns:.2f}%")

## 2. Split Data into Train, Validation, and Test Sets

We'll use a time-based split to avoid look-ahead bias.

In [None]:
# Put the rows in chronological order before splitting.
# NOTE(review): with shuffle=False the split is presumably positional (confirm
# against models.data_loader.train_test_split), so it is only time-based if the
# rows are already in time order. With several symbols in one frame that is not
# guaranteed, so the ordering is made explicit here. A stable sort keeps the
# existing relative order of rows that share a timestamp.
if 'timestamp' in df.columns:
    df_time_sorted = df.sort_values('timestamp', kind='stable')
else:
    # No timestamp column to sort on: fall back to the loader's row order
    df_time_sorted = df

# Split data
data_splits = train_test_split(
    df_time_sorted,
    test_size=0.2,
    validation_size=0.1,
    shuffle=False  # Time-based split
)

train_df = data_splits['train']
val_df = data_splits['validation']
test_df = data_splits['test']

print(f"Train set: {len(train_df)} records")
print(f"Validation set: {len(val_df)} records")
print(f"Test set: {len(test_df)} records")

In [None]:
# Run the same feature/label separation over each of the three partitions
train_data, val_data, test_data = (
    get_feature_label_split(partition, label_column="label")
    for partition in (train_df, val_df, test_df)
)

# Unpack the "X" (features) and "y" (labels) entries of each result
X_train = train_data["X"]
y_train = train_data["y"]
X_val = val_data["X"]
y_val = val_data["y"]
X_test = test_data["X"]
y_test = test_data["y"]

feature_list = X_train.columns.tolist()
print(f"Features: {feature_list}")

## 3. Feature Analysis

Let's analyze the features to understand their distributions and relationships.

In [None]:
# Correlate every training feature with the label, one (feature, value) row each
corr_rows = []
for col in X_train.columns:
    corr_rows.append((col, X_train[col].corr(y_train)))

feature_corr = pd.DataFrame(corr_rows, columns=['feature', 'correlation'])
feature_corr = feature_corr.sort_values(by='correlation', ascending=False)

# Horizontal bar chart, strongest positive correlation at the top
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='correlation', y='feature', data=feature_corr, ax=ax)
ax.set_title('Feature Correlation with Label')
ax.set_xlabel('Correlation')
ax.set_ylabel('Feature')
ax.grid(True)
plt.show()

In [None]:
# Pairwise correlations between the training features
corr_matrix = X_train.corr()

# Hide the upper triangle (diagonal included); the matrix is symmetric
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    mask=upper_mask,
    annot=False,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## 4. Train XGBoost Model

Now, let's train an XGBoost model to predict stock returns.

In [None]:
# Hyperparameters handed to the training helper
params = dict(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# Fit on the training split, passing the validation split alongside it.
# The helper returns the fitted model together with its training metrics.
training_result = train_xgboost_model(
    X_train, y_train,
    X_val, y_val,
    params=params,
    use_wandb=False,  # switch to True to log to Weights & Biases
)
model, train_metrics = training_result

In [None]:
# Pair each training column with the model's importance score, highest first
importance_df = pd.DataFrame(
    {
        "Feature": X_train.columns,
        "Importance": model.feature_importances_,
    }
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x="Importance", y="Feature", data=importance_df, ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.grid(True)
plt.show()

## 5. Evaluate Model on Test Set

Let's evaluate the model on the test set to see how well it generalizes.

In [None]:
# Evaluate on test set.
# The returned metrics are kept in `test_metrics`; the save cell at the end of
# the notebook merges them with `train_metrics` into the model metadata.
# NOTE(review): use_wandb=False presumably turns off Weights & Biases logging,
# as it does for train_xgboost_model; confirm in models.train_baseline.
test_metrics = evaluate_model(model, X_test, y_test, use_wandb=False)

In [None]:
# Plot actual vs predicted
preds = model.predict(X_test)
results_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": preds
})

# Span the y = x reference line over the observed value range instead of a
# fixed [-0.1, 0.1]: a hard-coded range either stretches the axes far beyond
# the data (small returns) or stops short of it (large returns).
axis_min = min(results_df["Actual"].min(), results_df["Predicted"].min())
axis_max = max(results_df["Actual"].max(), results_df["Predicted"].max())

plt.figure(figsize=(10, 8))
plt.scatter(results_df["Actual"], results_df["Predicted"], alpha=0.5)
plt.plot([axis_min, axis_max], [axis_min, axis_max], 'r--')
plt.title("Actual vs Predicted Returns")
plt.xlabel("Actual Return")
plt.ylabel("Predicted Return")
plt.grid(True)
plt.show()

In [None]:
# Residual per test row: actual minus predicted
results_df["Error"] = results_df["Actual"].sub(results_df["Predicted"])

# Histogram of the residuals
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(results_df["Error"], bins=50)
ax.set_title("Prediction Error Distribution")
ax.set_xlabel("Error")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

# Summary statistics of the residuals
error_summary = results_df["Error"].describe()
print("Error statistics:")
print(error_summary)

In [None]:
# Analyze directional accuracy.
# A value > 0 is encoded as 1 ("Up"), anything else as 0 ("Down").
results_df["Actual_Direction"] = (results_df["Actual"] > 0).astype(int)
results_df["Predicted_Direction"] = (results_df["Predicted"] > 0).astype(int)
results_df["Direction_Match"] = (results_df["Actual_Direction"] == results_df["Predicted_Direction"]).astype(int)

directional_accuracy = results_df["Direction_Match"].mean() * 100
print(f"Directional Accuracy: {directional_accuracy:.2f}%")

# Confusion matrix for directional prediction.
# labels=[0, 1] pins the matrix to 2x2 even when only one direction occurs in
# the test set; without it the matrix shrinks and no longer matches the two
# display labels below.
cm = confusion_matrix(
    results_df["Actual_Direction"],
    results_df["Predicted_Direction"],
    labels=[0, 1],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Down", "Up"])

# Hand the display our own axes: disp.plot() opens a new figure when no ax is
# given, which would leave a separately created plt.figure() empty.
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix for Directional Prediction")
plt.show()

## 6. Save Model

Let's save the trained model for later use in backtesting.

In [None]:
# Ensure the output directory is there before writing
os.makedirs("../models", exist_ok=True)

# Destination of the serialized model
output_path = "../models/baseline_xgb.pkl"

# Merge training and test metrics; on a duplicate key the test value wins
all_metrics = dict(train_metrics)
all_metrics.update(test_metrics)

# Time span of the loaded data, when a timestamp column is available
if "timestamp" in df.columns:
    date_range = [df["timestamp"].min(), df["timestamp"].max()]
else:
    date_range = None

# Sizes of the three splits plus the symbols and period they cover
data_info = {
    "n_train": len(X_train),
    "n_val": len(X_val),
    "n_test": len(X_test),
    "symbols": df["symbol"].unique().tolist(),
    "date_range": date_range,
}

# Everything stored alongside the model
metadata = {
    "features": X_train.columns.tolist(),
    "metrics": all_metrics,
    "params": params,
    "data_info": data_info,
}

save_model(model, output_path, metadata)
print(f"Model saved to {output_path}")

## 7. Summary and Next Steps

We've trained a baseline XGBoost model to predict stock returns. The model's directional accuracy on the test set is printed in Section 5 and depends on the data loaded for the run; compare it against the 50% expected from random guessing before drawing conclusions.

Next steps:
1. Run a backtest to evaluate the model's performance in a simulated trading environment
2. Experiment with different feature sets and model hyperparameters
3. Implement more sophisticated models (e.g., deep learning models)
4. Incorporate alternative data sources (e.g., sentiment analysis from news and social media)