In [None]:
import glob
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import joblib
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def polars_to_pandas(polars_df):
    """Return the result of calling ``to_pandas()`` on *polars_df*.

    :param polars_df: Object exposing a ``to_pandas()`` method (a Polars
        DataFrame in this notebook).
    :return: Whatever ``polars_df.to_pandas()`` produces.
    """
    pandas_df = polars_df.to_pandas()
    return pandas_df

def optimize_memory(df):
    """Shrink the 64-bit numeric columns of *df* and hand the frame back.

    float64 columns are passed through ``pd.to_numeric(..., downcast='float')``
    first, then int64 columns through ``downcast='integer'``. Columns of any
    other dtype are left untouched.

    :param df: Pandas DataFrame; its columns are reassigned in place.
    :return: The same DataFrame object that was passed in.
    """
    # (dtype to look for, downcast mode) -- floats are handled before ints.
    downcast_plan = (('float64', 'float'), ('int64', 'integer'))
    for wide_dtype, downcast_mode in downcast_plan:
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

def plot_feature_importance(importance_df, title='Feature Importance'):
    """Draw a bar chart of feature importances and save it as a PNG.

    :param importance_df: Table with a 'Feature' column (bar labels) and an
        'Importance' column (bar heights).
    :param title: Chart title. The output file name is derived from it:
        spaces become underscores, the text is lower-cased and '.png' is
        appended.
    :return: The file name the figure was written to.
    """
    plot_filename = f"{title.replace(' ', '_').lower()}.png"
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.bar(importance_df['Feature'], importance_df['Importance'], color='skyblue')
        plt.title(title)
        plt.xlabel('Features')
        plt.ylabel('Importance (Gain)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(plot_filename)
    finally:
        # Close the figure even when drawing or saving fails; otherwise the
        # figure stays registered with pyplot and its memory is never freed.
        plt.close(fig)
    return plot_filename

class LoadData:
    """Load a subset of partitioned Parquet files into one Polars DataFrame.

    Files are chosen by looking for a ``partition_id=<id>`` token in each
    path. The token must not be followed by another digit, so asking for
    partition 1 does not also pull in partitions 10, 11, ...
    """

    def __init__(self, file_paths, partition_ids=None):
        """
        Initialize the LoadData class.

        :param file_paths: List of all file paths.
        :param partition_ids: List of partition IDs to load. If None, load all.
        """
        self.file_paths = file_paths
        self.partition_ids = partition_ids

    @staticmethod
    def _path_has_partition(file_path, partition_id):
        """Return True if *file_path* contains exactly ``partition_id=<partition_id>``."""
        token = f'partition_id={partition_id}'
        start = file_path.find(token)
        while start != -1:
            end = start + len(token)
            # A trailing digit means the token is only a prefix of a longer
            # ID (e.g. 'partition_id=1' inside 'partition_id=10').
            if end == len(file_path) or not file_path[end].isdigit():
                return True
            start = file_path.find(token, start + 1)
        return False

    def _select_files(self):
        """Return the file paths that belong to the requested partitions."""
        if self.partition_ids is None:
            return self.file_paths
        return [
            fp for fp in self.file_paths
            if any(self._path_has_partition(fp, pid) for pid in self.partition_ids)
        ]

    def load_and_concat(self):
        """Load and concatenate Polars DataFrames from specified file paths.

        :return: One Polars DataFrame holding the rows of every selected file.
        :raises ValueError: If no file matches the requested partitions.
        """
        selected_files = self._select_files()
        if not selected_files:
            raise ValueError(
                f"No Parquet files matched partition_ids={self.partition_ids!r}"
            )

        # Load each Parquet file into a Polars DataFrame
        partitioned_data = [pl.read_parquet(file_path) for file_path in selected_files]

        # Concatenate all DataFrames into one
        df = pl.concat(partitioned_data, rechunk=False)

        # Delete individual DataFrames to free memory
        del partitioned_data
        gc.collect()

        return df

# Specify partition IDs to load (6, 7, 8, 9)
partition_ids = [6, 7, 8, 9]

# Get all file paths sorted (adjust the glob pattern as needed)
file_paths_all = sorted(glob.glob('Data/train.parquet/*/*.parquet'))

# Initialize the loader with selected partitions
loader = LoadData(file_paths=file_paths_all, partition_ids=partition_ids)

# Load and concatenate the selected partitions
df_selected = loader.load_and_concat()

# Free up memory by deleting the loader and file paths
del loader, file_paths_all
gc.collect()

# Convert to Pandas for easier manipulation
df_selected_pd = polars_to_pandas(df_selected)

# Free up memory by deleting the Polars DataFrame
del df_selected
gc.collect()

# Sort by 'date_id' ascending, then by 'time_id' ascending
df_selected_pd.sort_values(['date_id', 'time_id'], inplace=True)
df_selected_pd.reset_index(drop=True, inplace=True)

# Invoke garbage collection
gc.collect()

# Identify unique date_ids sorted in ascending order
unique_date_ids_sorted = np.sort(df_selected_pd['date_id'].unique())

# Total number of unique days
total_unique_days = len(unique_date_ids_sorted)
print(f"Total unique days in selected partitions: {total_unique_days}")

# Define the number of validation days
validation_days = 100

# Fail fast when the hold-out window does not fit inside the data. With too
# few days the cutoff index below becomes zero (empty training set) or
# negative, and a negative index either wraps around to the wrong date or
# raises an IndexError.
if not 0 < validation_days < total_unique_days:
    raise ValueError(
        f"validation_days={validation_days} must be at least 1 and smaller than "
        f"the {total_unique_days} unique days available"
    )

# Determine the cutoff index
cutoff_index = total_unique_days - validation_days

# Get the cutoff date_id
validation_start_date_id = unique_date_ids_sorted[cutoff_index]
print(f"Validation will start from date_id: {validation_start_date_id}")

# Split into training and validation sets based on date_id
train_df = df_selected_pd[df_selected_pd['date_id'] < validation_start_date_id].copy()
validation_df = df_selected_pd[df_selected_pd['date_id'] >= validation_start_date_id].copy()

# Free up memory by deleting the sorted DataFrame
del df_selected_pd
gc.collect()

# Columns that must not be used as model inputs: every 'responder_*' column
# (responder_6 is the training target below) and the sample weight
excluded_features = [col for col in train_df.columns if col.startswith('responder_')] + ['weight']

# Define feature columns
feature_cols = [col for col in train_df.columns if col not in excluded_features]

print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Optimize memory by downcasting
train_df = optimize_memory(train_df)
validation_df = optimize_memory(validation_df)

# Training set
X_train = train_df[feature_cols].astype(np.float32)
y_train = train_df['responder_6'].astype(np.float32)
w_train = train_df['weight'].astype(np.float32)

# Validation set
X_val = validation_df[feature_cols].astype(np.float32)
y_val = validation_df['responder_6'].astype(np.float32)
w_val = validation_df['weight'].astype(np.float32)

# Free up memory by deleting the DataFrames
del train_df, validation_df
gc.collect()

# Create DMatrix for training and validation
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
dval = xgb.DMatrix(X_val, label=y_val, weight=w_val)

# Nothing is deleted here: y_train/w_train and y_val/w_val are read again when
# the metrics are computed after training. Only a collection pass is run.
gc.collect()

def weighted_r2_metric(preds, dtrain):
    """Weighted R² returned as the ``(name, value)`` pair of a custom metric.

    The score is ``1 - sum(w * (y - preds)**2) / sum(w * y**2)``. The
    denominator uses the raw squared labels, not deviations from their mean.

    :param preds: Model predictions, one per row of *dtrain*.
    :param dtrain: Object providing ``get_label()`` and ``get_weight()``
        (the DMatrix being evaluated).
    :return: Tuple ``('weighted_r2', score)``.
    """
    labels = dtrain.get_label()
    weights = dtrain.get_weight()
    residual_ss = np.sum(weights * (labels - preds) ** 2)
    total_ss = np.sum(weights * labels ** 2)
    return 'weighted_r2', 1 - (residual_ss / total_ss)

# Best hyperparameters from your grid search, in the order they are logged
best_params = dict(
    # Tree shape and step size
    max_depth=4,
    learning_rate=0.09996,
    # Row / column sampling
    subsample=0.70679,
    colsample_bytree=0.73263,
    # Split and weight regularisation
    gamma=0.26034,
    min_child_weight=3,
    reg_alpha=0.43891,
    reg_lambda=0.59725,
    # Learning task
    objective='reg:squarederror',
    tree_method='hist',
    eval_metric='rmse',  # Base evaluation metric
    # Runtime settings
    n_jobs=-1,
    verbosity=1,
    seed=42,
)

In [None]:
# Set up MLflow
mlflow.set_tracking_uri("http://localhost:5000")  # Update if different
mlflow.set_experiment('Jane Street Forecasting Custom R2 Validation 6')

# Disable automatic MLflow logging to customize
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run(run_name='Final Training with Last 4 Partitions'):

    # Log hyperparameters
    mlflow.log_params(best_params)

    # Define evaluation sets; early stopping watches the last one (validation)
    evals = [(dtrain, 'train'), (dval, 'validation')]

    # Train the model, stopping once the custom weighted R² on the validation
    # set has not improved for 10 rounds (maximize=True: higher is better)
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=10,
        custom_metric=weighted_r2_metric,
        maximize=True,
        verbose_eval=10
    )

    # Log the best iteration
    mlflow.log_metric('best_iteration', model.best_iteration)

    # xgb.train hands back the booster from the last round, not the best one.
    # Restrict prediction to the trees up to and including the best round so
    # the metrics logged below describe the model that 'best_iteration' names.
    best_range = (0, model.best_iteration + 1)

    # Log custom metrics
    y_pred_val = model.predict(dval, iteration_range=best_range)
    weighted_r2_val = 1 - (np.sum(w_val * (y_val - y_pred_val) ** 2) / np.sum(w_val * y_val ** 2))
    mlflow.log_metric('validation_weighted_r2', weighted_r2_val)

    y_pred_train = model.predict(dtrain, iteration_range=best_range)
    weighted_r2_train = 1 - (np.sum(w_train * (y_train - y_pred_train) ** 2) / np.sum(w_train * y_train ** 2))
    mlflow.log_metric('training_weighted_r2', weighted_r2_train)

    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val, sample_weight=w_val))
    mlflow.log_metric('validation_rmse', rmse_val)

    # Optionally save and log the model
    # NOTE(review): joblib pickles the Booster object; presumably this ties the
    # artifact to the installed xgboost version -- confirm before sharing it.
    joblib.dump(model, 'final_model.joblib')
    mlflow.log_artifact('final_model.joblib')

    # Clean up
    del dtrain, dval, model, y_pred_val, y_pred_train, weighted_r2_val, weighted_r2_train, rmse_val, best_range
    gc.collect()

### Training the Final Model:

In [None]:
import glob
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import joblib
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def polars_to_pandas(polars_df):
    """Return the result of calling ``to_pandas()`` on *polars_df*.

    :param polars_df: Object exposing a ``to_pandas()`` method (a Polars
        DataFrame in this notebook).
    :return: Whatever ``polars_df.to_pandas()`` produces.
    """
    converted = polars_df.to_pandas()
    return converted

def optimize_memory(df):
    """Shrink the 64-bit numeric columns of *df* and hand the frame back.

    float64 columns are passed through ``pd.to_numeric(..., downcast='float')``
    first, then int64 columns through ``downcast='integer'``. Columns of any
    other dtype are left untouched.

    :param df: Pandas DataFrame; its columns are reassigned in place.
    :return: The same DataFrame object that was passed in.
    """
    # (dtype to look for, downcast mode) -- floats are handled before ints.
    for wide_dtype, downcast_mode in (('float64', 'float'), ('int64', 'integer')):
        targets = df.select_dtypes(include=[wide_dtype]).columns
        for column_name in targets:
            narrowed = pd.to_numeric(df[column_name], downcast=downcast_mode)
            df[column_name] = narrowed
    return df

def plot_feature_importance(importance_df, title='Feature Importance'):
    """Draw a bar chart of feature importances and save it as a PNG.

    :param importance_df: Table with a 'Feature' column (bar labels) and an
        'Importance' column (bar heights).
    :param title: Chart title. The output file name is derived from it:
        spaces become underscores, the text is lower-cased and '.png' is
        appended.
    :return: The file name the figure was written to.
    """
    plot_filename = f"{title.replace(' ', '_').lower()}.png"
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.bar(importance_df['Feature'], importance_df['Importance'], color='skyblue')
        plt.title(title)
        plt.xlabel('Features')
        plt.ylabel('Importance (Gain)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(plot_filename)
    finally:
        # Close the figure even when drawing or saving fails; otherwise the
        # figure stays registered with pyplot and its memory is never freed.
        plt.close(fig)
    return plot_filename

class LoadData:
    """Load a subset of partitioned Parquet files into one Polars DataFrame.

    Files are chosen by looking for a ``partition_id=<id>`` token in each
    path. The token must not be followed by another digit, so asking for
    partition 1 does not also pull in partitions 10, 11, ...
    """

    def __init__(self, file_paths, partition_ids=None):
        """
        Initialize the LoadData class.

        :param file_paths: List of all file paths.
        :param partition_ids: List of partition IDs to load. If None, load all.
        """
        self.file_paths = file_paths
        self.partition_ids = partition_ids

    @staticmethod
    def _path_has_partition(file_path, partition_id):
        """Return True if *file_path* contains exactly ``partition_id=<partition_id>``."""
        token = f'partition_id={partition_id}'
        start = file_path.find(token)
        while start != -1:
            end = start + len(token)
            # A trailing digit means the token is only a prefix of a longer
            # ID (e.g. 'partition_id=1' inside 'partition_id=10').
            if end == len(file_path) or not file_path[end].isdigit():
                return True
            start = file_path.find(token, start + 1)
        return False

    def _select_files(self):
        """Return the file paths that belong to the requested partitions."""
        if self.partition_ids is None:
            return self.file_paths
        return [
            fp for fp in self.file_paths
            if any(self._path_has_partition(fp, pid) for pid in self.partition_ids)
        ]

    def load_and_concat(self):
        """Load and concatenate Polars DataFrames from specified file paths.

        :return: One Polars DataFrame holding the rows of every selected file.
        :raises ValueError: If no file matches the requested partitions.
        """
        selected_files = self._select_files()
        if not selected_files:
            raise ValueError(
                f"No Parquet files matched partition_ids={self.partition_ids!r}"
            )

        # Load each Parquet file into a Polars DataFrame
        partitioned_data = [pl.read_parquet(file_path) for file_path in selected_files]

        # Concatenate all DataFrames into one
        df = pl.concat(partitioned_data, rechunk=False)

        # Delete individual DataFrames to free memory
        del partitioned_data
        gc.collect()

        return df

# Partitions used for the final fit (6, 7, 8, 9); no validation split is made
# in this cell -- every loaded row ends up in the training DMatrix.
partition_ids = [6, 7, 8, 9]

# Collect every Parquet file one directory level below Data/train.parquet,
# in sorted path order (adjust the glob pattern as needed)
file_paths_all = sorted(glob.glob('Data/train.parquet/*/*.parquet'))

# Initialize the loader with selected partitions
loader = LoadData(file_paths=file_paths_all, partition_ids=partition_ids)

# Load and concatenate the selected partitions
df_selected = loader.load_and_concat()

# Free up memory by deleting the loader and file paths
del loader, file_paths_all
gc.collect()

# Convert to Pandas for easier manipulation
df_selected_pd = polars_to_pandas(df_selected)

# Free up memory by deleting the Polars DataFrame
del df_selected
gc.collect()

# Sort by 'date_id' ascending, then by 'time_id' ascending,
# and renumber the index to match the new row order
df_selected_pd.sort_values(['date_id', 'time_id'], inplace=True)
df_selected_pd.reset_index(drop=True, inplace=True)

# Invoke garbage collection
gc.collect()

# Columns that must not be used as model inputs: every 'responder_*' column
# (responder_6 is the training target below) and the sample weight
excluded_features = [col for col in df_selected_pd.columns if col.startswith('responder_')] + ['weight']

# Every remaining column is a feature
# NOTE(review): this keeps 'date_id' and 'time_id' as model inputs -- confirm
# that is intended.
feature_cols = [col for col in df_selected_pd.columns if col not in excluded_features]

print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Optimize memory by downcasting
df_selected_pd = optimize_memory(df_selected_pd)

# Extract features, target, and weights, all cast to float32
X = df_selected_pd[feature_cols].astype(np.float32)
y = df_selected_pd['responder_6'].astype(np.float32)
w = df_selected_pd['weight'].astype(np.float32)

# Free up memory by deleting the DataFrame
del df_selected_pd
gc.collect()

# Create DMatrix for training
dtrain = xgb.DMatrix(X, label=y, weight=w)

# Free up memory by deleting the Pandas objects; labels and weights are read
# back from dtrain (get_label / get_weight) when metrics are computed later
del X, y, w
gc.collect()

def weighted_r2_metric(preds, dtrain):
    """Weighted R² returned as the ``(name, value)`` pair of a custom metric.

    The score is ``1 - sum(w * (y - preds)**2) / sum(w * y**2)``. The
    denominator uses the raw squared labels, not deviations from their mean.

    :param preds: Model predictions, one per row of *dtrain*.
    :param dtrain: Object providing ``get_label()`` and ``get_weight()``
        (the DMatrix being evaluated).
    :return: Tuple ``('weighted_r2', score)``.
    """
    targets = dtrain.get_label()
    sample_weights = dtrain.get_weight()
    squared_error = np.sum(sample_weights * (targets - preds) ** 2)
    squared_target = np.sum(sample_weights * targets ** 2)
    score = 1 - (squared_error / squared_target)
    return 'weighted_r2', score

# Best hyperparameters from your grid search. 'n_estimators' is deliberately
# absent: the number of rounds is given to xgb.train via num_boost_round.
best_params = dict(
    # Tree shape and step size
    max_depth=4,
    learning_rate=0.09996,
    # Row / column sampling
    subsample=0.70679,
    colsample_bytree=0.73263,
    # Split and weight regularisation
    gamma=0.26034,
    min_child_weight=3,
    reg_alpha=0.43891,
    reg_lambda=0.59725,
    # Learning task
    objective='reg:squarederror',
    tree_method='hist',
    eval_metric='rmse',  # Base evaluation metric
    # Runtime settings
    n_jobs=-1,
    verbosity=1,
    seed=42,
)

# Set up MLflow
mlflow.set_tracking_uri("http://localhost:5000")  # Update if different
mlflow.set_experiment('Jane Street Forecasting Final Model 3')

# Disable automatic MLflow logging to customize
mlflow.xgboost.autolog(disable=True)

# Debugging: Check if necessary variables are defined
print(f"dtrain defined: {'dtrain' in locals() or 'dtrain' in globals()}")

# Number of boosting rounds for the final fit. Kept in one place because the
# same value is passed to xgb.train and logged to MLflow as 'best_iteration'.
# NOTE(review): Booster.best_iteration is 0-based, so reproducing a run whose
# best_iteration was 75 would presumably need 76 rounds -- confirm which one
# this number stands for.
final_num_boost_round = 75

with mlflow.start_run(run_name='Final Model Training on All Data'):

    # Log hyperparameters (excluding 'n_estimators')
    mlflow.log_params(best_params)

    # Train the final model without validation.
    # NOTE(review): no `evals` are passed, so custom_metric, maximize and
    # verbose_eval presumably have no effect here -- confirm against the
    # xgb.train documentation.
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=final_num_boost_round,
        custom_metric=weighted_r2_metric,  # Custom metric
        maximize=True,  # Because higher R² is better
        verbose_eval=10
    )

    # Log the number of rounds under the 'best_iteration' metric name
    mlflow.log_metric('best_iteration', final_num_boost_round)

    # Save and log the model using joblib
    joblib.dump(model, 'final_model.joblib')
    mlflow.log_artifact('final_model.joblib')

    # Extract feature importance (gain), most important feature first
    importance = model.get_score(importance_type='gain')
    importance_df = pd.DataFrame({
        'Feature': list(importance.keys()),
        'Importance': list(importance.values())
    }).sort_values(by='Importance', ascending=False)

    # Only the top N features are written to the CSV and the plot
    top_n = 15
    importance_df_top = importance_df.head(top_n)

    # Save feature importance to a CSV file
    importance_df_top.to_csv('feature_importance.csv', index=False)
    mlflow.log_artifact('feature_importance.csv')

    # Plot and log feature importance
    plot_filename = plot_feature_importance(importance_df_top, title='Feature Importance - Final Model')
    mlflow.log_artifact(plot_filename, artifact_path='feature_importance_plots')

    # Compute and log the custom weighted R² on the entire training set.
    # X, y and w were deleted after building dtrain, so labels and weights
    # are read back from the DMatrix.
    y_pred = model.predict(dtrain)
    y_true = dtrain.get_label()
    w = dtrain.get_weight()
    weighted_r2 = 1 - (np.sum(w * (y_true - y_pred) ** 2) / np.sum(w * y_true ** 2))
    print(f"Training Weighted R²: {weighted_r2:.4f}")
    mlflow.log_metric('training_weighted_r2', weighted_r2)

    # Compute weighted RMSE for training set
    rmse = np.sqrt(np.average((y_true - y_pred) ** 2, weights=w))
    print(f"Training RMSE: {rmse:.4f}")
    mlflow.log_metric('training_rmse', rmse)

    # Residual Analysis
    residuals = y_true - y_pred
    residual_plot = 'residuals_training.png'
    fig = plt.figure(figsize=(10, 6))
    try:
        plt.scatter(y_pred, residuals, alpha=0.3)
        plt.axhline(0, color='red', linestyle='--')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals vs. Predicted Values - Training Set')
        plt.tight_layout()
        plt.savefig(residual_plot)
    finally:
        # Close the figure even if drawing or saving fails, so it does not
        # stay registered with pyplot.
        plt.close(fig)
    mlflow.log_artifact(residual_plot, artifact_path='residual_plots')

    gc.collect()

In [None]:
# Persist the booster trained in the previous cell with XGBoost's own
# save_model, in addition to the joblib artifact written during the run.
# Presumably the '.json' extension selects the JSON model format -- confirm
# against the installed xgboost version.
model.save_model('customr2_model.json')