In [1]:
import glob
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import joblib
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def polars_to_pandas(polars_df):
    """Return the pandas equivalent of ``polars_df``.

    The conversion is delegated entirely to the object's own ``to_pandas()``
    method; nothing else is done to the result.
    """
    converted = polars_df.to_pandas()
    return converted

def optimize_memory(df):
    """Shrink 64-bit numeric columns of ``df`` in place and return ``df``.

    ``float64`` columns are processed first, then ``int64`` columns; each one
    is passed through ``pd.to_numeric`` with the matching ``downcast`` mode.
    Columns of any other dtype are left untouched.
    """
    downcast_plan = (
        ('float64', 'float'),
        ('int64', 'integer'),
    )
    for wide_dtype, downcast_mode in downcast_plan:
        # Column list is resolved per pass, after the previous pass finished.
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

def plot_feature_importance(importance_df, title='Feature Importance'):
    """Render a bar chart of feature importances and write it to a PNG.

    :param importance_df: Table exposing ``'Feature'`` and ``'Importance'``
        columns (bar labels and bar heights respectively).
    :param title: Chart title; also the source of the output file name
        (spaces become underscores, the text is lower-cased, ``.png`` is
        appended).
    :return: The file name the figure was saved under.
    """
    fig = plt.figure(figsize=(10, 6))

    labels = importance_df['Feature']
    heights = importance_df['Importance']
    plt.bar(labels, heights, color='skyblue')

    # Titles and axis decoration.
    plt.title(title)
    plt.xlabel('Features')
    plt.ylabel('Importance (Gain)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    stem = title.replace(' ', '_').lower()
    plot_filename = f"{stem}.png"
    plt.savefig(plot_filename)

    # The figure created above is still the current one; close it explicitly.
    plt.close(fig)
    return plot_filename

class LoadData:
    """Load a set of Parquet files into one Polars DataFrame.

    Files can optionally be restricted to chosen partitions. A file belongs
    to partition ``pid`` when one of its path components is exactly
    ``partition_id=<pid>``.
    """

    def __init__(self, file_paths, partition_ids=None):
        """
        Initialize the LoadData class.

        :param file_paths: List of all file paths.
        :param partition_ids: List of partition IDs to load. If None, load all.
        """
        self.file_paths = file_paths
        self.partition_ids = partition_ids

    def _select_files(self):
        """Return the subset of ``file_paths`` matching ``partition_ids``.

        Matching is done on whole path components rather than substrings, so
        requesting partition 1 does not also pick up partitions 10, 11, ...
        Both ``/`` and ``\\`` are accepted as separators.
        """
        if self.partition_ids is None:
            return self.file_paths

        wanted = {f'partition_id={pid}' for pid in self.partition_ids}
        return [
            fp for fp in self.file_paths
            if not wanted.isdisjoint(str(fp).replace('\\', '/').split('/'))
        ]

    def load_and_concat(self):
        """Load and concatenate Polars DataFrames from specified file paths.

        :return: One Polars DataFrame holding the rows of every selected file,
            in the order of ``file_paths``.
        :raises ValueError: If no file matches the requested partitions.
        """
        selected_files = self._select_files()
        if not selected_files:
            # Fail with an actionable message instead of an error from
            # deep inside the concatenation call.
            raise ValueError(
                f"No Parquet files matched partition_ids={self.partition_ids!r}"
            )

        # Load each Parquet file into a Polars DataFrame
        partitioned_data = [pl.read_parquet(file_path) for file_path in selected_files]

        # Concatenate all DataFrames into one
        df = pl.concat(partitioned_data, rechunk=False)

        # Drop the list of per-file frames before returning
        del partitioned_data
        gc.collect()

        return df

# Specify partition IDs to load (8, 9)
partition_ids = [8, 9]

# Get all file paths sorted (adjust the glob pattern as needed)
file_paths_all = sorted(glob.glob('Data/train.parquet/*/*.parquet'))

# Initialize the loader with selected partitions
loader = LoadData(file_paths=file_paths_all, partition_ids=partition_ids)

# Load and concatenate the selected partitions
df_selected = loader.load_and_concat()

# Free up memory by deleting the loader and file paths
del loader, file_paths_all
gc.collect()

# Convert to Pandas for easier manipulation
df_selected_pd = polars_to_pandas(df_selected)

# Free up memory by deleting the Polars DataFrame
del df_selected
gc.collect()

# Sort by 'date_id' ascending, then by 'time_id' ascending
df_selected_pd.sort_values(['date_id', 'time_id'], inplace=True)
df_selected_pd.reset_index(drop=True, inplace=True)

# Invoke garbage collection
gc.collect()

# Identify unique date_ids sorted in ascending order
unique_date_ids_sorted = np.sort(df_selected_pd['date_id'].unique())

# Total number of unique days
total_unique_days = len(unique_date_ids_sorted)
print(f"Total unique days in selected partitions: {total_unique_days}")

# Define the number of validation days
validation_days = 100

# Without this guard a too-large validation_days yields a negative index that
# silently wraps around (wrong split), and an equal value yields an empty
# training set.
if not 0 < validation_days < total_unique_days:
    raise ValueError(
        f"validation_days={validation_days} must be between 1 and "
        f"{total_unique_days - 1} (number of unique days minus one)"
    )

# Index of the first validation day within the sorted unique date_ids
cutoff_index = total_unique_days - validation_days

# Get the cutoff date_id
validation_start_date_id = unique_date_ids_sorted[cutoff_index]
print(f"Validation will start from date_id: {validation_start_date_id}")

# Split into training and validation sets based on date_id:
# everything strictly before the cutoff trains, the rest validates.
train_df = df_selected_pd[df_selected_pd['date_id'] < validation_start_date_id].copy()
validation_df = df_selected_pd[df_selected_pd['date_id'] >= validation_start_date_id].copy()

# Free up memory by deleting the sorted DataFrame
del df_selected_pd
gc.collect()

# Columns that must not be fed to the model: every responder_* column
# (one of them is the target) and the sample weight.
excluded_features = [col for col in train_df.columns if col.startswith('responder_')] + ['weight']

# Define feature columns
feature_cols = [col for col in train_df.columns if col not in excluded_features]

print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Optimize memory by downcasting
train_df = optimize_memory(train_df)
validation_df = optimize_memory(validation_df)

# Training set
X_train = train_df[feature_cols].astype(np.float32)
y_train = train_df['responder_6'].astype(np.float32)
w_train = train_df['weight'].astype(np.float32)

# Validation set
X_val = validation_df[feature_cols].astype(np.float32)
y_val = validation_df['responder_6'].astype(np.float32)
w_val = validation_df['weight'].astype(np.float32)

# Free up memory by deleting the DataFrames
del train_df, validation_df
gc.collect()

# Create DMatrix for training and validation
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
dval = xgb.DMatrix(X_val, label=y_val, weight=w_val)

# The feature frames are no longer needed once the DMatrix objects exist.
# Labels and weights (y_*, w_*) are kept: the metric code below reads them.
del X_train, X_val
gc.collect()

def weighted_r2_metric(preds, dtrain):
    """Weighted R² evaluation callback.

    Computes ``1 - sum(w * (y - preds)^2) / sum(w * y^2)``; note the
    denominator uses the raw labels, not labels centred on their mean.

    :param preds: Predictions aligned with the labels of ``dtrain``.
    :param dtrain: Object exposing ``get_label()`` and ``get_weight()``.
    :return: ``('weighted_r2', score)``.
    """
    labels = dtrain.get_label()
    weights = dtrain.get_weight()

    squared_error = (labels - preds) ** 2
    residual_ss = np.sum(weights * squared_error)
    total_ss = np.sum(weights * labels ** 2)

    score = 1 - (residual_ss / total_ss)
    return 'weighted_r2', score

# Fixed XGBoost configuration for this run (values chosen by an earlier
# grid search, 16th combination). Squared-error regression with the 'hist'
# tree method; RMSE is reported alongside the custom metric passed to train().
best_params = {
    'max_depth': 6,
    'learning_rate': 0.0126,
    'subsample': 0.6919,
    'colsample_bytree': 0.6527,
    'gamma': 0.3388,
    'min_child_weight': 1,
    'reg_alpha': 0.1218,
    'reg_lambda': 2.7785,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'n_jobs': -1,
    'verbosity': 1,
    'seed': 42
}

# Point MLflow at the tracking server and select the experiment for this run
mlflow.set_tracking_uri("http://localhost:5000")  # Update if different
mlflow.set_experiment('JSF 8, 9 Last Two Partitions')

# Autologging is switched off; everything recorded below is logged explicitly
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run(run_name='Final Training with all Partitions'):

    # Log hyperparameters
    mlflow.log_params(best_params)
    
    # Evaluate on both sets each round; 'validation' is listed last
    evals = [(dtrain, 'train'), (dval, 'validation')]
    
    # Train with early stopping (patience 10 rounds, at most 1000 rounds).
    # NOTE(review): maximize=True presumably applies to the custom
    # weighted_r2 on the last eval set ('validation') — confirm against the
    # XGBoost early-stopping documentation.
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=evals,
        early_stopping_rounds=10,
        custom_metric=weighted_r2_metric,
        maximize=True,
        verbose_eval=10
    )
    
    # Log the best iteration
    mlflow.log_metric('best_iteration', model.best_iteration)
    
    # Weighted R² on the validation set, same formula as weighted_r2_metric:
    # 1 - sum(w * (y - pred)^2) / sum(w * y^2)
    y_pred_val = model.predict(dval)
    weighted_r2_val = 1 - (np.sum(w_val * (y_val - y_pred_val) ** 2) / np.sum(w_val * y_val ** 2))
    mlflow.log_metric('validation_weighted_r2', weighted_r2_val)
    
    # Same score on the training set, for comparison with validation
    y_pred_train = model.predict(dtrain)
    weighted_r2_train = 1 - (np.sum(w_train * (y_train - y_pred_train) ** 2) / np.sum(w_train * y_train ** 2))
    mlflow.log_metric('training_weighted_r2', weighted_r2_train)
    
    # Sample-weighted RMSE on the validation set
    rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val, sample_weight=w_val))
    mlflow.log_metric('validation_rmse', rmse_val)
    
    # Serialize the trained model to disk and attach the file to the run
    joblib.dump(model, 'final_model.joblib')
    mlflow.log_artifact('final_model.joblib')
    
    # Clean up: the model and both DMatrix objects are removed here, so they
    # are not available after this block finishes
    del dtrain, dval, model, y_pred_val, y_pred_train, weighted_r2_val, weighted_r2_train, rmse_val
    gc.collect()

Total unique days in selected partitions: 339
Validation will start from date_id: 1599
Number of features: 82
Features: ['date_id', 'time_id', 'symbol_id', 'feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49', 'feature_50', 'feature_51', 'feature_52', 'feature_53', 'feature_54', 'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59', 'fea

2024/12/02 21:30:33 INFO mlflow.tracking.fluent: Experiment with name 'JSF 8, 9 Last Two Partitions' does not exist. Creating a new experiment.


[0]	train-rmse:0.82225	train-weighted_r2:0.00029	validation-rmse:0.79303	validation-weighted_r2:0.00009
[10]	train-rmse:0.82119	train-weighted_r2:0.00285	validation-rmse:0.79272	validation-weighted_r2:0.00086
[20]	train-rmse:0.82027	train-weighted_r2:0.00510	validation-rmse:0.79247	validation-weighted_r2:0.00150
[30]	train-rmse:0.81945	train-weighted_r2:0.00707	validation-rmse:0.79225	validation-weighted_r2:0.00206
[40]	train-rmse:0.81871	train-weighted_r2:0.00886	validation-rmse:0.79206	validation-weighted_r2:0.00254
[50]	train-rmse:0.81803	train-weighted_r2:0.01052	validation-rmse:0.79189	validation-weighted_r2:0.00295
[60]	train-rmse:0.81743	train-weighted_r2:0.01196	validation-rmse:0.79176	validation-weighted_r2:0.00328
[70]	train-rmse:0.81689	train-weighted_r2:0.01327	validation-rmse:0.79164	validation-weighted_r2:0.00360
[80]	train-rmse:0.81639	train-weighted_r2:0.01449	validation-rmse:0.79153	validation-weighted_r2:0.00386
[90]	train-rmse:0.81593	train-weighted_r2:0.01559	valida

2024/12/02 21:38:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run Final Training with all Partitions at: http://localhost:5000/#/experiments/37/runs/88b557e7ee2344bfbd85b91a528960e3.
2024/12/02 21:38:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/37.


### Training the Final Model:

In [None]:
import glob
import polars as pl
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import joblib
import gc
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def polars_to_pandas(polars_df):
    """Return the pandas equivalent of ``polars_df``.

    The conversion is delegated entirely to the object's own ``to_pandas()``
    method; nothing else is done to the result.
    """
    converted = polars_df.to_pandas()
    return converted

def optimize_memory(df):
    """Shrink 64-bit numeric columns of ``df`` in place and return ``df``.

    ``float64`` columns are processed first, then ``int64`` columns; each one
    is passed through ``pd.to_numeric`` with the matching ``downcast`` mode.
    Columns of any other dtype are left untouched.
    """
    downcast_plan = (
        ('float64', 'float'),
        ('int64', 'integer'),
    )
    for wide_dtype, downcast_mode in downcast_plan:
        # Column list is resolved per pass, after the previous pass finished.
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = pd.to_numeric(df[name], downcast=downcast_mode)
    return df

def plot_feature_importance(importance_df, title='Feature Importance'):
    """Render a bar chart of feature importances and write it to a PNG.

    :param importance_df: Table exposing ``'Feature'`` and ``'Importance'``
        columns (bar labels and bar heights respectively).
    :param title: Chart title; also the source of the output file name
        (spaces become underscores, the text is lower-cased, ``.png`` is
        appended).
    :return: The file name the figure was saved under.
    """
    fig = plt.figure(figsize=(10, 6))

    labels = importance_df['Feature']
    heights = importance_df['Importance']
    plt.bar(labels, heights, color='skyblue')

    # Titles and axis decoration.
    plt.title(title)
    plt.xlabel('Features')
    plt.ylabel('Importance (Gain)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    stem = title.replace(' ', '_').lower()
    plot_filename = f"{stem}.png"
    plt.savefig(plot_filename)

    # The figure created above is still the current one; close it explicitly.
    plt.close(fig)
    return plot_filename

class LoadData:
    """Load a set of Parquet files into one Polars DataFrame.

    Files can optionally be restricted to chosen partitions. A file belongs
    to partition ``pid`` when one of its path components is exactly
    ``partition_id=<pid>``.
    """

    def __init__(self, file_paths, partition_ids=None):
        """
        Initialize the LoadData class.

        :param file_paths: List of all file paths.
        :param partition_ids: List of partition IDs to load. If None, load all.
        """
        self.file_paths = file_paths
        self.partition_ids = partition_ids

    def _select_files(self):
        """Return the subset of ``file_paths`` matching ``partition_ids``.

        Matching is done on whole path components rather than substrings, so
        requesting partition 1 does not also pick up partitions 10, 11, ...
        Both ``/`` and ``\\`` are accepted as separators.
        """
        if self.partition_ids is None:
            return self.file_paths

        wanted = {f'partition_id={pid}' for pid in self.partition_ids}
        return [
            fp for fp in self.file_paths
            if not wanted.isdisjoint(str(fp).replace('\\', '/').split('/'))
        ]

    def load_and_concat(self):
        """Load and concatenate Polars DataFrames from specified file paths.

        :return: One Polars DataFrame holding the rows of every selected file,
            in the order of ``file_paths``.
        :raises ValueError: If no file matches the requested partitions.
        """
        selected_files = self._select_files()
        if not selected_files:
            # Fail with an actionable message instead of an error from
            # deep inside the concatenation call.
            raise ValueError(
                f"No Parquet files matched partition_ids={self.partition_ids!r}"
            )

        # Load each Parquet file into a Polars DataFrame
        partitioned_data = [pl.read_parquet(file_path) for file_path in selected_files]

        # Concatenate all DataFrames into one
        df = pl.concat(partitioned_data, rechunk=False)

        # Drop the list of per-file frames before returning
        del partitioned_data
        gc.collect()

        return df

# Specify partition IDs to load (5, 6, 7, 8, 9)
partition_ids = [5, 6, 7, 8, 9]

# Get all file paths sorted (adjust the glob pattern as needed)
file_paths_all = sorted(glob.glob('Data/train.parquet/*/*.parquet'))

# Initialize the loader with selected partitions
loader = LoadData(file_paths=file_paths_all, partition_ids=partition_ids)

# Load and concatenate the selected partitions
df_selected = loader.load_and_concat()

# Free up memory by deleting the loader and file paths
del loader, file_paths_all
gc.collect()

# Convert to Pandas for easier manipulation
df_selected_pd = polars_to_pandas(df_selected)

# Free up memory by deleting the Polars DataFrame
del df_selected
gc.collect()

# Sort by 'date_id' ascending, then by 'time_id' ascending
df_selected_pd.sort_values(['date_id', 'time_id'], inplace=True)
df_selected_pd.reset_index(drop=True, inplace=True)

# Invoke garbage collection
gc.collect()

# No validation split here: every loaded row is used for training.
# Rebind instead of copying — the old name is dropped right away, so a
# .copy() would only double peak memory for no benefit.
train_df = df_selected_pd
del df_selected_pd
gc.collect()

# Columns that must not be fed to the model: every responder_* column
# (one of them is the target) and the sample weight.
excluded_features = [col for col in train_df.columns if col.startswith('responder_')] + ['weight']

# Define feature columns
feature_cols = [col for col in train_df.columns if col not in excluded_features]

print(f"Number of features: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Optimize memory by downcasting
train_df = optimize_memory(train_df)

# Training set
X_train = train_df[feature_cols].astype(np.float32)
y_train = train_df['responder_6'].astype(np.float32)
w_train = train_df['weight'].astype(np.float32)

# Free up memory by deleting the DataFrame
del train_df
gc.collect()

# Create DMatrix for training
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)

# The feature frame is no longer needed once the DMatrix exists.
# y_train and w_train are kept: the metric code below reads them.
del X_train
gc.collect()

def weighted_r2_metric(preds, dtrain):
    """Weighted R² evaluation callback.

    Computes ``1 - sum(w * (y - preds)^2) / sum(w * y^2)``; note the
    denominator uses the raw labels, not labels centred on their mean.

    :param preds: Predictions aligned with the labels of ``dtrain``.
    :param dtrain: Object exposing ``get_label()`` and ``get_weight()``.
    :return: ``('weighted_r2', score)``.
    """
    labels = dtrain.get_label()
    weights = dtrain.get_weight()

    squared_error = (labels - preds) ** 2
    residual_ss = np.sum(weights * squared_error)
    total_ss = np.sum(weights * labels ** 2)

    score = 1 - (residual_ss / total_ss)
    return 'weighted_r2', score

# Fixed XGBoost configuration for the final fit (values chosen by an earlier
# grid search, 16th combination). Squared-error regression with the 'hist'
# tree method; RMSE is reported alongside the custom metric passed to train().
best_params = {
    'max_depth': 6,
    'learning_rate': 0.0126,
    'subsample': 0.6919,
    'colsample_bytree': 0.6527,
    'gamma': 0.3388,
    'min_child_weight': 1,
    'reg_alpha': 0.1218,
    'reg_lambda': 2.7785,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'n_jobs': -1,
    'verbosity': 1,
    'seed': 42
}

# Point MLflow at the tracking server and select the experiment for this run
mlflow.set_tracking_uri("http://localhost:5000")  # Update if different
mlflow.set_experiment('JSF 5, 6, 7, 8, 9 Old HP Final')

# Autologging is switched off; everything recorded below is logged explicitly
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run(run_name='Final Training with all Partitions'):

    # Log hyperparameters
    mlflow.log_params(best_params)

    # Only the training set exists here; it is evaluated purely for progress
    # output, not for model selection.
    evals = [(dtrain, 'train')]

    # Train for a fixed number of rounds. Early stopping is deliberately NOT
    # used: with no held-out set it would monitor the training metric, which
    # can only cut the run short of the intended 447 rounds by accident.
    model = xgb.train(
        params=best_params,
        dtrain=dtrain,
        num_boost_round=447,
        evals=evals,
        custom_metric=weighted_r2_metric,
        verbose_eval=10
    )

    # Without early stopping the booster has no best_iteration; log the
    # zero-based index of the last boosting round under the same metric key.
    mlflow.log_metric('best_iteration', model.num_boosted_rounds() - 1)

    # Weighted R² on the training set:
    # 1 - sum(w * (y - pred)^2) / sum(w * y^2)
    y_pred_train = model.predict(dtrain)
    weighted_r2_train = 1 - (np.sum(w_train * (y_train - y_pred_train) ** 2) / np.sum(w_train * y_train ** 2))
    mlflow.log_metric('training_weighted_r2', weighted_r2_train)

    # Serialize the trained model to disk and attach the file to the run
    joblib.dump(model, 'final_model.joblib')
    mlflow.log_artifact('final_model.joblib')

    # Clean up; `model` is intentionally kept alive for later use
    del dtrain, y_pred_train, weighted_r2_train
    gc.collect()

In [None]:
# Write the trained model to 'oldhp_0071_model.json' via its own save_model().
# Relies on `model` still being defined: the final-training cell above does
# not include it in its `del` statement.
model.save_model('oldhp_0071_model.json')