In [9]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import glob
import os
import gc
import xgboost as xgb
import mlflow
import mlflow.xgboost
import joblib
from sklearn.datasets import dump_svmlight_file

def read_parquet_in_chunks(file_path, columns=None, chunk_size=1_000_000):
    """Stream a Parquet file as a sequence of pandas DataFrames.

    The file is read incrementally across *all* of its row groups, so memory
    use is bounded by ``chunk_size`` rather than by the size of the file.

    Parameters
    ----------
    file_path : str
        Path of the Parquet file to read.
    columns : list of str, optional
        Subset of columns to load. ``None`` loads every column.
    chunk_size : int, default 1_000_000
        Maximum number of rows per yielded DataFrame. A chunk can be shorter
        than this (for example the last one in the file).

    Yields
    ------
    pandas.DataFrame
        Consecutive, non-overlapping slices of the file, in file order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    parquet_file = pq.ParquetFile(file_path)
    # iter_batches walks every row group exactly once. Reading row group 0 and
    # slicing it (the previous approach) lost all rows stored in later row
    # groups and re-decoded the same row group for every chunk.
    for batch in parquet_file.iter_batches(batch_size=chunk_size, columns=columns, use_threads=True):
        yield batch.to_pandas()

# Collect the per-partition Parquet files, ordered by path so every run
# processes them in the same sequence.
file_paths = glob.glob('Data/train.parquet/*/*.parquet')
file_paths.sort()

# No maximum date is known yet; start from an explicit "unset" marker.
max_date = None

In [10]:
# LIBSVM file that receives every imputed training chunk
train_libsvm_file = 'final_train.libsvm'

# Pickle holding the ordered list of feature-column names
feature_names_pickle = 'feature_columns.pkl'

# Number of trailing rows averaged by the rolling-mean imputation
window_size = 800


def _impute_backward_rolling_mean(features, window):
    """Return a NaN-free copy of ``features``.

    For each column containing NaNs, a missing value is replaced by the mean
    of the forward-filled column over the trailing ``window`` rows (rows at or
    before the missing one, in the frame's current row order). Values still
    missing afterwards (leading NaNs that forward fill cannot reach) take the
    column mean, and a column with no observed values at all becomes 0.
    The input frame is not modified.
    """
    imputed = features.copy()
    for col in imputed.columns:
        na_mask = imputed[col].isna()
        if not na_mask.any():
            continue
        # Forward fill first so the trailing window always has values to average.
        rolling_mean = imputed[col].ffill().rolling(window=window, min_periods=1).mean()
        imputed.loc[na_mask, col] = rolling_mean[na_mask]
        # Leading NaNs have nothing before them to fill from.
        if imputed[col].isna().any():
            imputed[col] = imputed[col].fillna(imputed[col].mean())
    # Columns that are entirely NaN have a NaN mean; zero them out.
    return imputed.fillna(0)


if not file_paths:
    raise FileNotFoundError("No Parquet files found to process; check the input path pattern.")

# Column layout shared by every row of the LIBSVM file (set from the first chunk)
feature_cols = None

# Open the output once in write mode: re-running this cell rebuilds the file
# from scratch instead of appending a second copy of every row to the output
# of a previous (possibly interrupted) run.
with open(train_libsvm_file, 'wb') as f_train:
    for file_idx, file_path in enumerate(file_paths):
        print(f"Processing file {file_idx+1}/{len(file_paths)}: {file_path}")

        for df_chunk in read_parquet_in_chunks(file_path, chunk_size=1_000_000):
            # Order rows chronologically so the rolling window looks backwards
            # in time, and use float64 throughout to avoid dtype issues.
            df_chunk = df_chunk.sort_values(['date_id', 'time_id']).astype('float64')

            # Every column not starting with 'responder_' is a feature.
            chunk_feature_cols = [col for col in df_chunk.columns if not col.startswith('responder_')]
            if feature_cols is None:
                # The layout is loop-invariant, so persist it once.
                feature_cols = chunk_feature_cols
                pd.to_pickle(feature_cols, feature_names_pickle)
            elif chunk_feature_cols != feature_cols:
                # LIBSVM rows are positional; a different layout would silently
                # assign values to the wrong feature index.
                raise ValueError(
                    f"Feature columns in {file_path} differ from those of the first chunk."
                )

            if df_chunk.empty:
                del df_chunk
                continue

            X_train_imputed = _impute_backward_rolling_mean(df_chunk[feature_cols], window_size)
            y_train = df_chunk['responder_6'].astype('float64')

            dump_svmlight_file(X_train_imputed, y_train, f_train, zero_based=True)

            # Release the chunk before the next one is loaded.
            del df_chunk, X_train_imputed, y_train
            gc.collect()

Processing file 1/10: Data/train.parquet\partition_id=0\part-0.parquet
Processing file 2/10: Data/train.parquet\partition_id=1\part-0.parquet
Processing file 3/10: Data/train.parquet\partition_id=2\part-0.parquet
Processing file 4/10: Data/train.parquet\partition_id=3\part-0.parquet
Processing file 5/10: Data/train.parquet\partition_id=4\part-0.parquet
Processing file 6/10: Data/train.parquet\partition_id=5\part-0.parquet
Processing file 7/10: Data/train.parquet\partition_id=6\part-0.parquet
Processing file 8/10: Data/train.parquet\partition_id=7\part-0.parquet
Processing file 9/10: Data/train.parquet\partition_id=8\part-0.parquet
Processing file 10/10: Data/train.parquet\partition_id=9\part-0.parquet


In [11]:
# Give the finished LIBSVM file the name the training cell loads.
# os.replace overwrites an existing destination on every platform, whereas
# os.rename raises FileExistsError on Windows when a previous run already
# produced 'final_train.libsvm.cache'.
os.replace(train_libsvm_file, 'final_train.libsvm.cache')

In [12]:
# Train the final XGBoost model on the full LIBSVM export and record the run
# (parameters, model files, feature importance) in MLflow.

# Send all tracking calls to the MLflow server on localhost and select the
# experiment that will own this run.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment('XGBoost Full Dataset Training 2')

# Number of boosting rounds used for the final fit.
# NOTE(review): presumably chosen by an earlier tuning/CV run that is not
# part of this notebook section - confirm.
optimal_num_boost_round = 24

# Booster hyper-parameters passed unchanged to xgb.train.
params = {
    'max_depth': 6,
    'learning_rate': 0.0126,
    'subsample': 0.6919,
    'colsample_bytree': 0.6527,
    'gamma': 0.3388,
    'min_child_weight': 1,
    'reg_alpha': 0.1218,
    'reg_lambda': 2.7785,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'n_jobs': -1,
    'verbosity': 1,
    'seed': 42
}

# Autologging is switched off; everything recorded for this run is logged
# explicitly below.
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run(run_name='Final Model Training'):

    # Record the hyper-parameters and the round count with the run.
    mlflow.log_params(params)
    mlflow.log_param('optimal_num_boost_round', optimal_num_boost_round)
    
    # Load the training matrix from the LIBSVM file written and renamed by the
    # preceding cells; the '?format=libsvm' suffix names the file format.
    dfull = xgb.DMatrix('final_train.libsvm.cache?format=libsvm')
    
    # Since we don't have a validation set, we'll just monitor training metrics
    evals = [(dfull, 'train')]
    
    # Train the model without early stopping; the train metric is printed
    # every 10 rounds (verbose_eval=10).
    final_model = xgb.train(
        params,
        dfull,
        num_boost_round=optimal_num_boost_round,
        evals=evals,
        verbose_eval=10
    )
    
    # Serialize the booster with joblib and attach the file to the run.
    joblib.dump(final_model, 'final_model.joblib')
    mlflow.log_artifact('final_model.joblib')
    
    # Build a one-row, all-zero input example from the feature names pickled
    # by the export cell, then log the model in MLflow's XGBoost flavor.
    feature_names = pd.read_pickle('feature_columns.pkl')
    input_example = pd.DataFrame([[0]*len(feature_names)], columns=feature_names, dtype='float64')
    
    mlflow.xgboost.log_model(final_model, artifact_path='model', input_example=input_example)
    
    # Export gain-based feature importance as a CSV artifact.
    # NOTE(review): the DMatrix was loaded from a LIBSVM file, which carries
    # no column names, so the 'feature' values are presumably positional
    # ('f0', 'f1', ...) rather than the names in feature_columns.pkl - confirm.
    importance = final_model.get_score(importance_type='gain')
    importance_df = pd.DataFrame({
        'feature': list(importance.keys()),
        'importance': list(importance.values())
    })
    importance_df.to_csv('feature_importance_full.csv', index=False)
    mlflow.log_artifact('feature_importance_full.csv')
    
    print("Final model training completed and logged.")

2024/11/24 21:47:57 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost Full Dataset Training 2' does not exist. Creating a new experiment.


[0]	train-rmse:0.97634
[10]	train-rmse:0.97542
[20]	train-rmse:0.97457
[23]	train-rmse:0.97436




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2024/11/24 21:48:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run Final Model Training at: http://localhost:5000/#/experiments/11/runs/3554add1244c466896fcac9a918a224c.
2024/11/24 21:48:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/11.


Final model training completed and logged.


In [13]:
# Also write the trained booster to 'final_model.json' through its own
# save_model method, in addition to the joblib dump made during training.
final_model.save_model('final_model.json')