# XGBoost Extended Best Model Framewise prediction
> Created Feb. 2025 <br>
> Nikhil Bisht<br>

In [1]:
# standard system modules
import os, sys
os.environ["PATH"] += os.pathsep + "/home/nbisht/myapps/bin/"
import h5py 
import argparse
# standard module for tabular data
import pandas as pd

# standard module for array manipulation
import numpy as np
from itertools import permutations

# standard statistical module
import scipy.stats as st
from scipy import linalg
from scipy.stats import ks_2samp


# standard module for high-quality plots
from PIL import Image
import matplotlib as mp
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
mp.rcParams.update(mp.rcParamsDefault)
%matplotlib inline

# to plot pixelized images
import imageio.v3 as im

# standard research-level machine learning toolkit from Meta (FKA: FaceBook)
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
import tables
import torchvision
from torch.utils.data import DataLoader, TensorDataset
from torchvision.utils import save_image
from sklearn.model_selection import train_test_split,RepeatedKFold,cross_val_score
import xgboost as xgb

import sklearn.metrics as skm

from tqdm import tqdm

# Notebook-wide configuration: seed, file locations, column layout, frame
# ranges, and the choice of which particle table to load.

# set a seed to ensure reproducibility
seed = 128
rnd  = np.random.RandomState(seed)

DATAFILE  = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/nb101_ML_dataset_AllData_AutoEnc.h5'
CORESET  = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/nb101_all_frames.h5'
MODELFILE = 'nnmodel.json'
MCDSO_CORE = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/nb101_ML_dataset_XGBoost_MCDS0_core.csv'
MCDSO_NONCORE = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/nb101_ML_dataset_XGBoost_MCDS0_noncore.csv'

IMAGESIZE = 128

# Split sizes expressed as fractions of the data set.
# NOTE(review): these sum to 1.05 and are not referenced by the cells in this
# notebook (the split below is done by frame number) - confirm before reuse.
NTRAIN_percentage =  0.80
NVALID_percentage =  0.05
NTEST_percentage  =  0.20

ALL_COLUMNS = ['Particle_id', 'Initial_Frame', 'X_i', 'Y_i', 'Z_i', 'Vx_i', 'Vy_i', 'Vz_i', 'Density_i', 'X_f', 'Y_f', 'Z_f']#, 'Vx_f', 'Vy_f', 'Vz_f', 'Density_f']
# Inputs: initial frame, position, velocity and density. Outputs: final position.
FEATURES = ALL_COLUMNS[1:9]
TARGET = ALL_COLUMNS[9:]

# Rows are split by their 'Initial_Frame' value: [20, 80) trains, [80, 91) tests.
FRAMESTRAIN = np.arange(20,80, 1)
FRAMESTEST = np.arange(80,91, 1)
FRAMES = np.concatenate((FRAMESTRAIN, FRAMESTEST))
FRAME_DIFF = 30

NUM_X_TRAIN = 13

#DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

print(f'Available device: {str(DEVICE):4s}')

# do_core selects the particle table: 1 -> core, 0 -> non-core,
# anything else -> core and non-core stacked together.
do_core = 1
if do_core == 1:
    df_timeseries = pd.read_csv(MCDSO_CORE)
    model_prefix = 'Core'
elif do_core == 0:
    df_timeseries = pd.read_csv(MCDSO_NONCORE)
    model_prefix = 'NonCore'
else:
    df_timeseries_core = pd.read_csv(MCDSO_CORE)
    # Read the non-core table here; reading MCDSO_CORE a second time would
    # make the "Combined" set just two copies of the core particles.
    df_timeseries_noncore = pd.read_csv(MCDSO_NONCORE)
    df_timeseries = pd.concat([df_timeseries_core, df_timeseries_noncore], axis=0)
    model_prefix = 'Combined'
df_timeseries

Available device: cpu 


Unnamed: 0,Particle_id,Initial_Frame,X_i,Y_i,Z_i,Vx_i,Vy_i,Vz_i,Density_i,X_f,Y_f,Z_f
0,1405,20,0.527180,0.158741,0.971439,6.421220,-1.876194,-5.372949,0.277539,0.574790,0.162447,0.928789
1,1488,20,0.100503,0.200606,0.966166,-4.441785,2.345180,-4.707516,0.974011,0.046665,0.217869,0.931834
2,1489,20,0.109401,0.206609,0.968732,-4.272087,3.115164,-4.167612,0.850971,0.060391,0.236572,0.929803
3,1490,20,0.118114,0.207084,0.971037,-4.146288,3.527104,-3.902281,0.941309,0.066567,0.240239,0.936063
4,1531,20,0.506000,0.168704,0.994022,6.933440,-1.068680,-1.719031,0.239841,0.579576,0.164281,0.945365
...,...,...,...,...,...,...,...,...,...,...,...,...
8215475,2095843,89,0.835269,0.284902,0.339996,3.356818,4.506277,-1.690823,12.974862,0.867902,0.336165,0.311914
8215476,2095871,89,0.107716,0.508404,0.352204,4.872550,3.780446,-8.586980,1.512863,0.139102,0.541064,0.257406
8215477,2095906,89,0.828956,0.303926,0.344030,3.392380,4.518692,-1.866807,6.594403,0.865761,0.353138,0.315775
8215478,2096034,89,0.838114,0.293587,0.338315,3.059779,4.365908,-1.606220,14.772669,0.870565,0.343483,0.311484


### Functions

In [2]:
def plot_prediction(y_true,y_pred, mae, r2, append=''):
    """Scatter predicted against true values for the three target coordinates.

    One panel is drawn per entry of TARGET, together with the y = x reference
    line on which a perfect prediction would fall. The figure is saved as a
    PNG in the current working directory and then shown.

    Parameters
    ----------
    y_true, y_pred : 2-D arrays indexed as [:, i] for i in 0..2
        True and predicted values, one column per target.
    mae, r2 : sequences indexed as [i] for i in 0..2
        Per-target metrics printed in each panel title.
    append : str
        Suffix inserted into the output file name before '.png'.
    """
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    for i, ax in enumerate(axes):
        ax.scatter(y_true[:,i], y_pred[:,i], c='crimson', s=1e-2)
        # Identity line from (0, 0) to (1, 1). The previous call,
        # plot([0, 0], [1, 1]), collapsed to the single point (0, 1).
        ax.plot([0, 1], [0, 1], 'b-')
        ax.set_xlabel('True Values', fontsize=15)
        # A single y-label call; an earlier 'Predictions' label was always
        # overwritten by the target name, so only the effective one is kept.
        ax.set_ylabel(TARGET[i], fontsize=15)
        ax.set_title(f'{TARGET[i]}: MAE = {mae[i]:.6f}, R2 = {r2[i]:.4f}', fontsize=12)
        ax.set_xlim([-0.1,1.1])
        ax.set_ylim([-0.1,1.1])
    fig.tight_layout()
    plt.savefig('Particles_All_Frames_Frame'+str(FRAMES[0])+'_to_Frame'+str(FRAMES[-1])+'_Prediction'+append+'.png')
    plt.show()

def df_transform(df_test, distance_condition = 0.5):
    """Shift final positions that jumped by distance_condition or more.

    Works on a copy of ``df_test``. For each axis A in X, Y, Z:

    * ``del_A`` is ``A_f - A_i``;
    * ``A_op`` is -1 where ``|A_i| < distance_condition`` and +1 otherwise;
    * ``A_f`` is left alone where ``|del_A| < distance_condition`` and has
      ``A_op`` added to it otherwise, so it may leave the [0, 1) range.

    Returns the copy, which carries the extra ``del_*`` and ``*_op`` columns.
    """
    shifted = df_test.copy()
    axis_names = ('X', 'Y', 'Z')

    # Displacements first, so they are measured on the untouched final positions.
    for axis in axis_names:
        shifted[f'del_{axis}'] = shifted[f'{axis}_f'] - shifted[f'{axis}_i']

    # Direction of the correction, decided by where the particle started.
    for axis in axis_names:
        starts_low = np.abs(shifted[f'{axis}_i']) < distance_condition
        shifted[f'{axis}_op'] = np.where(starts_low, -1, +1)

    # Apply the correction only to the rows whose displacement is too large.
    for axis in axis_names:
        final_position = shifted[f'{axis}_f']
        small_step = np.abs(shifted[f'del_{axis}']) < distance_condition
        shifted[f'{axis}_f'] = np.where(small_step, final_position, final_position + shifted[f'{axis}_op'])

    return shifted

def df_inverse_transform(df_test, distance_condition = 0.5):
    """Fold the X_f, Y_f and Z_f columns back by one unit where needed.

    Works on a copy of ``df_test``. In each of the three columns, values
    ``>= 1`` have 1 subtracted, and afterwards values ``< 0`` have 1 added.
    Each step is applied once, so values far outside [-1, 2) are moved by a
    single unit only. ``distance_condition`` is accepted but not used.
    """
    folded = df_test.copy()
    for column in ('X_f', 'Y_f', 'Z_f'):
        values = folded[column]
        values = np.where(values >= 1, values - 1, values)   # too high: shift down
        values = np.where(values < 0, values + 1, values)    # too low: shift up
        folded[column] = values
    return folded


def model_test(model, test_df, distance_condition = 0.5):
    """Predict final positions with ``model`` and fold them with df_inverse_transform.

    The FEATURES columns of ``test_df`` are passed to ``model.predict``; the
    first three prediction columns are labelled X_f, Y_f, Z_f. The returned
    DataFrame has a fresh default index, not the index of ``test_df``.
    """
    raw_prediction = model.predict(test_df[FEATURES])
    prediction_frame = pd.DataFrame(
        {name: raw_prediction[:, k] for k, name in enumerate(('X_f', 'Y_f', 'Z_f'))}
    )
    return df_inverse_transform(prediction_frame, distance_condition = distance_condition)

## Split data

In [3]:
# Take log10 of the density on a derived frame instead of overwriting
# df_timeseries in place: the in-place version applied the logarithm again
# every time this cell was re-run.
df_timeseries_log = df_timeseries.assign(Density_i=np.log10(df_timeseries['Density_i']))

# Split by the frame each row starts from.
df_timeseries_train = df_timeseries_log[df_timeseries_log['Initial_Frame'].isin(FRAMESTRAIN)]
df_timeseries_test = df_timeseries_log[df_timeseries_log['Initial_Frame'].isin(FRAMESTEST)]

# Only the training targets go through df_transform; the test targets are
# kept as loaded.
df_timeseries_transformed = df_transform(df_timeseries_train)
X_train, y_train = df_timeseries_transformed[FEATURES], df_timeseries_transformed[TARGET]
X_test, y_test = df_timeseries_test[FEATURES], df_timeseries_test[TARGET]


## Train model Framewise
XGBoost parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html

In [4]:
# Three candidate hyper-parameter sets, stored column-wise: entry k of every
# list belongs to candidate k.
n_estimators    = [500, 800, 1500]
max_depth       = [8 ,8, 8]
eta             = [0.1, 0.1, 0.01]
subsample       = [1, 1, 1]
gamma           = [0.5, 0.5, 0.5]

# Only the first candidate is trained; the other slots keep their 0 placeholder.
best_models = [0 ,0 ,0]
for i in range(1):
    hyperparameters = dict(
        n_estimators=n_estimators[i],
        max_depth=max_depth[i],
        eta=eta[i],
        subsample=subsample[i],
        gamma=gamma[i],
        device='cpu',
    )
    best_models[i] = regressor = xgb.XGBRegressor(**hyperparameters)
    regressor.fit(X_train, y_train)
    # File name: <prefix>_Extended_Framewise_BestModel<i>_<MODELFILE>
    model_path = model_prefix+"_Extended_Framewise_BestModel"+str(i)+"_"+MODELFILE
    regressor.save_model(model_path)

# Number of training rows, as a sanity check.
print(len(X_train))

7041840


## Evaluation metrics, model loading and prediction

In [5]:
def mae_modded(y_true, y_pred):
    """Mean absolute error where each error is taken as min(|d|, 1 - |d|).

    For every element the absolute difference ``|y_true - y_pred|`` is
    compared with one minus that difference and the smaller is kept; the
    result is averaged over axis 0.

    Parameters
    ----------
    y_true, y_pred : numpy arrays of the same shape
        2-D input gives one value per column; 1-D input gives a scalar.
        Presumably coordinates in [0, 1) - for differences larger than 1
        the ``1 - |d|`` term becomes negative.

    Returns
    -------
    The mean over axis 0 of the element-wise minimum described above.
    """
    abs_error = np.abs(y_true - y_pred)
    # Element-wise minimum; no need to stack both candidates into a rank-3 array.
    return np.minimum(abs_error, 1 - abs_error).mean(axis=0)

def r2_modded(y_true, y_pred):
    """R^2 score where every distance is taken as min(|d|, 1 - |d|).

    Both the residuals ``y_true - y_pred`` and the deviations of ``y_true``
    from its mean are reduced with that rule before being squared and summed
    over axis 0. The mean is the plain arithmetic mean over axis 0.

    Parameters
    ----------
    y_true, y_pred : numpy arrays of the same shape
        2-D input gives one score per column; 1-D input gives a scalar.

    Returns
    -------
    ``1 - sum(residual**2) / sum(deviation**2)`` along axis 0. No guard is
    applied when the denominator is zero.
    """
    true_mean = np.mean(y_true, axis=0)

    residual = np.abs(y_true - y_pred)
    residual = np.minimum(residual, 1 - residual)

    deviation = np.abs(y_true - true_mean)
    deviation = np.minimum(deviation, 1 - deviation)

    return 1 - (residual**2).sum(axis=0) / (deviation**2).sum(axis=0)

In [6]:
# Show the prefix used in the saved-model file names for this run.
model_prefix

'Core'

In [9]:
# Reload the saved regressor(s) from disk and collect their test-set predictions.
loaded_model = [0,0,0]

# One prediction slot per candidate model, plus the test targets and features.
results_dic = {'Model_'+str(k)+'_ypred': [] for k in (1, 2, 3)}
results_dic['ytrue'] = y_test
results_dic['xtest'] = X_test

for i in range(1):
    regressor = xgb.XGBRegressor()
    loaded_model[i] = regressor
    regressor.load_model(model_prefix+"_Extended_Framewise_BestModel"+str(i)+"_"+MODELFILE)
    ypred = model_test(regressor, X_test)
    ytrue = y_test
    results_dic['Model_'+str(i+1)+'_ypred'] = ypred
    # Optional evaluation, currently switched off:
    #mae = mae_modded(ytrue.to_numpy(), ypred.to_numpy())
    #r2 = r2_modded(ytrue.to_numpy(),ypred.to_numpy())
    #plot_prediction(ytrue.to_numpy(),ypred.to_numpy(), mae, r2, append = "_"+model_prefix+"_Extended_All_Frames_BestModel"+str(i))



In [10]:
import pickle

# Persist the predictions together with the test targets and features.
predictions_path = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/nb101_'+model_prefix+'_framewise_predictions.pickle'
with open(predictions_path, 'wb') as handle:
    pickle.dump(results_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)
