# DeepNN for Sink Particles
> Created Oct. 2024 <br>
> Nikhil Bisht<br>

In [19]:
# standard system modules
import os, sys

# standard module for tabular data
import pandas as pd

# standard module for array manipulation
import numpy as np

# standard statistical module
import scipy.stats as st

# standard module for high-quality plots
import matplotlib as mp
import matplotlib.pyplot as plt
mp.rcParams.update(mp.rcParamsDefault)
%matplotlib inline

# standard research-level machine learning toolkit from Meta (FKA: FaceBook)
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import xgboost as xgb


# set a seed to ensure reproducibility
seed = 128
rnd  = np.random.RandomState(seed)

## Constants

In [30]:
DATAFILE  = '/data/cb1/nbisht/anvil_scratch/projects/128/B2/datasets/final_88/nb101_ML_dataset.csv'
MODELFILE = 'nnmodel.dict'

NTRAIN = 1600000
NVALID =  100000
NTEST  =  300000 #roughly

TARGET = ['O_Clump_X', 'O_Clump_Y',	'O_Clump_Z', 'O_Clump_Vx', 'O_Clump_Vy', 'O_Clump_Vz', 'O_Clump_density','O_t_end']
FEATURES = ['Clump_id', 'X', 'Y', 'Z', 'Vx', 'Vy', 'Vz','Density', 't_hard']

n_input = len(FEATURES)
n_output = len(TARGET)

#DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

print(f'Available device: {str(DEVICE):4s}')
print(n_input, n_output)

Available device: cpu 
9 8


## Load data

In [42]:
df = pd.read_csv(DATAFILE)
df['Clump_id'] = df['Clump_id'].astype('category')
print(len(df))
#df = df.sample(frac=1).reset_index(drop=True)
df.head()
#BIN CLUMP_ID -1 to reduce number 

2097152


Unnamed: 0.1,Unnamed: 0,Clump_id,Particle_id,X,Y,Z,Density,Vx,Vy,Vz,t_hard,O_Clump_X,O_Clump_Y,O_Clump_Z,O_Clump_Vx,O_Clump_Vy,O_Clump_Vz,O_Clump_density,O_t_end
0,0,2,86076.0,0.339811,0.134624,0.137327,765.475687,-5.696933,5.085002,-2.849526,0.620028,0.328734,0.135963,0.128912,-6.392839,-6.392839,-2.518608,2706.596838,0.66003
1,1,2,86077.0,0.339542,0.134248,0.136532,936.301305,-6.040428,4.622971,-2.5833,0.620028,0.328734,0.135963,0.128912,-6.392839,-6.392839,-2.518608,2706.596838,0.66003
2,2,2,90172.0,0.338404,0.126431,0.135963,462.601489,-5.731839,4.998208,-2.695267,0.620028,0.328734,0.135963,0.128912,-6.392839,-6.392839,-2.518608,2706.596838,0.66003
3,3,2,90173.0,0.340331,0.124604,0.134357,693.797771,-4.973129,5.235317,-2.662311,0.620028,0.328734,0.135963,0.128912,-6.392839,-6.392839,-2.518608,2706.596838,0.66003
4,4,2,90174.0,0.340503,0.126074,0.134297,1088.844742,-5.258341,4.987108,-2.725896,0.620028,0.328734,0.135963,0.128912,-6.392839,-6.392839,-2.518608,2706.596838,0.66003


In [58]:
print(df[(df['Clump_id']==-1) & (df['O_Clump_density']<1)].shape)
print(df[(df['Clump_id']==-1) & (df['O_Clump_density']<10) &(df['O_Clump_density']>1)].shape)
print(df[(df['Clump_id']==-1) & (df['O_Clump_density']>10)].shape)

(836033, 19)
(917829, 19)
(322084, 19)


## Split data

In [32]:
X, y = df[FEATURES], df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Training

In [33]:
#%%writefile nnmodel.py
def define_and_train_model(n = 1000, cross_validation_nfold = 0):
    try:
        del model
    except:
        print("No model to delete")
    
    # Define hyperparameters
    params = {"objective": "reg:squarederror"}#, "tree_method": "gpu_hist"}
    evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]
    model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=10, # Every ten rounds
    early_stopping_rounds=50, #Activate early stopping
    )
    print()
    if cross_validation_nfold>0:
        results = xgb.cv(
        params, dtrain_reg,
        num_boost_round=n,
        nfold=cross_validation_nfold,
        verbose_eval=10, # Every ten rounds
        early_stopping_rounds=50
        )
        best_rmse = results['test-rmse-mean'].min()
        print(best_rmse)
    return model

In [34]:
import sklearn.metrics as skm
best_model_arr = []
#hyperparam_value H = 4 for 4 layer
#hyperparam_value H = 2 for 3 layer
#hyperparam_value H = 8 for 2 layer

#for hyperparam_value1 in range(1,8):
model = define_and_train_model()
# standard measures of model performance
preds = model.predict(dtest_reg)
rmse = skm.mean_squared_error(y_test, preds)
r2 = skm.r2_score(y_test,preds)
best_model_arr.append([rmse,r2])
best_model_arr

No model to delete
[0]	train-rmse:5004.02617	validation-rmse:5204.44315
[10]	train-rmse:1081.09042	validation-rmse:1086.98885
[20]	train-rmse:996.66426	validation-rmse:1016.59024
[30]	train-rmse:945.45160	validation-rmse:985.20053
[40]	train-rmse:911.48577	validation-rmse:967.47148
[50]	train-rmse:869.01377	validation-rmse:940.59217
[60]	train-rmse:830.08669	validation-rmse:919.16747
[70]	train-rmse:813.43084	validation-rmse:910.23946
[80]	train-rmse:798.75480	validation-rmse:907.18299
[90]	train-rmse:782.14812	validation-rmse:893.27999
[100]	train-rmse:768.52426	validation-rmse:886.08810
[110]	train-rmse:753.62569	validation-rmse:881.39144
[120]	train-rmse:742.83796	validation-rmse:876.66514
[130]	train-rmse:738.67286	validation-rmse:875.83496
[140]	train-rmse:730.22797	validation-rmse:873.83017
[150]	train-rmse:719.68550	validation-rmse:871.93645
[160]	train-rmse:712.13181	validation-rmse:868.67214
[170]	train-rmse:705.50265	validation-rmse:867.76947
[180]	train-rmse:699.48999	valida

[[np.float64(694785.0726698973), 0.9542890787124634]]

In [40]:
y_test

Unnamed: 0,O_Clump_X,O_Clump_Y,O_Clump_Z,O_Clump_Vx,O_Clump_Vy,O_Clump_Vz,O_Clump_density,O_t_end
1054566,0.408086,0.633221,0.227955,-5.671705,1.598211,0.400054,0.478138,0.66003
2045891,0.463613,0.697196,0.073843,-0.784485,-2.146411,2.440873,13.771288,0.66003
411347,0.409996,0.725613,0.934922,3.291459,3.698894,12.039750,1.142419,0.66003
107280,0.128204,0.319014,0.314650,0.124373,6.455392,-4.425851,4.999578,0.66003
1182050,0.595234,0.358236,0.164567,4.354500,7.966405,1.764891,3.088681,0.66003
...,...,...,...,...,...,...,...,...
748933,0.354782,0.977022,0.647754,-1.990553,-3.897920,1.776606,0.840297,0.66003
1976284,0.592209,0.846340,0.749609,-1.183154,-8.664893,-0.114422,4.320351,0.66003
432666,0.269657,0.790935,0.794834,3.540392,-1.619916,2.061668,0.710219,0.66003
972548,0.849318,0.636862,0.226485,-4.422912,4.054486,-4.451785,1.034859,0.66003


In [41]:
df_pred = pd.DataFrame(preds)
df_pred

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.410910,0.629174,0.225446,-4.937736,0.433898,1.163913,-25.135860,0.66003
1,0.475641,0.695251,0.087789,-0.542385,-1.989635,1.784079,-89.335678,0.66003
2,0.408603,0.725015,0.934263,4.861608,2.886668,9.615307,59.553650,0.66003
3,0.121887,0.321141,0.310517,-0.445632,6.201883,-4.490307,30.269085,0.66003
4,0.581967,0.350927,0.171533,4.114697,6.077081,1.346386,-4.254006,0.66003
...,...,...,...,...,...,...,...,...
524283,0.361623,1.014806,0.641325,-2.156694,-3.117491,1.180822,6.195176,0.66003
524284,0.591981,0.851572,0.753663,-0.996518,-8.537794,0.168475,99.821793,0.66003
524285,0.277468,0.798808,0.793386,3.689996,-1.440983,1.763452,173.939667,0.66003
524286,0.853057,0.625915,0.242378,-3.304081,2.898837,-3.553259,-11.833437,0.66003
