In [1]:
# python based
import tensorflow as tf
from pathlib import Path
import time
import shutil
import random
import os
import pandas as pd
import numpy as np
from tensorflow.keras.optimizers import Adam, Nadam
import matplotlib.pyplot as plt

# custom 
from loss import *
from models import *
from dataloaders import utils
from dataloaders import OptimizedDataGenerator as DG

2024-11-26 08:24:29.851355: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-26 08:24:29.851426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-26 08:24:29.852915: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-26 08:24:29.861091: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Report the number of CPU cores visible to this kernel.
print("Num CPU:", os.cpu_count())
# check_GPU() prints its own GPU report and returns None, so wrapping it in
# print() only added a stray "None" line to the cell output.
utils.check_GPU()

Num CPU: 128
1 Physical GPUs, 1 Logical GPUs
None


2024-11-26 08:24:33.529392: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 33857 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB MIG 7g.40gb, pci bus id: 0000:21:00.0, compute capability: 8.0


In [3]:
def load_sensor_geometries(file_path):
    """Parse a ``key: value`` text file into a dict of sensor geometries.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    must look like ``<dataset type>: <geometry description>``. Only the first
    ``": "`` separates key from value, so the description itself may contain
    ``": "``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each stripped dataset-type key to its stripped geometry
        description. If a key appears more than once, the last line wins.

    Raises:
        ValueError: If a data line contains no ``": "`` separator.
        OSError: If the file cannot be opened.
    """
    sensor_geometries = {}
    with open(file_path, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line or line.startswith("#"):  # Skip comments and empty lines
                continue
            # partition() splits on the first separator only; a plain
            # split(": ") would fail to unpack when the value contains ": ".
            dataset_type, separator, geometry = line.partition(": ")
            if not separator:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected '<type>: <geometry>', got {line!r}"
                )
            sensor_geometries[dataset_type.strip()] = geometry.strip()
    return sensor_geometries
# Build the dataset-type -> geometry lookup from 'types.txt'
# (a relative path, so it is resolved against the kernel's working directory).
sensor_geometries = load_sensor_geometries('types.txt')

In [4]:
# Select the dataset and sensor variant for this run. Both names are reused
# further down to build the data, TFRecord and checkpoint paths.
dataset_name = 'dataset2s'
# NOTE(review): `type` shadows the Python builtin for the rest of the notebook;
# consider renaming it (e.g. `sensor_type`) together with every later use.
type = 'type6'
# Look up the geometry description; keys have the form '<dataset_name> <type>'.
print(sensor_geometries[f'{dataset_name} {type}'])

100 um x 25 um x 100 um sensor @ 100V


In [5]:
# dataset_path = '/depot/cms/users/dkondra/smart-pixels/dataset8/unflipped-positive'
dataset_path = f'/depot/cms/users/das214/{dataset_name}/{type}/unflipped/'

# Inputs and labels live in two sub-directories of the dataset root.
data_directory_path = os.path.join(dataset_path, 'recon3D/')
labels_directory_path = os.path.join(dataset_path, 'labels/')

# Full path of every directory entry, sorted so that both lists come out in
# the same (name-based) order.
data_files_path_list = np.sort(
    [os.path.join(data_directory_path, entry) for entry in os.listdir(data_directory_path)]
)
labels_files_path_list = np.sort(
    [os.path.join(labels_directory_path, entry) for entry in os.listdir(labels_directory_path)]
)

# Show the two directories, then the number of entries found in each.
print(data_directory_path, labels_directory_path, sep="\n")
print(len(data_files_path_list), len(labels_files_path_list), sep="\n")

/depot/cms/users/das214/dataset2s/type6/unflipped/recon3D/
/depot/cms/users/das214/dataset2s/type6/unflipped/labels/
41
41


In [6]:
# Root directory for outputs: the absolute path of the current working directory.
output_directory = Path("./").resolve()

# Generator settings. Within this notebook they are referenced only by the
# commented-out cells that build the TFRecord files from parquet
# (batch_size / val_batch_size -> `batch_size`,
#  train_file_size / val_file_size -> `file_count`).
batch_size = 5000
val_batch_size = 5000
train_file_size = 35
val_file_size = 6

# Alternative, smaller configuration (disabled):
# batch_size = 500
# val_batch_size = 500
# train_file_size = 20 
# val_file_size = 6 

In [7]:
# Make sure the output root exists (a no-op when it already does), then show it.
output_directory.mkdir(parents=True, exist_ok=True)
print(output_directory)

/home/das214/SmartPix/dataset2s


In [8]:
# Directories holding the TFRecord files. These are fixed (deterministic)
# locations next to the dataset, so files written by an earlier run can be
# reloaded instead of being regenerated.
tfrecords_dir_train = f'/depot/cms/users/das214/{dataset_name}/{type}/unflipped/TFR_20t_train'
tfrecords_dir_validation = f'/depot/cms/users/das214/{dataset_name}/{type}/unflipped/TFR_20t_val'

print(tfrecords_dir_train)
print(tfrecords_dir_validation)
# clean up tf records
# utils.safe_remove_directory(tfrecords_dir_train)
# utils.safe_remove_directory(tfrecords_dir_validation)

/depot/cms/users/das214/dataset2s/type6/unflipped/TFR_20t_train
/depot/cms/users/das214/dataset2s/type6/unflipped/TFR_20t_val


In [9]:
# # validation generator

# # Caution: to load existing TFRecord files, do not run this cell; use `load_from_tfrecords_dir` instead.
# #       Otherwise, any data that already exists at `tfrecords_dir` will be removed.

# start_time = time.time()
# validation_generator = DG.OptimizedDataGenerator(
#     data_directory_path = data_directory_path,
#     labels_directory_path = labels_directory_path,
#     is_directory_recursive = False,
#     file_type = "parquet",
#     data_format = "3D",
#     batch_size = val_batch_size,
#     file_count = val_file_size,
#     to_standardize= True,
#     include_y_local= True, 
#     labels_list = ['x-midplane','y-midplane','cotAlpha','cotBeta'],
#     input_shape = (20,13,21), # (2,13,21),
#     transpose = (0,2,3,1),
#     shuffle = False, 
#     files_from_end=True,

#     tfrecords_dir = tfrecords_dir_validation,
#     use_time_stamps =  -1, # [0, 19],
#     max_workers = 2 # Don't make this too large (will use up all RAM)
# )

# print("--- Validation generator %s seconds ---" % (time.time() - start_time))

In [10]:
# # training generator

# # Caution: to load existing TFRecord files, do not run this cell; use `load_from_tfrecords_dir` instead.
# #       Otherwise, any data that already exists at `tfrecords_dir` will be removed.


# start_time = time.time()
# training_generator = DG.OptimizedDataGenerator(
#     data_directory_path = data_directory_path,
#     labels_directory_path = labels_directory_path,
#     is_directory_recursive = False,
#     file_type = "parquet",
#     data_format = "3D",
#     batch_size = batch_size,
#     file_count = train_file_size,
#     to_standardize= True,
#     include_y_local= True,
#     labels_list = ['x-midplane','y-midplane','cotAlpha','cotBeta'],
#     input_shape = (20,13,21), # (2,13,21),
#     transpose = (0,2,3,1),
#     shuffle = False, # True 

#     tfrecords_dir = tfrecords_dir_train,
#     use_time_stamps =  -1, # [0, 19],
#     max_workers = 2 # Don't make this too large (will use up all RAM)
# )
# print("--- Training generator %s seconds ---" % (time.time() - start_time))

In [11]:
# Load both generators from TFRecord files that were written by an earlier run,
# which saves the time otherwise spent on preprocessing and saving.
# With the generator-building cells above commented out, this is the cell that
# defines `training_generator` and `validation_generator`.

def _load_generator(tfrecords_dir):
    """Return an OptimizedDataGenerator reading the TFRecords in `tfrecords_dir`."""
    return DG.OptimizedDataGenerator(
        load_from_tfrecords_dir=tfrecords_dir,
        include_y_local=True,
        shuffle=True,
        seed=13,
        quantize=True,
    )


training_generator = _load_generator(tfrecords_dir_train)
validation_generator = _load_generator(tfrecords_dir_validation)



In [12]:
def CreateModel(shape, n_filters, pool_size):
    """Assemble the single-input Keras model used for training.

    Layer pipeline, in order:
    ``Input -> conv_network -> AveragePooling2D -> QActivation -> Flatten -> var_network``.

    Args:
        shape: Shape of one input sample (no batch dimension), given to ``Input``.
        n_filters: Forwarded unchanged to ``conv_network``.
        pool_size: Edge length of the square average-pooling window; used for
            both pooling dimensions.

    Returns:
        A ``Model`` whose only input is the layer named ``"X_input"`` and whose
        output is the result of ``var_network(..., hidden=16, output=14)``.
    """
    x_in = Input(shape, name="X_input")  # Main input (X)

    stack = conv_network(x_in, n_filters)
    stack = AveragePooling2D(
        pool_size=(pool_size, pool_size),
        strides=None,
        padding="valid",
        data_format=None,
    )(stack)
    stack = QActivation("quantized_bits(8, 0, alpha=1)")(stack)
    stack = Flatten()(stack)
    # y_local is currently NOT an input of the model. To feed it in, create
    # `y_local_in = Input(shape=(1,), name="y_local_input")`, apply
    # `stack = Concatenate()([stack, y_local_in])` here, and pass
    # `inputs=[x_in, y_local_in]` to Model below.
    stack = var_network(stack, hidden=16, output=14)
    return Model(inputs=x_in, outputs=stack)

In [13]:
# Per-sample input shape handed to the model's "X_input" layer.
input_shape = (13, 21, 20)
# Build the network and print its layer-by-layer summary.
model = CreateModel(input_shape, n_filters=5, pool_size=3)
model.summary()

2024-11-26 08:24:34.159023: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 X_input (InputLayer)        [(None, 13, 21, 20)]      0         
                                                                 
 q_separable_conv2d (QSepar  (None, 11, 19, 5)         285       
 ableConv2D)                                                     
                                                                 
 q_activation (QActivation)  (None, 11, 19, 5)         0         
                                                                 
 q_conv2d (QConv2D)          (None, 11, 19, 5)         30        
                                                                 
 q_activation_1 (QActivatio  (None, 11, 19, 5)         0         
 n)                                                              
                                                                 
 average_pooling2d (Average  (None, 3, 6, 5)           0     

In [14]:
# Configure training: Nadam optimizer (learning rate 1e-3) minimizing `custom_loss`.
# NOTE(review): `custom_loss` is not defined in this notebook; presumably it is
# provided by the star import `from loss import *` — confirm.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-3),
    loss=custom_loss
)

In [15]:
# Random 8-hex-digit id so that this run gets its own checkpoint directory.
fingerprint = '%08x' % random.randrange(16**8)
os.makedirs("trained_models", exist_ok=True)
# Per-run directory: ./trained_models/<type>/model-<fingerprint>-checkpoints
base_dir = f'./trained_models/{type}/model-{fingerprint}-checkpoints'
os.makedirs(base_dir, exist_ok=True)  
# Checkpoint filename template; the {epoch}, {loss} and {val_loss} placeholders
# are filled in when a checkpoint is written.
# NOTE: {epoch:02d} pads to two digits only, so from epoch 100 onwards the
# filenames no longer sort lexicographically in epoch order.
checkpoint_filepath = base_dir + '/weights.{epoch:02d}-t{loss:.2f}-v{val_loss:.2f}.hdf5'

In [16]:
# Show the run id so that this run's checkpoint directory can be found later.
print(fingerprint)

c8fe3c7d


In [17]:
from tensorflow.keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint, Callback

# Passed as `patience` to the EarlyStopping callback created below.
early_stopping_patience = 50

class CustomModelCheckpoint(ModelCheckpoint):
    """ModelCheckpoint variant that keeps only the newest checkpoint file.

    After the parent callback has handled the end of an epoch (and possibly
    written a new file), every file in the checkpoint directory whose name
    starts with ``weights`` is deleted except the one with the highest epoch
    number.

    The epoch is compared numerically. A plain lexicographic sort of the
    filenames is wrong as soon as epoch numbers have different widths
    (``weights.100-...`` sorts before ``weights.99-...``), which would delete
    the checkpoint that was just saved and keep a stale one.
    """

    @staticmethod
    def _epoch_of(filename):
        """Return the epoch encoded in ``weights.<epoch>-...``, or -1 if absent."""
        epoch_text = filename[len('weights.'):].split('-', 1)[0]
        try:
            return int(epoch_text)
        except ValueError:
            # Unparseable names sort first, so they never displace a real checkpoint.
            return -1

    def on_epoch_end(self, epoch, logs=None):
        """Run the normal checkpoint logic, then prune all but the newest file."""
        super().on_epoch_end(epoch, logs)
        # Derive the directory from this callback's own filepath template rather
        # than from a notebook-level global.
        checkpoint_dir = os.path.dirname(self.filepath) or '.'
        checkpoints = [f for f in os.listdir(checkpoint_dir) if f.startswith('weights')]
        if len(checkpoints) > 1:
            # Numeric epoch first; the filename only breaks ties deterministically.
            checkpoints.sort(key=lambda name: (self._epoch_of(name), name))
            for checkpoint in checkpoints[:-1]:
                os.remove(os.path.join(checkpoint_dir, checkpoint))

# Stop training once EarlyStopping's monitored quantity (left at its default)
# has not improved for `early_stopping_patience` epochs, and restore the best
# weights seen so far.
es = EarlyStopping(patience=early_stopping_patience, restore_best_weights=True)

# Write a weights-only checkpoint at the end of each epoch in which `val_loss`
# improved; older checkpoint files are pruned by CustomModelCheckpoint.
mcp = CustomModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    save_best_only=True,
    save_freq='epoch',
    verbose=1
)

# Per-epoch metrics log inside the run's checkpoint directory; append=True
# adds to an existing file instead of overwriting it.
csv_logger = CSVLogger(f'{base_dir}/training_log.csv', append=True)

In [18]:
# Pull the first batch out of the training generator for a quick sanity check.
X_batch, y_batch = training_generator[0]

In [19]:
# The generator may return the inputs either as a bare array or as a
# two-element list [X, y_local]; handle both layouts.
X = X_batch
if isinstance(X_batch, list):
    X, y_local = X_batch

# Input tensor shape; (5000, 13, 21, 20) in the recorded run of this notebook.
print("X shape:", X.shape)
if isinstance(X_batch, list):
    print("y_local shape:", y_local.shape)

# Label shape; (5000, 4) in the recorded run of this notebook.
print("y_batch shape:", y_batch.shape)

X shape: (5000, 13, 21, 20)
y_batch shape: (5000, 4)


In [None]:
# Train for at most 1000 epochs on the training generator, validating on the
# validation generator after each epoch. The callbacks handle early stopping
# (es), checkpoint writing (mcp) and per-epoch CSV logging (csv_logger).
model.fit(
    x=training_generator,
    validation_data=validation_generator,
    callbacks=[es, mcp, csv_logger],
    epochs=1000,
    shuffle=False,
    verbose=1
)

Epoch 1/1000


2024-11-26 08:24:39.436485: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2024-11-26 08:24:39.597802: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-11-26 08:24:40.013778: I tensorflow/core/util/cuda_solvers.cc:179] Creating GpuSolver handles for stream 0x564edfd268b0
2024-11-26 08:24:41.743622: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f8133b52d70 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-11-26 08:24:41.743678: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB MIG 7g.40gb, Compute Capability 8.0
2024-11-26 08:24:41.755380: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1732605881.901755 1634181 device_compiler.h:186] Compile

Epoch 1: val_loss improved from inf to -3928.79443, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.01-t12709.70-v-3928.79.hdf5
Epoch 2/1000
Epoch 2: val_loss improved from -3928.79443 to -9787.60645, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.02-t-7741.33-v-9787.61.hdf5
Epoch 3/1000
Epoch 3: val_loss improved from -9787.60645 to -13747.80957, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.03-t-11901.41-v-13747.81.hdf5
Epoch 4/1000
Epoch 4: val_loss improved from -13747.80957 to -14217.14941, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.04-t-13935.37-v-14217.15.hdf5
Epoch 5/1000
Epoch 5: val_loss improved from -14217.14941 to -15821.27734, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.05-t-14149.19-v-15821.28.hdf5
Epoch 6/1000
Epoch 6: val_loss improved from -15821.27734 to -16344.02246, saving model to ./trained_models/type6/model-c8fe3c7d-checkp

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 293: val_loss did not improve from -29079.68555
Epoch 294/1000
Epoch 294: val_loss did not improve from -29079.68555
Epoch 295/1000
Epoch 295: val_loss did not improve from -29079.68555
Epoch 296/1000
Epoch 296: val_loss did not improve from -29079.68555
Epoch 297/1000
Epoch 297: val_loss improved from -29079.68555 to -29365.89648, saving model to ./trained_models/type6/model-c8fe3c7d-checkpoints/weights.297-t-27534.54-v-29365.90.hdf5
Epoch 298/1000
Epoch 298: val_loss did not improve from -29365.89648
Epoch 299/1000

In [None]:
# # clean up tf records
# utils.safe_remove_directory(tfrecords_dir_train)
# utils.safe_remove_directory(tfrecords_dir_validation)