In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

# Function to load and preprocess a single CSV file
def _to_fraction(values):
    """Scale one slice's counts so they sum to 1; all zeros if the sum is 0."""
    total = values.sum()
    if total == 0:
        # Avoid 0/0 -> NaN (or x/0 -> inf) for slices without any counts.
        return values.astype(float) * 0.0
    return values / total


def _min_max_scale(values):
    """Min-max scale one slice's bin edges to [0, 1]; zeros if the range is 0."""
    low = values.min()
    span = values.max() - low
    if span == 0:
        # A slice with a single bin (or identical edges) has zero range;
        # dividing would yield NaN edges, which later become NaN pivot columns.
        return (values - low).astype(float)
    return (values - low) / span


def load_and_preprocess_csv(file_path):
    """Load one histogram CSV and return it as a slice-by-bin table.

    The CSV must provide the columns ``slice_index``, ``bin_start``,
    ``bin_end`` and ``count``.

    Args:
        file_path: Anything accepted by ``pandas.read_csv`` (path or buffer).

    Returns:
        A ``pandas.DataFrame`` indexed by ``slice_index`` whose columns are the
        per-slice normalized ``bin_start`` values and whose cells hold the
        per-slice normalized counts. Missing (slice, bin) cells are 0.
    """
    data = pd.read_csv(file_path)
    by_slice = data.groupby('slice_index')

    # Normalize counts for each slice_index
    data['count'] = by_slice['count'].transform(_to_fraction)

    # Normalize bin edges for each slice_index.
    # bin_end is normalized like the original code did, although the pivot
    # below only consumes bin_start.
    data['bin_start'] = by_slice['bin_start'].transform(_min_max_scale)
    data['bin_end'] = by_slice['bin_end'].transform(_min_max_scale)

    # Pivot the data to create a 2D array
    pivot_data = data.pivot(index='slice_index', columns='bin_start', values='count').fillna(0)
    return pivot_data

# Function to pad arrays to the same shape
def pad_array(array, max_shape):
    """Zero-pad ``array`` at the end of every axis up to ``max_shape``.

    Args:
        array: Array to pad.
        max_shape: Target size per axis; indexed once for each axis of
            ``array``.

    Returns:
        A new array in which axis ``i`` has length ``max_shape[i]``; the
        original data sits at the start of each axis and the remainder is 0.
    """
    # Nothing is added in front of the data; only the trailing side grows.
    trailing_padding = [
        (0, max_shape[axis] - size) for axis, size in enumerate(array.shape)
    ]
    return np.pad(array, trailing_padding, mode='constant', constant_values=0)

# Function to load data from a directory
def load_data_from_directory(directory):
    """Load all positive/negative CSV pairs of a directory as padded arrays.

    Files whose names start with ``positive`` and ``negative`` are each sorted
    by name and paired in that order (extra unpaired files are ignored, as
    ``zip`` stops at the shorter list). Each pair is preprocessed with
    ``load_and_preprocess_csv``, joined along the column axis and zero-padded
    to the largest (rows, cols) shape found in this directory.

    Args:
        directory: Path of the directory to scan. Every sample from it is
            labelled 1 if the path contains ``'Glitch_volumes'``, else 0.

    Returns:
        ``(signals, labels)`` as NumPy arrays: ``signals`` stacks the padded
        2D arrays (one per file pair) and ``labels`` holds one label per pair.
    """
    # List the directory once; filtering a sorted list keeps it sorted.
    file_names = sorted(os.listdir(directory))
    positive_files = [f for f in file_names if f.startswith('positive')]
    negative_files = [f for f in file_names if f.startswith('negative')]

    # The label depends only on the directory, so compute it once.
    label = 1 if 'Glitch_volumes' in directory else 0

    # Read and preprocess every pair exactly once; the maximum shape is then
    # derived from the cached arrays instead of re-reading all CSVs.
    combined_arrays = []
    for pos_file, neg_file in zip(positive_files, negative_files):
        pos_data = load_and_preprocess_csv(os.path.join(directory, pos_file))
        neg_data = load_and_preprocess_csv(os.path.join(directory, neg_file))
        combined_arrays.append(np.concatenate((pos_data, neg_data), axis=1))

    max_shape = (
        max((arr.shape[0] for arr in combined_arrays), default=0),
        max((arr.shape[1] for arr in combined_arrays), default=0),
    )

    # Pad arrays to the maximum shape so they stack into one 3D array.
    signals = [pad_array(arr, max_shape) for arr in combined_arrays]
    labels = [label] * len(signals)

    return np.array(signals), np.array(labels)

# Load your data
# NOTE: these are machine-specific absolute paths; adjust them to your setup.
merger_directory = '/home/arutkeerthi/Downloads/Glitchveto/Merger_volumes/1200MPC-2-Noisy'
glitch_directory = '/home/arutkeerthi/Downloads/Glitchveto/Glitch_volumes'

X_merger, y_merger = load_data_from_directory(merger_directory)
X_glitch, y_glitch = load_data_from_directory(glitch_directory)

# Each directory is padded only to its own maximum shape, so the two sets can
# disagree on (rows, cols). Pad both to a common shape before stacking them;
# this is skipped (no copy) when the shapes already match.
if X_merger.ndim == 3 and X_glitch.ndim == 3 and X_merger.shape[1:] != X_glitch.shape[1:]:
    common_shape = tuple(
        int(max(a, b)) for a, b in zip(X_merger.shape[1:], X_glitch.shape[1:])
    )
    X_merger = pad_array(X_merger, X_merger.shape[:1] + common_shape)
    X_glitch = pad_array(X_glitch, X_glitch.shape[:1] + common_shape)

# Combine merger and glitch data
X = np.concatenate((X_merger, X_glitch), axis=0)
y = np.concatenate((y_merger, y_glitch), axis=0)

# Split data into training and testing sets.
# stratify=y keeps the merger/glitch ratio the same in both splits, so the
# minority class cannot vanish from the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert labels to categorical.
# num_classes is fixed to 2 so the one-hot width always matches the 2-unit
# softmax output, even if a split happens to contain only label 0.
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

# Define the CNN model
# Conv1D treats axis 1 of each sample as the sequence and axis 2 as channels.
input_shape = (X_train.shape[1], X_train.shape[2])
inputs = Input(shape=input_shape)

x = Conv1D(filters=32, kernel_size=3, activation='relu')(inputs)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.25)(x)

x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.25)(x)

x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)

outputs = Dense(2, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE: the test split doubles as validation data here, so the "test accuracy"
# reported below is the same quantity as the final validation accuracy.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

# Print training and validation accuracy per epoch
for epoch, acc in enumerate(history.history['accuracy'], 1):
    print(f'Epoch {epoch}: Training Accuracy = {acc}')

for epoch, val_acc in enumerate(history.history['val_accuracy'], 1):
    print(f'Epoch {epoch}: Validation Accuracy = {val_acc}')



2024-08-22 19:13:18.467200: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-22 19:13:18.647173: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-22 19:13:18.648852: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-22 19:14:39.454140: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-08-22 19:14:39.454639: W tensorflow/core/common_runtime/gpu/gpu_device.

Epoch 1/20


2024-08-22 19:14:40.367976: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 406155904 exceeds 10% of free system memory.




2024-08-22 19:14:42.125156: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 102198320 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy: 1.0
Epoch 1: Training Accuracy = 0.8977272510528564
Epoch 2: Training Accuracy = 0.9090909361839294
Epoch 3: Training Accuracy = 0.9090909361839294
Epoch 4: Training Accuracy = 0.9642857313156128
Epoch 5: Training Accuracy = 1.0
Epoch 6: Training Accuracy = 1.0
Epoch 7: Training Accuracy = 1.0
Epoch 8: Training Accuracy = 1.0
Epoch 9: Training Accuracy = 1.0
Epoch 10: Training Accuracy = 1.0
Epoch 11: Training Accuracy = 1.0
Epoch 12: Training Accuracy = 1.0
Epoch 13: Training Accuracy = 1.0
Epoch 14: Training Accuracy = 1.0
Epoch 15: Training Accuracy = 1.0
Epoch 16: Training Accuracy = 1.0
Epoch 17: Training Accuracy = 1.0
Epoch 18: Training Accuracy = 1.0
Epoch 19: Training Accuracy = 1.0
Epoch 20: Training Accuracy = 1.0
Epoch 1: Validation Accuracy =

2024-08-22 19:14:55.975407: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 102198320 exceeds 10% of free system memory.
