In [1]:
import tensorflow as tf

2024-05-14 07:58:49.693673: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 07:58:49.693830: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 07:58:49.896182: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import random

plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")

In [3]:
labels = pd.read_csv("../input/g2net-gravitational-wave-detection/training_labels.csv")


display(labels.head())

Unnamed: 0,id,target
0,00000e74ad,1
1,00001f4945,0
2,0000661522,0
3,00007a006a,0
4,0000a38978,1


In [4]:
pip install optree

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install nnAudio

Collecting nnAudio
  Downloading nnAudio-0.3.3-py3-none-any.whl.metadata (771 bytes)
Downloading nnAudio-0.3.3-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nnAudio
Successfully installed nnAudio-0.3.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from keras.optimizers import RMSprop,Adam
import torch # For deep learning
from nnAudio.Spectrogram import CQT1992v2 # For creating Constant-Q Transform spectrograms
import math
from random import shuffle

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Clase que genera los datos, los randomiza y preprocesa (aplica Q-transform, etc.)
class DataGenerator(Sequence):
    def __init__(self, path, list_IDs, data, batch_size):
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))
        
    def __len__(self):
        len_ = int(len(self.list_IDs)/self.batch_size)
        if len_*self.batch_size < len(self.list_IDs):
            len_ += 1
        return len_
    
    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y
    
    def __data_generation(self, list_IDs_temp):
        X = np.zeros((self.batch_size, 69, 193))
        y = np.zeros((self.batch_size, 1))
        for i, ID in enumerate(list_IDs_temp):
            id_ = self.data.loc[ID, 'id']
            file = id_+'.npy'
            path_in = '/'.join([self.path, id_[0], id_[1], id_[2]])+'/'
            data_array = np.load(path_in+file)
            #np.hstack() apiles arrays in a horizontal way
            waves = np.hstack(data_array)
            #normalize
            waves = waves / np.max(waves)
            #we do a pytorch tensor and convert it into float
            waves = torch.from_numpy(waves).float()
            transform=CQT1992v2(sr=2048, fmin=20, fmax=1024, hop_length=64, verbose = False)
            #Q-transform
            image = transform(waves)
            #We turn it back into a numpy array
            image = np.array(image)
            #transpose axes into (1,2,0)
            image = np.transpose(image,(1,2,0))
            # this creates an array(69, 193, 1) (an image in which the model can identify a signal)
            # and then we create a pack of images corresponding to the batch size
            # so the input_shape of the CNN must be (69, 193).
            X[i, ] = image[:,:,0]
            y[i, ] = self.data.loc[ID, 'target']
        X = np.stack(X)
        y = np.stack(y)
        return X, y

Bayesian Optimization starts here

In [8]:
import pandas as pd
import random
from itertools import cycle


#the train dataset is split in 16 files (0,1,2,...,e,f) and we will use all of this files separately
directorio = cycle(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'])

In [9]:
#this is the Bayesian Optimization library
!pip install scikit-optimize



In [10]:
import skopt
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

# Definimos las dimensiones de búsqueda. Van a ser un parámetro de entrada imprescindible para gp_minimize
# Defining the dimensions we are studying, it is essential for gp_minimize

#we will study the logarithm of the number of neurons (base 2) and the number of dense layers
dim_log_width = Integer(low=4, high=8, name='log_width')
dim_layers = Integer(low=1, high=5, name='layers')
dimensions = [dim_log_width, dim_layers]

# The reason why I use numbers in between 2^4 and 2^8 in log_width is
# that the model becomes unstable for higher widths, Tomas and I suspect it is 
# because each call only uses a small fragment of the dataset. Maybe if we used
# all the dataset for every call, we would obtain better results, but I don't 
# have enough computational resources for that (it is too expensive).

In [11]:
#These vectors just keep track of the width, layers studied in every step with its accuracy. 
#I have used these values for a plot that shows how gp_minimize explores the hyperparameter space.
log_width_history = []
layers_history = []
acc_history = []


def train(params):
    #every call will use the next file
    directori = next(directorio)
    #This selects the file of the dataset
    train_idx = labels[labels['id'].str.startswith(directori)]['id'].values
    y = labels[labels['id'].isin(train_idx)]['target'].values
    #params is a vector with the dimensions (log_width, layers)
    print(params)
    log_width, layers = params
    log_width_history.append(log_width)
    layers_history.append(layers)

    #Generating the training and validation data
    train_idx, train_Valx = train_test_split(list(labels[labels['id'].str.startswith(directori)].index), test_size=0.2, random_state=2021)
    train_generator = DataGenerator('/kaggle/input/g2net-gravitational-wave-detection/train/', train_idx, labels[labels['id'].str.startswith(directori)], 256)
    val_generator = DataGenerator('/kaggle/input/g2net-gravitational-wave-detection/train/', train_Valx, labels[labels['id'].str.startswith(directori)], 256)
    
    # Building the model
    model = Sequential()
    model.add(Conv1D(2**log_width, input_shape=(69, 193,), kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Flatten())
    for i in range(layers):
        model.add(Dense(2**log_width, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=2e-4), loss='binary_crossentropy', metrics=['accuracy'])

    # Training the model
    history = model.fit(train_generator, validation_data=val_generator, epochs=1)

    # We evaluate the model's performance 
    val_loss, val_accuracy = model.evaluate(val_generator)
    acc_history.append(val_accuracy)
    # It returns the metrics that we want to optimize with gp_minimize
    return -val_accuracy

In [14]:
#Choosing our prior (the first set of hyperparameters studied)
default_parameters = [5, 1]

In [15]:
# EI (expected improcvement) ACQ function is the safest in this context. However, PI (probability of improvement) 
# can also be studied. It is a bit more agressive, but might work correctly as well.
# There are more options for ACQ functions but I am not sure if they will work
# Here we choose the number of times we want gp_minimize to call the training function.
# Every call it will study a set of hyperparameters, and it will not use an already used
# set of training data. 
search_result = gp_minimize(
    func=train,
    dimensions=dimensions,
    acq_func='EI',
    n_calls=16,
    x0=default_parameters)

[8, 1]
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1135s[0m 10s/step - accuracy: 0.5147 - loss: 0.8649 - val_accuracy: 0.5096 - val_loss: 0.6903
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 5s/step - accuracy: 0.5093 - loss: 0.6899
[8, 2]
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1137s[0m 10s/step - accuracy: 0.5259 - loss: 0.7552 - val_accuracy: 0.6574 - val_loss: 0.6809
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 5s/step - accuracy: 0.6511 - loss: 0.6803
[9, 4]
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1171s[0m 10s/step - accuracy: 0.5382 - loss: 0.7081 - val_accuracy: 0.7063 - val_loss: 0.6684
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 5s/step - accuracy: 0.7019 - loss: 0.6685
[8, 1]
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1110s[0m 10s/step - accuracy: 0.5519 - loss: 0.7658 - val_accuracy: 0.5218 - val_loss: 0.6774
[1m28/28[0m [32m━━━━━━━━━━━━━

KeyboardInterrupt: 

In [None]:
best_width = 2**(search_result.x[0])  # the best value of width obtained
best_layers = search_result.x[1]  # the best value of layers obtained
print(search_result.x)
for i in range(len(acc_history)):
    print(log_width_history[i], layers_history[i], acc_history[i])

Active Learning ends here, and a normal training will be done with the best hyperparameters found


In [None]:
sample_submission = pd.read_csv('../input/g2net-gravitational-wave-detection/sample_submission.csv')
train_idx =  labels['id'].values
y = labels['target'].values
test_idx = sample_submission['id'].values

In [None]:
train_idx, train_Valx = train_test_split(list(labels.index), test_size=0.05, random_state=2021)
test_idx = list(sample_submission.index)

In [None]:
train_generator = DataGenerator('/kaggle/input/g2net-gravitational-wave-detection/train/', train_idx, labels, 256)
val_generator = DataGenerator('/kaggle/input/g2net-gravitational-wave-detection/train/', train_Valx, labels, 256)
test_generator = DataGenerator('/kaggle/input/g2net-gravitational-wave-detection/test/', test_idx, sample_submission, 256)

In [None]:
model = Sequential()
model.add(Conv1D(best_width, input_shape=(69, 193,), kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Flatten())
for i in range(best_layers):
    model.add(Dense(best_width, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate=2e-4), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(train_generator, validation_data=val_generator, epochs = 1)

In [None]:
predict = model.predict(test_generator, verbose=1)

In [None]:
sample_submission['target'] = predict[:len(sample_submission)]

In [None]:
sample_submission.to_csv('submission.csv', index=False)

In [None]:
my_submission = pd.read_csv("./submission.csv")
my_submission

In [None]:
import os
print(os.listdir("."))
print(os.listdir("/kaggle/working"))