In [None]:
!pip install gwpy
import gwpy
from gwpy.timeseries import TimeSeries

In [None]:
!pip install librosa

In [None]:
! pip install kaggle

In [5]:
from google.colab import files

In [None]:
files.upload()

In [7]:
! mkdir ~/.kaggle

In [8]:
! cp kaggle.json ~/.kaggle/

In [9]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c g2net-gravitational-wave-detection

In [11]:
!mkdir tfm_g2n

In [None]:
!unzip g2net-gravitational-wave-detection.zip -d tfm_g2n

In [13]:
from google.colab import drive

# Model

## Libraries

In [81]:
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# NOTE(review): train_test_split and CQT1992v2 are used further down but are not
# imported in any visible cell — add their imports (and install step) here.

## Setup variables

In [None]:
# Load the training labels extracted under tfm_g2n and preview the first rows.
# The "id" and "target" columns of this table are used by the cells below.
train_labels = pd.read_csv("/content/tfm_g2n/training_labels.csv")
train_labels.head()

In [None]:
training_paths = glob("D:/Projects/G2Net-Gravitational-Wave-Detection/data/train/*/*/*/*")
print("The total number of files in the training set:", len(training_paths))

In [None]:
ids = [path.split("\\")[-1].split(".")[0] for path in training_paths]
paths_df = pd.DataFrame({"path":training_paths, "id": ids})
train_data = pd.merge(left=training_labels, right=paths_df, on="id")
train_data.head()

In [None]:
# Inputs are the sample ids (file paths are derived from them when the datasets
# are built); targets are the "target" column cast to int8 as a NumPy array.
X = train_data['id']
y = train_data['target'].astype('int8').values

In [None]:
# Stratified split on the labels with a fixed seed; no test_size is passed,
# so the splitter's default proportion applies.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, random_state = 42, stratify = y)

In [135]:
# Assign the test IDs: they are taken from the "id" column of the sample submission file
sub = pd.read_csv("/content/tfm_g2n/sample_submission.csv")
# Double brackets keep a one-column DataFrame; the test pipeline later selects x_test["id"]
x_test = sub[['id']]

In [None]:
# Batch size applied by .batch() in the dataset pipelines below
# (the same value is assigned again in the signal-parameter cell).
batch_size = 250

In [None]:
# Shape checked by tf.ensure_shape in tf_parse_function and passed to the first Conv2D layer
input_shape = (56,193,1)

#### Define the dataset object

In [None]:
# NOTE: get_npy_filepath and tf_parse_function are defined in the "Functions"
# section further down; those cells must be run before this one.
# Get the data filepaths (derived from the sample ids) and labels as tensor slices
train_dataset = tf.data.Dataset.from_tensor_slices((x_train.apply(get_npy_filepath).values, y_train))
# Shuffle with a buffer as large as the training set
train_dataset = train_dataset.shuffle(len(x_train))
# Load and transform each file with tf_parse_function, the parsing function defined in this notebook
train_dataset = train_dataset.map(tf_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
# Set the batch size of the dataset
train_dataset = train_dataset.batch(batch_size)
# Prefetch the data
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# NOTE: get_npy_filepath and tf_parse_function are defined in the "Functions"
# section further down; those cells must be run before this one.
# Get the data filepaths (derived from the sample ids) and labels as tensor slices
valid_dataset = tf.data.Dataset.from_tensor_slices((x_valid.apply(get_npy_filepath).values, y_valid))
# Load and transform each file with tf_parse_function (no shuffling for validation)
valid_dataset = valid_dataset.map(tf_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
# Set the batch size of the dataset
valid_dataset = valid_dataset.batch(batch_size)
# Prefetch the data
valid_dataset = valid_dataset.prefetch(tf.data.AUTOTUNE)

In [149]:
# NOTE: get_npy_filepath and tf_parse_function are defined in the "Functions"
# section further down; those cells must be run before this one.
# Get the test filepaths as tensor slices (is_train=False selects the test folder; no labels)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test["id"].apply(get_npy_filepath, is_train=False).values))
# Load and transform each file with tf_parse_function; with no label it returns the input only
test_dataset = test_dataset.map(tf_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
# Set the batch size of the dataset
test_dataset = test_dataset.batch(batch_size)
# Prefetch the data
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
# CQT: Constant-Q transform applied to every sample. parse_function looks this
# object up under the name `cq_transform`, so it is bound to that name here.
cq_transform = CQT1992v2(sr=2048,        # sample rate
                fmin=20,        # min freq
                fmax=500,      # max freq
                hop_length=64,  # hop length
                verbose=False)
# Keep the earlier name as an alias for any cell that still refers to `transform`.
transform = cq_transform

In [131]:
# Signal parameters.
# NOTE(review): none of sample_rate / signal_length / fmin / fmax / hop_length is read by
# the visible cells, and fmax here (1024) differs from the fmax=500 passed to CQT1992v2 —
# confirm which value is intended.
sample_rate = 2048 # data is provided at 2048 Hz
signal_length = 2 # each signal lasts 2 s
fmin, fmax = 20, 1024 # filter above 20 Hz, and max 1024 Hz (Nyquist freq = sample_rate/2)
hop_length = 64 # hop length parameter (the CQT above is built with the same value)

# Training parameters
batch_size = 250 # number of samples per batch in the tf.data pipelines
epochs = 3 # number of epochs (kept low because the dataset is large; 3~5 was observed to be enough)

### Functions

In [None]:
# Load one sample file, normalise it and return its Constant Q-transform.
# Runs as plain Python inside tf.py_function (see tf_parse_function).
def parse_function(id_path):
    """Load a sample's .npy file and return its Constant-Q transform as a TF tensor.

    Args:
        id_path: eager string tensor holding the path of the ``.npy`` file
            (``.numpy()`` is called on it).

    Returns:
        ``tf.Tensor`` holding the output of ``cq_transform`` with its axes
        reordered to (1, 2, 0); tf_parse_function checks it against ``input_shape``.
    """
    # .numpy() on a string tensor yields bytes; decode to get a regular str path
    path = id_path.numpy()
    if isinstance(path, bytes):
        path = path.decode("utf-8")

    # load the npy file
    signals = np.load(path)

    # normalise each row by its own maximum
    # NOTE(review): this divides by the max, not the max absolute value; kept as-is
    # so the inputs match what the model was trained on.
    for i in range(signals.shape[0]):
        peak = np.max(signals[i])
        # a zero maximum would turn the whole row into inf/NaN; leave such rows unscaled
        if peak != 0:
            signals[i] /= peak

    # stack the rows into a single 1-D vector
    flat_signal = np.hstack(signals)

    # convert to a float32 torch tensor, the input type passed to the CQT
    wave = torch.from_numpy(flat_signal).float()

    # get the CQT
    image = cq_transform(wave)

    # convert the torch tensor to a NumPy array (detach/cpu make this safe even if
    # the transform tracks gradients or runs on another device)
    image = image.detach().cpu().numpy()

    # move the leading axis to the end to get the orientation the model expects
    image = np.transpose(image, (1, 2, 0))

    # convert the image to a tf.Tensor and return
    return tf.convert_to_tensor(image)

In [None]:
# TensorFlow-side wrapper meant for Dataset.map: hands the file path over to the
# Python-level parse_function and pins the static shape of the result.
def tf_parse_function(id_path, y=None):
    """Return the parsed input for ``id_path``, paired with ``y`` when one is given.

    Args:
        id_path: path tensor forwarded to ``parse_function`` through ``tf.py_function``.
        y: optional label; passed through untouched.

    Returns:
        ``(x, y)`` when ``y`` is provided (train/valid), otherwise just ``x`` (test).
    """
    # py_function returns one tensor per entry in Tout; a single float32 output is requested
    outputs = tf.py_function(func=parse_function, inp=[id_path], Tout=[tf.float32])
    (spectrogram,) = outputs

    # py_function loses static shape information, so assert/restore it here
    spectrogram = tf.ensure_shape(spectrogram, input_shape)

    # labelled samples keep their label; unlabelled ones return the input alone
    if y is not None:
        return spectrogram, y
    return spectrogram

In [None]:
def get_npy_filepath(id_, is_train=True, data_dir='/content/tfm_g2n'):
    """Build the path of the .npy file that belongs to a sample id.

    Files are laid out as ``<data_dir>/<split>/<c0>/<c1>/<c2>/<id>.npy`` where
    ``c0..c2`` are the first three characters of the id.

    Args:
        id_: sample id (string with at least three characters).
        is_train: True for the ``train`` folder, False for the ``test`` folder.
        data_dir: root folder of the extracted data; defaults to the location
            used throughout this notebook.

    Returns:
        The file path as a string.
    """
    split = 'train' if is_train else 'test'
    return f'{data_dir}/{split}/{id_[0]}/{id_[1]}/{id_[2]}/{id_}.npy'

In [124]:
# Function to save kaggle submissions for test prediction probabilities
def get_kaggle_format(prediction_probs, model='base',
                      sample_submission_path="/content/tfm_g2n/sample_submission.csv"):
    """Write a Kaggle submission CSV from predicted probabilities.

    Args:
        prediction_probs: one probability per row of the sample submission, in the
            same order. A column vector such as the ``(N, 1)`` array returned by
            ``model.predict`` is flattened before being stored.
        model: tag inserted into the output file name.
        sample_submission_path: CSV used as the template; defaults to the file
            extracted under tfm_g2n.

    Side effects:
        Writes ``kaggle_sub_<model>.csv`` to the current working directory and
        prints its name.
    """
    # load the sample submission file to reuse its ids and column layout
    sub = pd.read_csv(sample_submission_path)
    # flatten so a single 1-D column is stored regardless of the input's shape
    sub['target'] = np.asarray(prediction_probs).ravel()

    # Output filename for kaggle submission
    filename = f"kaggle_sub_{model}.csv"

    # Save the DataFrame to a file
    sub.to_csv(filename, index=False)
    print(f'File name: {filename}')

# Modelo basado en https://github.com/PraveenThakkannavar/G2Net-Gravitational-Wave-Detection/blob/main/SIMPLE_CNN.ipynb y en https://github.com/SiddharthPatel45/gravitational-wave-detection/blob/main/code/gw-detection-modelling.ipynb

In [109]:
# Simple CNN: three Conv2D + MaxPooling2D stages, then two Dense layers and a
# sigmoid output for the binary target.
# Instantiate the Sequential model
model_cnn = Sequential(name='CNN_model')

# First Conv2D layer (the only one that needs input_shape) followed by MaxPooling2D
model_cnn.add(Conv2D(filters=16,
                     kernel_size=3,
                     input_shape=input_shape,
                     activation='relu',
                     name='Conv_01'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_01'))

# Second pair of Conv2D and MaxPooling2D layers (input shape is inferred from the previous layer)
model_cnn.add(Conv2D(filters=32,
                     kernel_size=3,
                     activation='relu',
                     name='Conv_02'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_02'))

# Third pair of Conv2D and MaxPooling2D layers
model_cnn.add(Conv2D(filters=64,
                     kernel_size=3,
                     activation='relu',
                     name='Conv_03'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_03'))

# Flatten the feature maps into a vector for the Dense layers
model_cnn.add(Flatten(name='Flatten'))

# Add the Dense layers
model_cnn.add(Dense(units=512,
                activation='relu',
                name='Dense_01'))
model_cnn.add(Dense(units=64,
                activation='relu',
                name='Dense_02'))

# Final output layer: a single sigmoid unit giving the predicted probability
model_cnn.add(Dense(1, activation='sigmoid', name='Output'))

In [110]:
# Print the layer-by-layer output shapes and parameter counts of the CNN
model_cnn.summary()

Model: "CNN_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Conv_01 (Conv2D)            (None, 54, 191, 16)       160       
                                                                 
 Pool_01 (MaxPooling2D)      (None, 27, 95, 16)        0         
                                                                 
 Conv_02 (Conv2D)            (None, 25, 93, 32)        4640      
                                                                 
 Pool_02 (MaxPooling2D)      (None, 12, 46, 32)        0         
                                                                 
 Conv_03 (Conv2D)            (None, 10, 44, 64)        18496     
                                                                 
 Pool_03 (MaxPooling2D)      (None, 5, 22, 64)         0         
                                                                 
 Flatten (Flatten)           (None, 7040)              0 

In [111]:
# Binary classification set-up: Adam with a small learning rate, binary cross-entropy
# loss, and AUC plus accuracy as metrics.
# The metrics are passed as a flat list; a list of lists is the per-output form
# meant for multi-output models, and this model has a single output.
model_cnn.compile(optimizer=Adam(learning_rate=0.0001),
                  loss='binary_crossentropy',
                  metrics=[AUC(), 'accuracy'])

In [121]:
# Fit the data.
# train_dataset / valid_dataset are tf.data pipelines that are already batched with
# .batch(batch_size), so batch_size must not be passed to fit() as well.
# The number of epochs comes from the `epochs` constant set in the parameter cell.
history_cnn = model_cnn.fit(x=train_dataset,
                            epochs=epochs,
                            validation_data=valid_dataset,
                            verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [138]:
# Continue training on the validation split so the final model has also seen that data.
# valid_dataset is already batched, so batch_size is not passed to fit().
model_cnn.fit(x=valid_dataset, epochs=epochs, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f669692b950>

In [150]:
# Predict the test dataset using the CNN (test_dataset yields inputs only, no labels)
preds_cnn = model_cnn.predict(test_dataset)

In [151]:
# Save the Kaggle submission file; get_kaggle_format writes kaggle_sub_cnn.csv and prints its name
get_kaggle_format(preds_cnn, model='cnn')

File name: kaggle_sub_cnn.csv


In [152]:
# Save the trained model to a single .h5 file inside the data folder
model_cnn.save('/content/tfm_g2n/model_CNN.h5')

In [None]:
# Load the predictions into a dataframe.
# get_kaggle_format writes the file relative to the working directory (it printed
# "kaggle_sub_cnn.csv"), not into /content/tfm_g2n, so read it from there.
df_preds_cnn = pd.read_csv('kaggle_sub_cnn.csv')
df_preds_cnn.head()

In [None]:
# Number of rows and columns in the submission
df_preds_cnn.shape

In [None]:
# Count the predictions at the extremes: probability >= 0.9 or <= 0.1
df_preds_cnn[(df_preds_cnn['target'] >= 0.9) | (df_preds_cnn['target'] <= 0.1)]['target'].count()

In [None]:
df_preds_cnn[(df_preds_cnn['target'] >= 0.8) | (df_preds_cnn['target'] <= 0.2)]['target'].count(