In [None]:
# Install gwpy into the runtime, then import the package and its TimeSeries class
# (gwpy's time-domain data array).
!pip install gwpy
import gwpy
from gwpy.timeseries import TimeSeries

In [None]:
# Install librosa; it is imported further down together with librosa.display.
!pip install librosa

In [None]:
# Install the Kaggle command-line client used below to download the competition data.
! pip install kaggle

In [5]:
from google.colab import files

In [None]:
# Open Colab's file-upload dialog. The following cells copy a file named kaggle.json
# from the working directory, so that is the file expected to be uploaded here.
files.upload()

In [7]:
! mkdir ~/.kaggle

In [8]:
# Copy the uploaded credentials file into the Kaggle config directory.
! cp kaggle.json ~/.kaggle/

In [9]:
# Restrict the credentials file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the g2net-gravitational-wave-detection competition archive into the working directory.
!kaggle competitions download -c g2net-gravitational-wave-detection

In [11]:
!mkdir tfm_g2n

In [None]:
!unzip g2net-gravitational-wave-detection.zip -d tfm_g2n

In [13]:
from google.colab import drive

In [14]:
from gwpy.timeseries import TimeSeries # time domain data array in gwpy
from gwpy.plot import Plot # plotting in gwpy
# Window functions live in scipy.signal.windows; the old `scipy.signal.hann`
# alias is no longer available in recent SciPy releases.
from scipy.signal.windows import hann
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import interp1d  # interpolating a 1-D function
import matplotlib.mlab as mlab  # some MATLAB commands
from glob import glob     # pathname management

import librosa
import librosa.display

In [15]:
# Load the label table from the extracted archive and preview the first rows.
train_labels = pd.read_csv("/content/tfm_g2n/training_labels.csv")
train_labels.head()

Unnamed: 0,id,target
0,00000e74ad,1
1,00001f4945,0
2,0000661522,0
3,00007a006a,0
4,0000a38978,1


In [16]:
# Collect every entry four directory levels below train/ (the pattern is train/*/*/*/*).
training_paths = glob("/content/tfm_g2n/train/*/*/*/*")
print("The total number of files in the training set:", len(training_paths))

The total number of files in the training set: 560000


In [17]:
# The sample id is the file name up to its first "." (e.g. ".../00000e74ad.npy" -> "00000e74ad").
ids = [p.rsplit("/", 1)[-1].partition(".")[0] for p in training_paths]

# One row per file on disk, then join it onto the label table by "id".
paths_df = pd.DataFrame({"path": training_paths, "id": ids})
train_data = train_labels.merge(paths_df, on="id")
train_data.head()

Unnamed: 0,id,target,path
0,00000e74ad,1,/content/tfm_g2n/train/0/0/0/00000e74ad.npy
1,00001f4945,0,/content/tfm_g2n/train/0/0/0/00001f4945.npy
2,0000661522,0,/content/tfm_g2n/train/0/0/0/0000661522.npy
3,00007a006a,0,/content/tfm_g2n/train/0/0/0/00007a006a.npy
4,0000a38978,1,/content/tfm_g2n/train/0/0/0/0000a38978.npy


# Model

## Libraries

In [23]:
# Import tensorflow and sklearn
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam
# To set learning rate
from tensorflow.keras.callbacks import LearningRateScheduler

## Setup variables

In [None]:
# Load a table from a path relative to the working directory and preview it.
# This rebinds train_data, replacing the merged label/path frame built earlier in the notebook.
# NOTE(review): the columns of data/data_path.csv are not visible here — presumably
# id/target/path; confirm before relying on it.
train_data = pd.read_csv("data/data_path.csv")
train_data.head()

In [None]:
# List the training files from the directory the archive was extracted to in this notebook.
# (A machine-specific absolute Windows path here matches nothing on the Colab runtime
# and would leave training_paths empty.)
training_paths = glob("/content/tfm_g2n/train/*/*/*/*")
print("The total number of files in the training set:", len(training_paths))

In [None]:
import os

# Derive the id from the file name in an OS-independent way
# (os.path.basename handles the platform's own separator), dropping the extension.
ids = [os.path.basename(path).split(".")[0] for path in training_paths]
paths_df = pd.DataFrame({"path":training_paths, "id": ids})
# The label table loaded at the top of the notebook is named train_labels.
train_data = pd.merge(left=train_labels, right=paths_df, on="id")
train_data.head()

# Modelo de https://github.com/Rtavakol/Kaggle_G2Net-Gravitational-Wave-Detection/blob/main/Gravitational_wave.ipynb

In [None]:
# Baseline network: a single Conv1D layer followed by a small dense head.
# NOTE(review): with input_shape=(3, 4096) Keras' Conv1D convolves along the axis of
# length 3 and treats 4096 as channels — confirm this is the intended layout.
model = Sequential([
    # Convolution: 128 filters, kernel of width 3
    Conv1D(128, kernel_size=3, activation='relu', input_shape=(3, 4096)),
    # Collapse the feature maps into a vector
    Flatten(),
    # Fully connected head
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(8, activation='relu'),
    # Binary classification -> one sigmoid unit
    Dense(1, activation='sigmoid'),
])

# Adam optimiser with binary cross-entropy, tracking accuracy
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Print the layer/parameter overview
model.summary()

In [None]:
# Train on a small in-memory subset first (N samples). The full training and test
# sets do not fit in memory, so a data generator is used for them later on.
N = 1000
train_x = np.zeros((N, 3, 4096))

# Take the file paths from train_data rather than from the raw glob() listing:
# glob() returns files in arbitrary order, whereas train_data rows follow the label
# table, so sample k here corresponds to label row k.
subset_paths = train_data["path"].iloc[:N]
for k, sample_path in enumerate(subset_paths):
    data = np.load(sample_path)
    # Standardise each of the 3 rows independently (zero mean, unit std along axis 1).
    mean = np.mean(data, axis=1, keepdims=True)
    std = np.std(data, axis=1, keepdims=True)
    train_x[k] = (data - mean) / std

In [None]:
# Shape check: train_x was allocated as (N, 3, 4096).
np.shape(train_x)

In [None]:
# Peek at the first two standardised samples.
train_x[:2]

In [None]:
# Labels for the first N rows of the label table (loaded above as train_labels),
# selected by column name instead of by position.
train_y = train_labels["target"].iloc[:N].values
print(len(train_y))
train_y[:10]

In [None]:
# Reshape to (-1, 3, 4096); train_x is already allocated as (N, 3, 4096),
# so this keeps the same shape under a new name.
train_x_reshaped = train_x.reshape(-1,3, 4096)
np.shape(train_x_reshaped)

In [None]:
# Turn the 1-D label vector into a column of shape (len(train_y), 1).
train_y_reshaped = train_y.reshape(-1, 1)
train_y_reshaped.shape

In [None]:
# Create Keras Callbacks for learning rate
my_callbacks_lr = [LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x, verbose=0)]

In [None]:
# Fitting CNN to training dataset
result = model.fit(x = train_x_reshaped,
              y = train_y_reshaped,
              epochs = 20,
              batch_size= 32, 
              verbose= 1, 
              callbacks= my_callbacks_lr,
              validation_split= 0.2,
              shuffle= True)

In [None]:
%matplotlib notebook
plt.plot(result.history['accuracy'], label = 'Accuracy')
plt.plot(result.history['val_accuracy'], label = 'Validation Accuracy')
plt.xlabel('epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.savefig('Accuracy.png', dpi = 500)

In [None]:
# Experiments on the small in-memory subset (N samples).
# First experiment: vary the number of Conv1D filters.
n = 4 # number of try
# NOTE: this rebinds `model` from the single baseline network to a list of networks.
model = [0] * n
# 64, 128, ... up to 64 * n filters
filter_number = [64*(i + 1) for i in range(n)]
for i, f in enumerate(filter_number):
    # Make a simple sequential model with one conv layers
    model[i] = Sequential()

    # step 1: 1st Convolution layer with f filters
    model[i].add(Conv1D(f, kernel_size = 3,activation='relu', input_shape=(3, 4096)))

    # step 2: Flattening
    model[i].add(Flatten())

    # step 3: Full connection 
    model[i].add(Dense(64, activation='relu'))
    # We have a binary classification, so the number of nodes would be 1
    model[i].add(Dense(1, activation='sigmoid'))

    # Compile the model
    model[i].compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Model summary
    model[i].summary()
    

In [None]:
# One training history per experiment model (n models, not N samples).
result = [0] * n
for i in range(n):
    # Fitting CNN to training dataset
    result[i] = model[i].fit(x = train_x_reshaped,
              y = train_y_reshaped,
              epochs = 20,
              batch_size= 32, 
              verbose= 1, 
              callbacks= my_callbacks_lr,
              validation_split= 0.2,
              shuffle= True)

In [None]:
%matplotlib notebook
for i in range(n):
    plt.plot(result[i].history['accuracy'], label = 'Model: {}, acc'.format(i))
    plt.plot(result[i].history['val_accuracy'], label = 'Model: {}, val_acc'.format(i))
plt.xlabel('epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.tight_layout()
plt.savefig('models_Accuracy.png', dpi = 500)

In [None]:
# We see overfitting for 128 and 256 filters.
# Let's go with 64 filters and now experiment with the dense layers.

In [None]:
# Conv1D (64 filters) -> BatchNorm -> Flatten -> Dense(64) -> sigmoid output.
model_keras_seq = Sequential()
model_keras_seq.add(Conv1D(64, input_shape=(3, 4096), kernel_size=3, activation='relu'))
model_keras_seq.add(BatchNormalization())
model_keras_seq.add(Flatten())
model_keras_seq.add(Dense(64, activation='relu'))
model_keras_seq.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the old `lr` alias is deprecated
# and rejected by newer Keras versions.
model_keras_seq.compile(optimizer= Adam(learning_rate=2e-4), loss='binary_crossentropy', metrics=['acc'])
model_keras_seq.summary()

In [None]:
# The full training set is too large to hold in memory, so the model is fed
# batch by batch from disk through a Keras Sequence instead of from one big array.
class data_generator(Sequence):
    """Keras ``Sequence`` that loads ``.npy`` samples from disk one batch at a time.

    Parameters
    ----------
    path : str
        Root directory. A sample whose id starts with characters ``a``, ``b``, ``c``
        is read from ``<path>/a/b/c/<id>.npy``.
    list_IDs : sequence
        Index labels into ``data`` (looked up with ``data.loc``) of the rows to serve.
    data : pandas.DataFrame
        Frame providing an ``'id'`` and a ``'target'`` column.
    batch_size : int
        Maximum number of samples per batch; the final batch may be shorter.
    """

    def __init__(self, path, list_IDs, data, batch_size):
        # Let the Keras base class initialise itself (newer Keras versions expect this).
        super().__init__()
        self.list_IDs = list_IDs
        self.data = data
        self.path = path
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches per epoch (ceiling of samples / batch_size)."""
        return -(-len(self.list_IDs) // self.batch_size)

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def __data_generation(self, list_IDs_temp):
        """Load and standardise the samples for the given index labels.

        Returns ``X`` of shape ``(len(list_IDs_temp), 3, 4096)`` and ``y`` of shape
        ``(len(list_IDs_temp), 1)``.
        """
        # Size the arrays by the actual number of samples: allocating batch_size rows
        # would pad the final, shorter batch with all-zero samples labelled 0.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, 3, 4096))
        y = np.zeros((n_samples, 1))
        for i, ID in enumerate(list_IDs_temp):
            id_ = self.data.loc[ID, 'id']
            file = id_ + '.npy'
            # Files are nested under the first three characters of the id.
            path_in = '/'.join([self.path, id_[0], id_[1], id_[2]]) + '/'
            data_array = np.load(path_in + file)
            # Standardise with the mean/std of the whole array.
            data_array = (data_array - data_array.mean())/data_array.std()
            X[i, ] = data_array
            y[i, ] = self.data.loc[ID, 'target']
        return X,y

In [None]:
# Load the submission template from the working directory and preview it.
# NOTE(review): this is a relative path; the archive in this notebook is extracted
# under /content/tfm_g2n — confirm the file is reachable from the working directory.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

In [None]:
# Test ids and row positions come from the submission template.
test_ids = sample_submission['id'].values
test_indices = list(sample_submission.index)

# Training ids and targets come from the label table (loaded above as train_labels),
# selected by column name rather than by position.
train_ids = train_labels['id'].values
train_y = train_labels['target'].values

In [None]:
# Split the label-table row indices 70 % / 30 % into training and validation sets
# (fixed random_state for a reproducible split).
train_indices, validation_indices = train_test_split(list(train_labels.index), test_size=0.3, random_state=1)

In [None]:
# Root directory handed to the data generators below as their `path` argument.
# NOTE(review): relative, machine-specific location; the data in this notebook is
# extracted under /content/tfm_g2n — confirm which one applies.
root_dir = '../Gravitational_Wave_data/train_extracted/'

In [None]:
# Training batches: training row indices looked up in the label table (train_labels).
train_generator = data_generator(root_dir, train_indices, train_labels, 64)

In [None]:
# Test batches: test_indices are row positions of sample_submission, so that frame
# (not the training label table) must supply the ids.
# NOTE(review): root_dir names a train directory; the test .npy files presumably live
# under a separate test root — point this at it before predicting. TODO confirm path.
test_generator  = data_generator(root_dir, test_indices, sample_submission, 64)

In [None]:
# Validation batches: validation_indices index the label table, so ids and targets
# must be read from train_labels (not from the submission template).
validation_generator = data_generator(root_dir, validation_indices, train_labels, 64)

In [None]:

# Train the generator-fed network (model_keras_seq). The name `model` was rebound to a
# list of experiment models earlier, so it cannot be fitted here.
# Model.fit / Model.predict accept Sequence objects directly; fit_generator and
# predict_generator are deprecated and absent from newer Keras versions.
history = model_keras_seq.fit(train_generator, validation_data=validation_generator, epochs=1)
test_prediction = model_keras_seq.predict(test_generator, verbose=1)

In [None]:
# Inspect the array of training ids.
train_ids

In [None]:
# Show one training file path. The list built by glob() in this notebook is named
# training_paths; `training_files` is never defined.
training_paths[1]

In [None]:
# Preview the first few training indices; train_indices is a plain Python list,
# which would otherwise be printed in full.
train_indices[:10]

In [None]:
# Store the predictions (truncated to the number of submission rows) in the 'target'
# column and write the submission file without the index column.
sample_submission['target'] = test_prediction[:len(sample_submission)]
sample_submission.to_csv('submission.csv', index=False)

# Modelo de https://github.com/rohan-paul/Gravitational-Wave-Detection_Kaggle_Competition/blob/main/Kaggle_NBs/1_TimeSeries_GWPy_Data_Preprocessing.ipynb

In [None]:
# ********** FOR GOOGLE DRIVE AND COLAB *****************

import os 
from google.colab import drive
# Mount Google Drive at /content/gdrive
drive.mount('/content/gdrive')


# Install gwpy and the helper packages used by this section.
!python -m pip install gwpy
!pip install --upgrade --force-reinstall --no-deps gwpy
!pip install astropy
!pip install nnAudio
!pip install colorama

# NOTE(review): this upgrade is immediately superseded by the pinned install below.
!pip install --upgrade --force-reinstall --no-deps matplotlib

!pip install --force-reinstall --no-deps matplotlib==3.2.2
# For running in Colab an earlier version of matplotlib is required, hence the pin above.
# See this GitHub issue: https://github.com/gwpy/gwpy/issues/1398

!pip install gwosc
!pip install dqsegdb2
!pip install ligotimegps
import pandas as pd
import seaborn as sns
from scipy import signal
from gwpy.timeseries import TimeSeries
from gwpy.plot import Plot
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from PIL import Image
from glob import glob
from matplotlib import pyplot as plt
import random
from colorama import Fore, Back, Style
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import Sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam

import torch
from nnAudio.Spectrogram import CQT1992v2




In [19]:
""" First, we define the constructor to initialize the configuration of the generator.
Note that here, we assume the path to the data is in a dataframe column.

"""

class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads ``.npy`` time-series files from disk per batch.

    Parameters
    ----------
    path : str
        Root directory (train/ or test/). A sample whose id starts with characters
        ``a``, ``b``, ``c`` is read from ``<path>/a/b/c/<id>.npy``.
    list_IDs : sequence
        Index labels into ``data`` (looked up with ``data.loc``) of the rows to serve.
    data : pandas.DataFrame
        Frame providing an ``'id'`` and a ``'target'`` column, e.g. the table read
        from training_labels.csv.
    batch_size : int
        Maximum number of samples per batch; the final batch may be shorter.
    """

    def __init__(self, path, list_IDs, data, batch_size):
        # Let the Keras base class initialise itself (newer Keras versions expect this).
        super().__init__()
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches (steps) per epoch.

        This is the ceiling of samples / batch_size, so every sample is served
        exactly once per pass over the sequence.
        """
        return -(-len(self.list_IDs) // self.batch_size)

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        # Positions covered by this batch: [index * batch_size, (index + 1) * batch_size)
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        X, y = self.__data_generation(list_IDs_temp)
        return X, y

    def __data_generation(self, list_IDs_temp):
        """Load and standardise the samples for the given index labels.

        Returns ``X`` of shape ``(len(list_IDs_temp), 3, 4096)`` and ``y`` of shape
        ``(len(list_IDs_temp), 1)``.
        """
        # Size the arrays by the actual number of samples: allocating batch_size rows
        # would pad the final, shorter batch with all-zero samples labelled 0.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, 3, 4096))
        y = np.zeros((n_samples, 1))
        for i, ID in enumerate(list_IDs_temp):
            id_ = self.data.loc[ID, "id"]
            file = id_ + ".npy"  # build the file name
            # there are three nesting levels inside train/ or test/,
            # named after the first three characters of the id
            path_in = "/".join([self.path, id_[0], id_[1], id_[2]]) + "/"
            data_array = np.load(path_in + file)
            # Standardise with the mean/std of the whole array.
            data_array = (data_array - data_array.mean())/data_array.std()
            X[i, ] = data_array
            y[i, ] = self.data.loc[ID, 'target']
        return X, y

In [20]:
# Load the submission template and pull the id / target arrays out of both tables.
sample_submission = pd.read_csv('/content/tfm_g2n/sample_submission.csv')
# print(len(train_labels)) # 5,60,000
# print(len(sample_submission)) # 2,26,000
train_ids = train_labels['id'].values
# train_ids # ['00000e74ad', '00001f4945', '0000661522' ... ]
y = train_labels['target'].values
test_ids = sample_submission['id'].values

In [21]:
# train_labels = pd.read_csv(root_dir + "training_labels.csv", nrows=1000)

# ********************

# Generate the train, validation and test indices.
# These are the 0-based row indices of the respective tables;
# 33 % of the label rows are held out for validation (fixed random_state).
train_indices, validation_indices = train_test_split(list(train_labels.index), test_size=0.33, random_state=2021)
# print(len(train_indices))
print(len(validation_indices))
test_indices = list(sample_submission.index)
# test_indices

184800


In [32]:
# Batch generators (batch size 64) for the sequential model.
train_generator_for_seq_model = DataGenerator( '/content/tfm_g2n/train', train_indices, train_labels, 64)
# print(train_generator_for_seq_model)

validation_generator_for_seq_model = DataGenerator( '/content/tfm_g2n/train', validation_indices, train_labels, 64)
# The archive is extracted under /content/tfm_g2n, so the test files are read from there
# as well (the directory name was previously misspelled as tfm_g2nd).
test_generator_for_seq_model = DataGenerator( '/content/tfm_g2n/test', test_indices, sample_submission, 64)

In [33]:
# Conv1D (64 filters) -> BatchNorm -> Flatten -> Dense(64) -> sigmoid output.
model_keras_seq = Sequential()
model_keras_seq.add(Conv1D(64, input_shape=(3, 4096), kernel_size=3, activation='relu'))
model_keras_seq.add(BatchNormalization())
model_keras_seq.add(Flatten())
model_keras_seq.add(Dense(64, activation='relu'))
model_keras_seq.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; the old `lr` alias is deprecated
# and rejected by newer Keras versions.
model_keras_seq.compile(optimizer= Adam(learning_rate=2e-4), loss='binary_crossentropy', metrics=['acc'])
model_keras_seq.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 1, 64)             786496    
                                                                 
 batch_normalization_1 (Batc  (None, 1, 64)            256       
 hNormalization)                                                 
                                                                 
 flatten_1 (Flatten)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 790,977
Trainable params: 790,849
Non-trainable params: 128
______________________________________________

In [None]:
# Model.fit / Model.predict accept Sequence objects directly; fit_generator and
# predict_generator are deprecated and absent from newer Keras versions.
history = model_keras_seq.fit(train_generator_for_seq_model, validation_data=validation_generator_for_seq_model, epochs = 1)
# Running for 1 epoch took almost 2 and half hours.

predicted_test_seq_keras = model_keras_seq.predict(test_generator_for_seq_model, verbose=1)

# Keep one prediction per submission row, then write the submission file.
sample_submission['target'] = predicted_test_seq_keras[:len(sample_submission)]

sample_submission.to_csv('submission.csv', index=False)

 362/5863 [>.............................] - ETA: 19:13 - loss: 0.7246 - acc: 0.4904

# Modelo de https://github.com/PraveenThakkannavar/G2Net-Gravitational-Wave-Detection/blob/main/SIMPLE_CNN.ipynb

In [None]:
# The 2-D layers used below are not part of the earlier import cells.
from tensorflow.keras.layers import Conv2D, MaxPooling2D

# NOTE(review): `input_shape` is not defined anywhere in this notebook — it must be
# set (presumably to the shape of one input image) before running this cell.

# Instantiate the Sequential model
model_cnn = Sequential(name='CNN_model')

# First Conv2D layer (carries the input shape) followed by a MaxPooling2D layer
model_cnn.add(Conv2D(filters=16,
                     kernel_size=3,
                     input_shape=input_shape,
                     activation='relu',
                     name='Conv_01'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_01'))

# Second pair of Conv2D and MaxPooling2D layers
# (only the first layer of a Sequential model needs input_shape)
model_cnn.add(Conv2D(filters=32,
                     kernel_size=3,
                     activation='relu',
                     name='Conv_02'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_02'))

# Third pair of Conv2D and MaxPooling2D layers
model_cnn.add(Conv2D(filters=64,
                     kernel_size=3,
                     activation='relu',
                     name='Conv_03'))
model_cnn.add(MaxPooling2D(pool_size=2, name='Pool_03'))

# Add the Flatten layer
model_cnn.add(Flatten(name='Flatten'))

# Add the Dense layers
model_cnn.add(Dense(units=512,
                activation='relu',
                name='Dense_01'))
model_cnn.add(Dense(units=64,
                activation='relu',
                name='Dense_02'))

# Add the final Output layer (single sigmoid unit for binary classification)
model_cnn.add(Dense(1, activation='sigmoid', name='Output'))

In [None]:
# Print the layer-by-layer overview of the CNN.
model_cnn.summary()

In [None]:
# AUC is not part of the earlier import cells.
from tensorflow.keras.metrics import AUC

# Adam with a learning rate of 1e-4, binary cross-entropy loss; track AUC and accuracy
# (passed as a flat list of metrics for the single output).
model_cnn.compile(optimizer=Adam(learning_rate=0.0001),
                  loss='binary_crossentropy',
                  metrics=[AUC(), 'accuracy'])

In [None]:
# Fit the data
history_cnn = model_cnn.fit(x=train_dataset,
                            epochs=3,
                            validation_data=valid_dataset,
                            batch_size=batch_size,
                            verbose=1)