In [None]:
# Reference: https://github.com/rohan-paul/Gravitational-Wave-Detection_Kaggle_Competition/blob/main/Kaggle_NBs/1_TimeSeries_GWPy_Data_Preprocessing.ipynb

In [2]:
conda install -c conda-forge gwpy

ERROR: Could not find a version that satisfies the requirement gwpy (from versions: none)
ERROR: No matching distribution found for gwpy


## Libraries

In [4]:
import pandas as pd
import seaborn as sns
from scipy import signal
from gwpy.timeseries import TimeSeries
from gwpy.plot import Plot
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from PIL import Image
from glob import glob
from matplotlib import pyplot as plt
import random
from colorama import Fore, Back, Style
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import Sequence

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, Adam

import torch
from nnAudio.Spectrogram import CQT1992v2

ModuleNotFoundError: No module named 'gwpy'

## Load data

In [None]:
# Folder that holds training_labels.csv, sample_submission.csv and the train/
# and test/ trees. Later cells build paths as `root_dir + "<name>"`, so the
# trailing slash is required.
root_dir = "data/"

# One row per training sample; the 'id' and 'target' columns are used below.
train_labels = pd.read_csv(root_dir + "training_labels.csv")
train_labels.head()

In [None]:
# Collect the path of every sample file: three nested folder levels below
# train/, then the file itself. glob expands *, ? and [] ranges (but not "~")
# and returns the matches in arbitrary order.
# rstrip keeps the pattern free of a doubled slash whether or not root_dir
# ends with "/".
files_paths = glob(root_dir.rstrip('/') + '/train/*/*/*/*')

# The id of a sample is its file name without the extension
# ([-1] picks the last path component).
ids_from_npy_files = [path.split("/")[-1].split(".")[0] for path in files_paths]

# One row per file: where it lives and which id it belongs to.
df_path_id = pd.DataFrame({'path': files_paths, 'id': ids_from_npy_files})

# Attach the target to every file through the shared 'id' column.
df_train = pd.merge(left=train_labels, right=df_path_id, on='id')
display(df_train.head())

# Expected shape: 560,000 rows and 3 columns.
df_train.shape

In [None]:
# Separate the merged frame into the rows labelled 1 and the rows labelled 0,
# report how many samples each class has, and preview the positive rows.
target_1_df_train = df_train.loc[df_train["target"].eq(1)]
target_0_df_train = df_train.loc[df_train["target"].eq(0)]
print(
    "Class distribution of Target: \n ",
    train_labels["target"].value_counts(),
)
display(target_1_df_train.head())

In [None]:
# Bar chart with the number of training samples in each target class.
sns.countplot(data=train_labels, x='target').set_title('Target Class Distribution')

In [None]:
""" First, we define the constructor to initialize the configuration of the generator.
Note that here, we assume the path to the data is in a dataframe column.

"""

class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields batches of standardised arrays loaded from ``.npy`` files.

    A sample is located from its id alone: the first three characters of the
    id name the three nested folders below ``path`` and the file itself is
    ``<id>.npy``.

    Parameters
    ----------
    path : str
        Folder that contains the nested sample folders (the train or the test folder).
    list_IDs : sequence
        Index labels of ``data`` served by this generator, in serving order.
        For the training data these are index values of the labels frame.
    data : pandas.DataFrame
        Frame with an ``id`` and a ``target`` column; rows are looked up with ``.loc``.
    batch_size : int
        Maximum number of samples per batch (the last batch may be smaller).

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """

    # Shape of a single sample array as stored on disk.
    SAMPLE_SHAPE = (3, 4096)

    def __init__(self, path, list_IDs, data, batch_size):
        # Let the Keras base class initialise itself; recent Keras versions
        # warn when a Sequence subclass skips this call.
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        self.path = path
        self.list_IDs = list_IDs
        self.data = data
        self.batch_size = batch_size
        # Positions into list_IDs; batches are contiguous slices of this array.
        self.indexes = np.arange(len(self.list_IDs))

    def __len__(self):
        """Return the number of batches per epoch (ceiling of samples / batch size)."""
        return (len(self.list_IDs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` tuple.

        Raises
        ------
        IndexError
            If ``index`` is outside ``range(len(self))``; previously such a
            request silently produced a batch that contained no real sample.
        """
        if not 0 <= index < len(self):
            raise IndexError("batch index out of range")
        start = index * self.batch_size
        indexes = self.indexes[start : start + self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def __data_generation(self, list_IDs_temp):
        """Load, standardise and stack the samples whose index labels are given.

        Returns ``X`` with shape ``(n, 3, 4096)`` and ``y`` with shape
        ``(n, 1)``, where ``n == len(list_IDs_temp)``.
        """
        # Size the batch by the samples actually requested. Allocating
        # batch_size rows padded the final, shorter batch with all-zero
        # samples labelled 0.
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples,) + self.SAMPLE_SHAPE)
        y = np.zeros((n_samples, 1))
        # Drop a trailing slash so the joined path has no doubled separator.
        base_dir = self.path.rstrip("/")
        for i, ID in enumerate(list_IDs_temp):
            id_ = self.data.loc[ID, "id"]
            # Three nesting levels inside train/ or test/, one per leading id character.
            file_path = "/".join([base_dir, id_[0], id_[1], id_[2], id_ + ".npy"])
            data_array = np.load(file_path)
            # Standardise over the whole sample; a constant sample has zero
            # spread, so divide by 1 instead of producing NaNs.
            spread = data_array.std()
            X[i, ] = (data_array - data_array.mean()) / (spread if spread > 0 else 1.0)
            y[i, ] = self.data.loc[ID, 'target']
        return X, y

In [None]:
# Read the submission template, then pull the id and target columns out as
# NumPy arrays (560,000 training rows, 226,000 test rows).
sample_submission = pd.read_csv(root_dir + "sample_submission.csv")

# Training ids look like '00000e74ad', '00001f4945', '0000661522', ...
train_ids, y = train_labels["id"].values, train_labels["target"].values
test_ids = sample_submission["id"].values

In [None]:
# Hold out 33% of the training rows for validation. The split is made on the
# 0-based index labels of train_labels, which are what the generators use to
# look rows up; the test indices are simply every row of sample_submission.
# (For a quick experiment, reload train_labels with nrows=1000 beforehand.)
train_indices, validation_indices = train_test_split(
    train_labels.index.tolist(),
    test_size=0.33,
    random_state=2021,
)
print(len(validation_indices))
test_indices = sample_submission.index.tolist()

In [None]:
# One generator per split, each serving batches of 64. Train and validation
# read from train/ and share train_labels; the test generator reads from test/
# and uses sample_submission as its lookup frame.
train_generator_for_seq_model = DataGenerator(
    path=root_dir + 'train/',
    list_IDs=train_indices,
    data=train_labels,
    batch_size=64,
)
validation_generator_for_seq_model = DataGenerator(
    path=root_dir + 'train/',
    list_IDs=validation_indices,
    data=train_labels,
    batch_size=64,
)
test_generator_for_seq_model = DataGenerator(
    path=root_dir + 'test/',
    list_IDs=test_indices,
    data=sample_submission,
    batch_size=64,
)

https://github.com/PraveenThakkannavar/G2Net-Gravitational-Wave-Detection/blob/main/SIMPLE_CNN.ipynb

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.metrics import AUC

import librosa.display
import torch

# this is used for Contant Q Transform
from nnAudio.Spectrogram import CQT1992v2
from tensorflow.keras.applications import EfficientNetB0 as efn