# Model

## Libraries

In [None]:
# ********** FOR GOOGLE DRIVE AND COLAB *****************

import os 
from google.colab import drive
drive.mount('/content/gdrive')


!python -m pip install gwpy
!pip install --upgrade --force-reinstall --no-deps gwpy
!pip install astropy
!pip install nnAudio
!pip install colorama

!pip install --upgrade --force-reinstall --no-deps matplotlib

!pip install --force-reinstall --no-deps matplotlib==3.2.2
# For running in Colab I have to have a previous version of matplotlib
# This for Gihut Issue > https://github.com/gwpy/gwpy/issues/1398
# More details are in my note in previous cell

!pip install gwosc
!pip install dqsegdb2
!pip install ligotimegps

In [None]:
# Standard library
import os
import random
import warnings
from glob import glob

# Silence warnings before the heavier third-party imports below.
warnings.filterwarnings('ignore')

# Third-party
from colorama import Fore, Back, Style
from gwpy.plot import Plot
from gwpy.timeseries import TimeSeries
from matplotlib import pyplot as plt
from nnAudio.Spectrogram import CQT1992v2
import numpy as np
import pandas as pd
from PIL import Image
from scipy import signal
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPool1D, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.utils import Sequence
import torch

plt.style.use('ggplot')

# Project-local
from src.model.model import *

## Setup variables

In [None]:
# Read data/data_path.csv (path relative to the current working directory) into a DataFrame.
# NOTE(review): this DataFrame is not read by any later cell visible here — confirm
# whether the load is still needed.
data = pd.read_csv("data/data_path.csv")

In [None]:
# Check the contents of one sample file.

# LOCAL ROOT DIRECTORY
root_dir = "C:/Users/e107338/PycharmProjects/G2Net-Gravitational-Wave-Detection/data"
# root_dir has no trailing slash, so the separator must be added explicitly;
# without it the path resolved to ".../datatrain/0/0/0/..." and the load failed.
file = root_dir + '/train/0/0/0/000a5b6e5c.npy'

# Use a dedicated name for the loaded array so it does not overwrite the `data`
# DataFrame loaded earlier in the notebook.
sample_array = np.load(file)
print(sample_array.shape)
print(sample_array)
# First row of the loaded array.
print("data[0, :] is ", sample_array[0, :])

Load the .npy files from the nested folder structure and extract the ids from the file names.

In [None]:
# Root directory of the competition data. Set the G2NET_DATA_DIR environment variable
# to point at the data on another machine; otherwise the original local path is used.
# Trailing separators are stripped because later cells append "/<name>" to root_dir.
root_dir = os.environ.get(
    "G2NET_DATA_DIR",
    "D:/Projects/G2Net-Gravitational-Wave-Detection/data",
).rstrip("/\\")

# Labels table; later cells read its 'id' and 'target' columns.
train_labels = pd.read_csv(root_dir + "/training_labels.csv")
print('Dataset has ' + "{} Observations".format(train_labels.shape[0]))

In [None]:
# Construct a training dataframe for all the available .npy files.

# Collect every file path from the 4 levels of nested folders under train/.
# glob matches pathnames against a shell-style pattern (*, ?, and [] character
# ranges are expanded; no tilde expansion) and returns them in arbitrary order.
files_paths = glob(root_dir + '/train/*/*/*/*')

# Derive each id from the file name: take the basename and keep the part before the first dot.
# os.path.basename is used instead of splitting on "/" because on Windows glob
# returns the wildcard components joined with backslashes, so a "/" split would
# leave part of the directory path in the id and the merge below would match nothing.
ids_from_npy_files = [os.path.basename(path).split(".")[0] for path in files_paths]

# One row per .npy file: its path and its id.
df_path_id = pd.DataFrame({'path': files_paths, 'id': ids_from_npy_files})

# Attach the path to each labelled id (inner join on 'id').
df_train = pd.merge(left=train_labels, right=df_path_id, on='id')
display(df_train.head())

# Verify the shape of the merged df (original note: 5,60,000 rows and 3 columns expected).
df_train.shape

In [None]:
# root_dir has no trailing slash, so the separator is added explicitly
# (previously the path resolved to ".../datasample_submission.csv").
sample_submission = pd.read_csv(root_dir + '/sample_submission.csv')

# Row counts of the training labels and of the submission template.
print(len(train_labels))
print(len(sample_submission))

# Plain arrays of ids / targets taken from the two tables.
train_ids = train_labels['id'].values  # e.g. ['00000e74ad', '00001f4945', '0000661522' ... ]
y = train_labels['target'].values
test_ids = sample_submission['id'].values

In [None]:
# Generate the train, validation and test indices.
# These are simply the index values of the corresponding dataframes.

# 67% / 33% split of the training-label index, with a fixed seed so the split is reproducible.
train_indices, validation_indices = train_test_split(
    list(train_labels.index), test_size=0.33, random_state=2021
)
print(len(train_indices))
print(len(validation_indices))

test_indices = list(sample_submission.index)
# Show only the count: echoing the list itself prints one entry per test row.
len(test_indices)

In [None]:
# Build the generators that feed the Keras model.
# root_dir has no trailing slash, so "/train/" and "/test/" carry the leading
# separator (previously the folders resolved to ".../datatrain/" and ".../datatest/").
# NOTE(review): DataGenerator comes from src.model.model and is not visible here; the
# argument order (folder, indices, dataframe, batch size) is kept exactly as before.
batch_size = 64

train_generator_for_seq_model = DataGenerator(root_dir + '/train/', train_indices, train_labels, batch_size)
validation_generator_for_seq_model = DataGenerator(root_dir + '/train/', validation_indices, train_labels, batch_size)
test_generator_for_seq_model = DataGenerator(root_dir + '/test/', test_indices, sample_submission, batch_size)

In [None]:
# Small sequential binary classifier: Conv1D -> BatchNorm -> Flatten -> Dense -> sigmoid.
model_keras_seq = Sequential()
# NOTE(review): Keras Conv1D reads input_shape as (steps, channels), so (3, 4096)
# convolves over 3 steps with 4096 channels each. Confirm this layout matches what
# DataGenerator yields and is what was intended.
model_keras_seq.add(Conv1D(64, input_shape=(3, 4096), kernel_size=3, activation='relu'))
model_keras_seq.add(BatchNormalization())
model_keras_seq.add(Flatten())
model_keras_seq.add(Dense(64, activation='relu'))
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model_keras_seq.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias that
# newer Keras releases reject.
model_keras_seq.compile(optimizer=Adam(learning_rate=2e-4), loss='binary_crossentropy', metrics=['acc'])
model_keras_seq.summary()

In [None]:

# Train for one epoch, predict on the test generator, and write the submission file.
# Model.fit / Model.predict accept generators directly; fit_generator and
# predict_generator are deprecated aliases that newer Keras releases no longer provide.
# The former `workers=-1` argument is dropped: -1 is not a documented value for it
# (TODO confirm whether parallel data loading was intended).
history = model_keras_seq.fit(
    train_generator_for_seq_model,
    validation_data=validation_generator_for_seq_model,
    epochs=1,
)
# Running for 1 epoch took almost 2 and half hours.

predicted_test_seq_keras = model_keras_seq.predict(test_generator_for_seq_model, verbose=1)

# The model ends in Dense(1), so predictions carry a trailing axis of size 1; flatten
# to one value per row before assigning a column. The slice drops any predictions
# beyond the number of submission rows.
sample_submission['target'] = predicted_test_seq_keras.reshape(-1)[:len(sample_submission)]

sample_submission.to_csv('submission.csv', index=False)