# In this notebook we use WaveNet to try to generate blip glitches from the H1 detector's O2 run

Mount drive and clone the pytorch-wavenet repository

In [2]:
# Mount Google Drive so the zipped glitch archives are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Fetch the pytorch-wavenet repository and make it the working directory; later
# cells import its modules (wavenet_model, audio_data, ...) by bare name.
%cd /content
!git clone https://github.com/vincentherrmann/pytorch-wavenet.git
%cd /content/pytorch-wavenet
# Bring the checkout up to date (reports "Already up to date." right after a fresh clone).
!git pull

Mounted at /content/drive
/content
Cloning into 'pytorch-wavenet'...
remote: Enumerating objects: 1154, done.[K
remote: Total 1154 (delta 0), reused 0 (delta 0), pack-reused 1154[K
Receiving objects: 100% (1154/1154), 268.94 MiB | 27.99 MiB/s, done.
Resolving deltas: 100% (713/713), done.
/content/pytorch-wavenet
Already up to date.


In [3]:
!pip install gwpy

Collecting gwpy
  Downloading gwpy-3.0.5-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting dqsegdb2 (from gwpy)
  Downloading dqsegdb2-1.2.1-py3-none-any.whl (25 kB)
Collecting gwdatafind>=1.1.0 (from gwpy)
  Downloading gwdatafind-1.1.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gwosc>=0.5.3 (from gwpy)
  Downloading gwosc-0.7.1-py3-none-any.whl (27 kB)
Collecting ligo-segments>=1.0.0 (from gwpy)
  Downloading ligo-segments-1.4.0.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ligotimegps>=1.2.1 (from gwpy)
  Downloading ligotimegps-2.0.1-py2.py3-none-any.whl (19 kB)
Collecting igwn-auth-utils>=0.3.1 (from gwda

In [4]:
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_0-1000.zip
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_1000-2000.zip
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_2000-3048.zip
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_3048-5000.zip
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_5000-7000.zip
!unzip -q /content/drive/MyDrive/H1_O2_Blip_Data/H1_O2_Blip_7000-9000.zip

In [5]:
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_1.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_2.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_3.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_4.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_5.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_6.zip
!unzip -q /content/drive/MyDrive/Gravity_Spy_Glitches_whitened_7.zip

In [6]:
import os
import h5py as h5
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from torchsummary import summary
import torchvision
from sklearn.model_selection import train_test_split

from gwpy.timeseries import TimeSeries

Dataset Utility

In [7]:
def construct_dataframe(path,t_delta=None):
    """Load every HDF5 glitch file found in ``path`` into one DataFrame.

    For each file the first top-level HDF5 key is taken as the glitch type and
    the dataset stored under it as the time series. The series is cut to a
    window centred on its midpoint.

    Parameters
    ----------
    path : str
        Directory containing the HDF5 files (with or without a trailing slash).
    t_delta : int or float, optional
        Half-width of the window, in samples, kept around the midpoint of each
        series. When omitted, each series is kept in full.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``ID`` (file name up to the first ``.``
        and ``_``), ``Type_Of_Glitch``, ``Starting_Times`` (the dataset's
        ``t0`` attribute) and ``TimeSeries``.
    """
    name_data = []
    starting_times_data = []
    timeseries_data = []
    type_data = []
    for file in listdir(path):
        # Jupyter/Colab leaves this folder next to the data; it is not a glitch
        # file, so the whole iteration is skipped (not just the open call).
        if file == '.ipynb_checkpoints':
            continue
        # The context manager guarantees the HDF5 handle is closed again.
        with h5.File(os.path.join(path, file), 'r') as fout:
            type_of_glitch = list(fout.keys())[0]
            dataset = fout[type_of_glitch]
            samples = dataset[()]  # read the full series into memory once
            start_time = dataset.attrs['t0']
        midpoint = len(samples) / 2
        # Resolve the default per file, so one file's length never leaks into
        # the window used for the following files.
        half_window = midpoint if t_delta is None else t_delta
        # Clamp at 0: a negative start would wrap around to the end of the array.
        window_start = max(int(midpoint - half_window), 0)
        window_stop = int(midpoint + half_window)
        name_data.append(file.split('.')[0].split('_')[0])
        type_data.append(type_of_glitch)
        starting_times_data.append(start_time)
        timeseries_data.append(samples[window_start:window_stop])

    # Assemble the per-file lists into the final table.
    data = {'ID': name_data,
            'Type_Of_Glitch': type_data,
            'Starting_Times': starting_times_data,
            'TimeSeries': timeseries_data}
    return pd.DataFrame(data)

def check_remove_nanDF(dataframe, col='TimeSeries'):
    """Drop the rows whose entry in ``col`` contains a NaN.

    Prints how many rows are removed and their positions (0-based row numbers,
    independent of the DataFrame's index labels).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding one array-like time series per row in ``col``.
    col : str
        Name of the column to inspect.

    Returns
    -------
    pandas.DataFrame
        A new frame without the offending rows; the input is not modified.
    """
    series_values = dataframe[col].values
    # Inspect the whole series, not just its first sample.
    has_nan = np.array([bool(np.isnan(series).any()) for series in series_values],
                       dtype=bool)
    idx = np.flatnonzero(has_nan).tolist()
    print("removed timeseries:", len(idx))
    print(idx)
    # Filter by position with a boolean mask: dropping by label would remove
    # the wrong rows (or raise) whenever the index is not 0..n-1.
    return dataframe[~has_nan]


def build_torchDataset(Dataset, norm=True):
    """Turn a two-column array of (series, label) rows into a TensorDataset.

    Parameters
    ----------
    Dataset : numpy.ndarray
        Array whose transpose unpacks into exactly two rows: the time series
        and their integer labels.
    norm : bool
        When True, subtract the mean and divide by the standard deviation,
        both taken along dimension 0 of the stacked series tensor.

    Returns
    -------
    torch.utils.data.TensorDataset
        Pairs of (series tensor with an extra dimension inserted at axis 1,
        label as a long tensor).
    """
    series_column, label_column = Dataset.T
    stacked_series = np.asarray(list(series_column))
    features = torch.tensor(stacked_series).unsqueeze(1)
    targets = torch.tensor(list(label_column)).long()
    if norm:
        centred = features - features.mean(0)
        features = centred / features.std(0)
    return TensorDataset(features, targets)



def build_dataset(path='/content/pytorch-wavenet/content/sample_data/', idx=('_2', '_3', '_4', '_5', '_6', '_7')):
    """Load and concatenate the Gravity Spy folders found under ``path``.

    The folder ``Gravity_Spy/`` is always read first, followed by one folder
    ``Gravity_Spy<suffix>/`` per entry of ``idx``. Each folder is loaded with
    ``construct_dataframe`` and cleaned with ``check_remove_nanDF`` before the
    frames are stacked.

    Parameters
    ----------
    path : str
        Parent directory of the ``Gravity_Spy*`` folders; must end with a
        path separator because folder names are appended directly.
    idx : iterable of str
        Suffixes of the additional folders to load.

    Returns
    -------
    pandas.DataFrame
        All cleaned rows stacked vertically. Row labels are kept from the
        individual frames, so they are not unique across folders.
    """
    folder_names = ['Gravity_Spy/'] + ['Gravity_Spy' + suffix + '/' for suffix in idx]
    # Collect the frames first and concatenate once; concatenating inside the
    # loop would re-copy the accumulated frame on every iteration.
    cleaned_frames = [check_remove_nanDF(construct_dataframe(path + folder))
                      for folder in folder_names]
    return pd.concat(cleaned_frames, axis=0)

def pytorch_glitch(df,norm=True):
    """Convert a glitch DataFrame into a TensorDataset of (series, class code).

    The ``Type_Of_Glitch`` column is encoded as categorical integer codes and
    passed, together with whatever columns remain after dropping ``ID``,
    ``Type_Of_Glitch`` and ``Starting_Times``, to ``build_torchDataset``.

    The input frame is left untouched: the codes are attached to a derived
    frame instead of being written back into ``df``.

    NOTE(review): the codes are derived from the glitch types present in this
    particular ``df``, so separate calls on different splits presumably agree
    only when every split contains the same set of types -- confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``ID``, ``Type_Of_Glitch`` and
        ``Starting_Times`` plus the time-series column.
    norm : bool
        Forwarded to ``build_torchDataset``.

    Returns
    -------
    The dataset returned by ``build_torchDataset``.
    """
    glitch_codes = pd.Categorical(df['Type_Of_Glitch']).codes
    model_inputs = (
        df.drop(columns=['ID', 'Type_Of_Glitch', 'Starting_Times'])
          .assign(code=glitch_codes)  # appended last, or replaces an existing 'code'
    )
    return build_torchDataset(model_inputs.values, norm)

def get_labels(df):
    """Return the sorted distinct values found in the second column of ``df``."""
    second_column_values = {row[1] for row in df.to_numpy()}
    return sorted(second_column_values)

def remove_row_by_freq(df, col='Type_Of_Glitch', freq=50):
    """Keep only the rows whose value in ``col`` occurs at least ``freq`` times."""
    occurrences = df.groupby(col)[col].transform('count')
    frequent_enough = occurrences >= freq
    return df[frequent_enough]


def split_stratified_into_train_val_test(df_input, stratify_colname='Type_Of_Glitch',
                                         frac_train=0.75, frac_val=0.15, frac_test=0.10,
                                         random_state=137):
    '''
    Splits a Pandas dataframe into three subsets (train, val, and test)
    following fractional ratios provided by the user, where each subset is
    stratified by the values in a specific column (that is, each subset has
    the same relative frequency of the values in the column). It performs this
    splitting by running train_test_split() twice.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (up to floating-point rounding).
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if stratify_colname is not a
        column of df_input.
    '''

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly
    # 1.0 in binary floating point and must not be rejected.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' % \
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError('%s is not a column in the dataframe' % (stratify_colname))

    X = df_input # Contains all columns.
    y = df_input[[stratify_colname]] # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(X,
                                                          y,
                                                          stratify=y,
                                                          test_size=(1.0 - frac_train),
                                                          random_state=random_state)

    # Split the temp dataframe into val and test dataframes. The test share is
    # expressed relative to the temp frame, not to the original input.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(df_temp,
                                                      y_temp,
                                                      stratify=y_temp,
                                                      test_size=relative_frac_test,
                                                      random_state=random_state)

    # Every input row must land in exactly one of the three splits.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test

In [None]:
# Sanity check: load the first Gravity Spy folder on its own and display the table.
construct_dataframe('/content/pytorch-wavenet/content/sample_data/Gravity_Spy/')

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries
0,sV3eoXQPMc,Blip,1.128337e+09,"[-4.3496227, 3.2480447, -4.878654, 7.2891717, ..."
1,W7oI3wLRz6,Repeating_Blips,1.136393e+09,"[-9.366823, 5.9253073, -9.659615, 9.670488, -1..."
2,7UmFN7b9Y1,Low_Frequency_Burst,1.135626e+09,"[-8.250671, 23.62479, -11.17448, 16.97029, -14..."
3,xmxiRoeHxh,Whistle,1.132637e+09,"[-20.89406, 42.58717, -22.497952, 42.539593, -..."
4,zKCTakFVcf,Whistle,1.127425e+09,"[72.466286, -149.22174, 86.74397, -135.45528, ..."
...,...,...,...,...
1090,gBJMU5wqpw,Power_Line,1.128706e+09,"[-28.359324, 37.06308, -22.323925, 42.60889, -..."
1091,ci9lhE4WGH,Scratchy,1.132121e+09,"[0.3554791, 1.6280116, 9.362918, 8.372892, 6.2..."
1092,0mnZ4kREdT,Low_Frequency_Lines,1.136275e+09,"[37.082584, -49.28117, 33.413063, -34.63778, 3..."
1093,LJKENwtrdZ,Low_Frequency_Lines,1.137096e+09,"[-2.8705487, -4.4042997, -1.3368847, -3.783266..."


In [8]:
# Load all Gravity Spy folders with the default path and suffixes; build_dataset
# prints, per folder, how many NaN series were removed and at which positions.
df_LIGO=build_dataset()

removed timeseries: 4
[359, 375, 688, 953]
removed timeseries: 31
[1, 9, 42, 44, 51, 56, 72, 87, 90, 181, 252, 306, 309, 331, 350, 427, 467, 508, 520, 531, 558, 578, 653, 683, 685, 744, 748, 791, 809, 824, 836]
removed timeseries: 24
[72, 73, 105, 106, 120, 132, 134, 136, 158, 282, 286, 300, 348, 352, 368, 378, 419, 435, 449, 477, 537, 551, 655, 674]
removed timeseries: 12
[59, 76, 79, 129, 172, 202, 296, 371, 489, 542, 752, 783]
removed timeseries: 21
[29, 30, 69, 98, 143, 180, 199, 219, 220, 221, 235, 367, 419, 438, 549, 598, 622, 627, 648, 658, 708]
removed timeseries: 17
[36, 70, 115, 149, 269, 271, 287, 307, 359, 363, 364, 462, 525, 557, 570, 622, 772]
removed timeseries: 27
[0, 52, 66, 84, 97, 110, 120, 253, 297, 313, 325, 336, 406, 431, 455, 477, 498, 539, 565, 571, 608, 621, 698, 728, 764, 771, 783]


In [None]:
# Display the combined table. Row labels restart for each folder because
# build_dataset concatenates the per-folder frames without resetting the index.
df_LIGO

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries
0,sV3eoXQPMc,Blip,1.128337e+09,"[-4.3496227, 3.2480447, -4.878654, 7.2891717, ..."
1,W7oI3wLRz6,Repeating_Blips,1.136393e+09,"[-9.366823, 5.9253073, -9.659615, 9.670488, -1..."
2,7UmFN7b9Y1,Low_Frequency_Burst,1.135626e+09,"[-8.250671, 23.62479, -11.17448, 16.97029, -14..."
3,xmxiRoeHxh,Whistle,1.132637e+09,"[-20.89406, 42.58717, -22.497952, 42.539593, -..."
4,zKCTakFVcf,Whistle,1.127425e+09,"[72.466286, -149.22174, 86.74397, -135.45528, ..."
...,...,...,...,...
833,kbkBKCGl4w,Blip,1.135794e+09,"[10.382145, 1.8822113, 9.213575, 8.677121, 4.4..."
834,yqsc621GfU,Koi_Fish,1.126804e+09,"[-18.155645, 22.470346, -18.770136, 28.99813, ..."
835,3ueZLyH9jg,Blip,1.128324e+09,"[2.0149, 5.768071, 0.8069777, 5.049079, -1.145..."
836,8nRpAbS5B9,Koi_Fish,1.131693e+09,"[-5.7416744, 8.719219, -1.3466268, 10.600095, ..."


In [None]:
# Number of examples per glitch class, most frequent first.
df_LIGO['Type_Of_Glitch'].value_counts()

Blip                   1707
Koi_Fish                598
Low_Frequency_Burst     557
Power_Line              432
Low_Frequency_Lines     416
Scattered_Light         415
Extremely_Loud          287
Scratchy                247
Light_Modulation        221
Whistle                 144
1080Lines               140
No_Glitch               137
Tomte                   100
Repeating_Blips          79
1400Ripples              75
Chirp                    53
Air_Compressor           51
None_of_the_Above        40
Paired_Doves             15
Wandering_Line            6
Violin_Mode               2
Name: Type_Of_Glitch, dtype: int64

In [9]:
# Optionally discard rare glitch classes: with rem=True only the classes that
# have at least 400 examples are kept. df_LIGO is overwritten with the result.
rem=True
if rem:
  df_LIGO=remove_row_by_freq(df_LIGO,freq=400)
# Show the class balance that remains.
df_LIGO['Type_Of_Glitch'].value_counts()

Blip                   1707
Koi_Fish                598
Low_Frequency_Burst     557
Power_Line              432
Low_Frequency_Lines     416
Scattered_Light         415
Name: Type_Of_Glitch, dtype: int64

In [10]:
# Stratified train/validation/test split using the function's default
# fractions (0.75 / 0.15 / 0.10) and its default random_state.
train_set, val_set, test_set = split_stratified_into_train_val_test(df_LIGO)

In [20]:
# Convert the train and validation frames into TensorDatasets (test_set is not
# converted here).
# NOTE(review): the names are rebound from DataFrame to TensorDataset, so this
# cell cannot be run a second time without re-running the split cell first.
train_set=pytorch_glitch(train_set)
val_set=pytorch_glitch(val_set)

In [21]:
# Write the training set to an .npz archive for WavenetDataset to read later.
# NOTE(review): train_set is a TensorDataset at this point, not an array, and the
# recorded run of this cell was interrupted -- confirm this is the intended
# object to save.
np.savez('/content/pytorch-wavenet/TEST',train_set)

  val = np.asanyarray(val)
  val = np.asanyarray(val)


KeyboardInterrupt: ignored

In [12]:
import time
from wavenet_model import *
from audio_data import WavenetDataset
from wavenet_training import *
from model_logging import *
#from optimizers import SGDNormalized
from scipy.io import wavfile

# Choose the float/long tensor types once, depending on whether CUDA is
# available; later cells pass `dtype` and `use_cuda` to the model helpers.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('use gpu')

dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
ltype = torch.cuda.LongTensor if use_cuda else torch.LongTensor

In [13]:
# Build a small WaveNet from scratch ...
model = WaveNetModel(layers=6,
                     blocks=4,
                     dilation_channels=16,
                     residual_channels=16,
                     skip_channels=32,
                     output_length=8,
                     dtype=dtype,
                    bias=False)
# ... and immediately replace it with the most recent snapshot from 'snapshots'.
# NOTE(review): the model constructed above is discarded by this assignment, so
# the hyper-parameters listed there have no effect on what is printed below --
# confirm which of the two models is intended.
model = load_latest_model_from('snapshots', use_cuda=use_cuda)
#model = torch.load('snapshots/saber_model_2017-12-18_20-47-36', map_location=lambda storage, loc: storage)
model.dtype = dtype
# Move the model to the device matching the tensor types chosen earlier.
if use_cuda:
    model.cuda()
else:
    model.cpu()
# Report the architecture, its receptive field and its parameter count.
print('model: ', model)
print('receptive field: ', model.receptive_field)
print('parameter count: ', model.parameter_count())

load model snapshots/chaconne_model_2017-12-28_16-44-12




model:  WaveNetModel(
  (filter_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (gate_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(2,), stride=(1,))
  )
  (residual_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 32, kernel_size=(1,), stride=(1,))
  )
  (skip_convs): ModuleList(
    (0-29): 30 x Conv1d(32, 1024, kernel_size=(1,), stride=(1,))
  )
  (start_conv): Conv1d(256, 32, kernel_size=(1,), stride=(1,))
  (end_conv_1): Conv1d(1024, 512, kernel_size=(1,), stride=(1,))
  (end_conv_2): Conv1d(512, 256, kernel_size=(1,), stride=(1,))
)
receptive field:  3070
parameter count:  1834592


In [88]:
# Same save as in the earlier cell, executed again.
# NOTE(review): train_set is a TensorDataset rather than an array -- confirm that
# the resulting archive has the layout WavenetDataset expects.
np.savez('/content/pytorch-wavenet/TEST',train_set)

In [None]:
# Path of the archive written by np.savez above: /content/pytorch-wavenet/TEST.npz

In [14]:
import numpy as np
# Make np.load default to allow_pickle=True so that archives containing pickled
# objects can be read by code that does not pass the flag itself.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.

# Keep a handle on the unpatched loader. When this cell is re-run, np.load is
# already the wrapper below, so the original is recovered from its attribute
# instead of wrapping the wrapper.
np_load_old = getattr(np.load, '_unpatched_load', np.load)


def _np_load_allow_pickle(*args, **kwargs):
    """Call the original ``np.load`` with ``allow_pickle`` defaulting to True."""
    # setdefault keeps an explicit allow_pickle from the caller instead of
    # passing the keyword twice.
    kwargs.setdefault('allow_pickle', True)
    return np_load_old(*args, **kwargs)


_np_load_allow_pickle._unpatched_load = np_load_old
np.load = _np_load_allow_pickle

In [15]:
# Wrap the saved archive in the repository's WavenetDataset. The item length is
# tied to the model: receptive field plus output length minus one, with the
# model's output length as target length.
# NOTE(review): file_location still points at 'train_samples/saber' -- confirm
# whether it is used when dataset_file already exists.
data = WavenetDataset(dataset_file='/content/pytorch-wavenet/TEST.npz',
                      item_length=model.receptive_field + model.output_length - 1,
                      target_length=model.output_length,
                      file_location='train_samples/saber',
                      test_stride=20)

one hot input


In [17]:
# Report how many training items the dataset exposes.
# NOTE(review): the recorded run printed 1 item, which looks too small for the
# number of glitches loaded above -- verify the contents of the saved archive.
print('the dataset has ' + str(len(data)) + ' items')

the dataset has 1 items


In [18]:
def generate_and_log_samples(step):
    """Generate one clip per temperature from the latest snapshot and log it.

    Loads the most recent model from 'snapshots', generates 4000 samples at
    temperature 0 and then at temperature 0.5, and hands each clip to
    ``logger.audio_summary`` under the given ``step``.

    NOTE(review): ``tf`` and ``logger`` are not defined in the visible part of
    the notebook; presumably they come from an earlier cell or a star import.
    """
    sample_length = 4000
    gen_model = load_latest_model_from('snapshots')
    print("start generating...")
    # (summary tag, sampling temperature), processed in this order.
    runs = (('temperature 0', 0), ('temperature 0.5', 0.5))
    for summary_tag, temperature in runs:
        samples = generate_audio(gen_model,
                                 length=sample_length,
                                 temperatures=[temperature])
        tf_samples = tf.convert_to_tensor(samples, dtype=tf.float32)
        logger.audio_summary(summary_tag, tf_samples, step, sr=16000)
    print("audio clips generated")