<a href="https://colab.research.google.com/github/FrancescoSarandrea/Audio_GW/blob/main/GS_TorchLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Simple notebook to convert gravity spy samples to pytorch datasets, an implementation of a logistic regression is also provided.

In [1]:
from google.colab import drive
drive.mount('/content/drive')  

Mounted at /content/drive


In [2]:
try:
    from gwpy.timeseries import TimeSeries
except ModuleNotFoundError: 
    !pip install --quiet gwpy
    from gwpy.timeseries import TimeSeries

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ligo-segments (setup.py) ... [?25l[?25hdone


In [3]:
import os
import h5py as h5
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from torchsummary import summary

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
def construct_dataframe(path,t_delta=None):
  name_data=[]
  starting_times_data=[]
  timeseries_data=[]
  type_data=[]
  for file in listdir(path):
    if file != '.ipynb_checkpoints':
        fout=h5.File(path+file,'r') 
    type_of_glitch=list(fout.keys())[0]
    dataset=fout[type_of_glitch]
    t = TimeSeries(dataset[()])
    ln=len(t)/2
    #if no t_delta is provided, the whole the timeseries are not cut 
    if t_delta==None:
      t_delta=ln
    name_data.append(file.split('.')[0].split('_')[0])
    type_data.append(type_of_glitch)
    starting_times_data.append(dataset.attrs['t0'])
    dataset_cut=dataset[()][int(ln-t_delta):int(ln+t_delta)]
    timeseries_data.append(dataset_cut)
    # intialise data of lists.

  data_Virgo = {'ID':name_data,
        'Type_Of_Glitch':type_data,
        'Starting_Times':starting_times_data,
        'TimeSeries':timeseries_data}
    # Create DataFrame
  df= pd.DataFrame(data_Virgo)
  return df

def check_remove_nanDF(dataframe, col='TimeSeries'):
    val=dataframe[col].values
    idx=[]
    for i in range(len(val)):
        if np.isnan(val[i])[0]==True:
            idx+=[i]
    print("removed timeseries:", len(idx))
    return dataframe.drop(labels=idx,axis=0)


def build_torchDataset(Dataset, val_shape=200, norm=True):
    X,y=Dataset.T
    data=[]
    label=[]
    for element in X:
        data+= [element]
    for i in y:
        label+=[i]
    X_t=torch.tensor(np.asarray(data)).unsqueeze(1)
    y_t=torch.tensor(label).long()
    if norm:
        X_t=(X_t-X_t.mean(0))/X_t.std(0)
    X_train, X_val=X_t[:-val_shape], X_t[-val_shape:]
    y_train, y_val=y_t[:-val_shape], y_t[-val_shape:]
    return TensorDataset(X_train,y_train), TensorDataset(X_val,y_val)

def count_classes(dataset):
    classes=torch.empty(1)
    for _, label in dataset:
        classes=torch.cat((classes,label.view(1)))
    #print(classes)
    label=torch.arange(0,classes.max()+1,1)
    freq=torch.bincount(classes.long())
    counts=torch.vstack((label,freq))
    return pd.DataFrame(freq)

def check_classes(train,val,labels):
    df1=count_classes(train)
    df1.rename(columns = {0:'Train set'},inplace=True)
    df2=count_classes(val)
    df2.rename(columns = {0:'Test set'},inplace=True)
    df3=pd.concat([df1,df2],axis=1)
    df3.index = labels
    return df3

In [6]:
!unzip -q /content/drive/MyDrive/GW_ML/Gravity_Spy_Glitches_whitened_1.zip 
!unzip -q /content/drive/MyDrive/GW_ML/Gravity_Spy_Glitches_whitened_2.zip
!unzip -q /content/drive/MyDrive/GW_ML/Gravity_Spy_Glitches_whitened_3.zip 
!unzip -q /content/drive/MyDrive/GW_ML/Gravity_Spy_Glitches_whitened_4.zip  

In [7]:
df_LIGO=construct_dataframe('/content/content/sample_data/Gravity_Spy/')

In [8]:
df_LIGO.head()

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries
0,NyRT1i4JUz,Scratchy,1129836000.0,"[-36.20354, 26.148184, -25.498957, 22.738546, ..."
1,1pWDp3xjUi,Low_Frequency_Burst,1135530000.0,"[-28.990265, 2.939058, -34.67333, -3.4040651, ..."
2,vDeLwvZOYx,Light_Modulation,1127317000.0,"[-3.5203443, 19.16648, -6.5227933, 15.003055, ..."
3,zXWepZZaGx,Low_Frequency_Lines,1135680000.0,"[16.132532, 4.529822, -0.5751773, -4.243759, 0..."
4,rJIoh2dpVs,Blip,1132807000.0,"[15.367173, -12.814704, 7.7153554, -14.192832,..."


In [9]:
df_LIGO=check_remove_nanDF(df_LIGO)

removed timeseries: 4


In [10]:
df_LIGO.Type_Of_Glitch = pd.Categorical(df_LIGO.Type_Of_Glitch)

In [11]:
df_LIGO['code']=df_LIGO.Type_Of_Glitch.cat.codes

In [12]:
df_LIGO.head()

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries,code
0,NyRT1i4JUz,Scratchy,1129836000.0,"[-36.20354, 26.148184, -25.498957, 22.738546, ...",13
1,1pWDp3xjUi,Low_Frequency_Burst,1135530000.0,"[-28.990265, 2.939058, -34.67333, -3.4040651, ...",5
2,vDeLwvZOYx,Light_Modulation,1127317000.0,"[-3.5203443, 19.16648, -6.5227933, 15.003055, ...",4
3,zXWepZZaGx,Low_Frequency_Lines,1135680000.0,"[16.132532, 4.529822, -0.5751773, -4.243759, 0...",6
4,rJIoh2dpVs,Blip,1132807000.0,"[15.367173, -12.814704, 7.7153554, -14.192832,...",0


In [13]:
labels = sorted(list(set(datapoint[1] for datapoint in df_LIGO.to_numpy())))

In [14]:
labels

['Blip',
 'Chirp',
 'Extremely_Loud',
 'Koi_Fish',
 'Light_Modulation',
 'Low_Frequency_Burst',
 'Low_Frequency_Lines',
 'No_Glitch',
 'None_of_the_Above',
 'Paired_Doves',
 'Power_Line',
 'Repeating_Blips',
 'Scattered_Light',
 'Scratchy',
 'Tomte',
 'Violin_Mode',
 'Wandering_Line',
 'Whistle']

In [15]:
#Count of classes:
df1 = df_LIGO['Type_Of_Glitch'].value_counts()
print(df1)
del df1

Low_Frequency_Burst    193
Scattered_Light        177
Low_Frequency_Lines    155
Power_Line             117
Blip                   104
Scratchy                85
Whistle                 80
Tomte                   74
No_Glitch               44
Repeating_Blips         26
Light_Modulation        23
None_of_the_Above        5
Wandering_Line           3
Chirp                    1
Koi_Fish                 1
Extremely_Loud           1
Violin_Mode              1
Paired_Doves             1
Name: Type_Of_Glitch, dtype: int64


In [16]:
train_set, test_set = build_torchDataset(df_LIGO.drop(columns=['ID', 'Type_Of_Glitch', 'Starting_Times']).values)

In [18]:
check_classes(train_set, test_set,labels) #If this cell give an overflow error, run it again untill it works, idk why

Unnamed: 0,Train set,Test set
Blip,88,18
Chirp,1,0
Extremely_Loud,1,0
Koi_Fish,1,0
Light_Modulation,16,7
Low_Frequency_Burst,150,43
Low_Frequency_Lines,129,26
No_Glitch,37,7
None_of_the_Above,4,1
Paired_Doves,1,0


In [19]:
sample_rate=34864

In [20]:
batch_size = 32

if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    #collate_fn=collate_fn,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
    #collate_fn=collate_fn,
    pin_memory=pin_memory,
)

In [21]:
def evaluate_accuracy(model,transform, dataloader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data in dataloader:
            images, y = data
            if transform:
                images=transform(images.to(device))
            else:
                images=images.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)  
            total += y.size(0) 
            correct += (predicted == y.to(device)).sum().item()
    accuracy = 100 * correct / total  
    return accuracy

def accuracy_classes(net,transform,dataloader,classes): 
    correct_pred = {classname: 0 for classname in classes}
    total_pred = {classname: 0 for classname in classes}
    net.eval()
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            if transform:
                images=transform(images.to(device))
            else:
                images=images.to(device)
            outputs = net(images)
            _, predictions = torch.max(outputs, 1)
            for label, prediction in zip(labels.to(device), predictions):
                if label == prediction:
                    correct_pred[classes[label]] += 1
                total_pred[classes[label]] += 1

    for classname, correct_count in correct_pred.items():
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
     

In [22]:
def training(net,transform,optimizer,criterion,train_loader,test_loader,num_epochs):
  history={'loss':[],'train_accuracy':[],'val_accuracy':[]}
  for epoch in tqdm(range(num_epochs)):
      running_loss = 0.0
      running_corrects = 0.0
      net.train()
      for i, data in enumerate(train_loader, 0):
          inputs, y = data
          optimizer.zero_grad()
          if transform:
            inputs=transform(inputs.to(device))
          else:
            inputs=inputs.to(device)
          outputs = net(inputs)

          loss = criterion(outputs, y.to(device))
          loss.backward()
          optimizer.step()
 
          _, predicted = torch.max(outputs.data.cpu(), 1)
          running_corrects += torch.sum(predicted == y.data).item()
        
          running_loss += loss.item() * inputs.size(0)
        

      epoch_loss = running_loss / len(train_loader.dataset)
      epoch_acc = 100.0*running_corrects / len(train_loader.dataset)
      val_acc=evaluate_accuracy(net,transform,test_loader)
      history['loss'].append(epoch_loss)
      history['train_accuracy'].append(epoch_acc)
      history['val_accuracy'].append(val_acc)
      print('Epoch {} Loss: {:.4f} Acc: {:.4f}% Val Acc: {:.4f}%'.format(epoch+1, epoch_loss, epoch_acc, val_acc))  
  return history

In [23]:
class LogisticRegression(nn.Module):
    def __init__(self, n_input, n_output=18):
        super().__init__()
        self.fc1 = nn.Linear(n_input, n_output)
    def forward(self, x):
        x=x.view(x.shape[0],-1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=1).squeeze()

In [24]:
T=20000

In [25]:
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=T).to(device)

In [26]:
len(labels)

18

In [27]:
model = LogisticRegression(n_input=T, n_output=len(labels)).to(device)    

In [28]:
criterion = nn.NLLLoss()

In [29]:
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.001)

In [30]:
summary(model,(1,T))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 18]         360,018
Total params: 360,018
Trainable params: 360,018
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.08
Forward/backward pass size (MB): 0.00
Params size (MB): 1.37
Estimated Total Size (MB): 1.45
----------------------------------------------------------------


In [31]:
num_epochs=100

In [32]:
history=training(model,transform,optimizer,criterion,train_loader,test_loader,num_epochs)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1 Loss: 2.9229 Acc: 8.0808% Val Acc: 9.5000%
Epoch 2 Loss: 2.5317 Acc: 45.6790% Val Acc: 14.0000%
Epoch 3 Loss: 2.2990 Acc: 62.6263% Val Acc: 14.5000%
Epoch 4 Loss: 2.1229 Acc: 67.5645% Val Acc: 13.5000%
Epoch 5 Loss: 1.9768 Acc: 70.1459% Val Acc: 13.5000%
Epoch 6 Loss: 1.8535 Acc: 74.5230% Val Acc: 13.5000%
Epoch 7 Loss: 1.7481 Acc: 76.3187% Val Acc: 13.5000%
Epoch 8 Loss: 1.6529 Acc: 78.0022% Val Acc: 14.0000%
Epoch 9 Loss: 1.5690 Acc: 79.7980% Val Acc: 13.0000%
Epoch 10 Loss: 1.4933 Acc: 81.3692% Val Acc: 13.0000%
Epoch 11 Loss: 1.4249 Acc: 82.2671% Val Acc: 13.5000%
Epoch 12 Loss: 1.3619 Acc: 83.8384% Val Acc: 13.5000%
Epoch 13 Loss: 1.3044 Acc: 85.2974% Val Acc: 13.5000%
Epoch 14 Loss: 1.2517 Acc: 87.0932% Val Acc: 13.5000%
Epoch 15 Loss: 1.2028 Acc: 87.2054% Val Acc: 13.5000%
Epoch 16 Loss: 1.1566 Acc: 88.2155% Val Acc: 13.5000%
Epoch 17 Loss: 1.1139 Acc: 88.6644% Val Acc: 13.5000%
Epoch 18 Loss: 1.0745 Acc: 89.5623% Val Acc: 13.5000%
Epoch 19 Loss: 1.0369 Acc: 90.1235% Val

In [33]:
accuracy_classes(model,transform,train_loader,labels)

Accuracy for class: Blip  is 100.0 %
Accuracy for class: Chirp is 100.0 %
Accuracy for class: Extremely_Loud is 100.0 %
Accuracy for class: Koi_Fish is 100.0 %
Accuracy for class: Light_Modulation is 100.0 %
Accuracy for class: Low_Frequency_Burst is 100.0 %
Accuracy for class: Low_Frequency_Lines is 100.0 %
Accuracy for class: No_Glitch is 100.0 %
Accuracy for class: None_of_the_Above is 100.0 %
Accuracy for class: Paired_Doves is 100.0 %
Accuracy for class: Power_Line is 100.0 %
Accuracy for class: Repeating_Blips is 100.0 %
Accuracy for class: Scattered_Light is 100.0 %
Accuracy for class: Scratchy is 100.0 %
Accuracy for class: Tomte is 100.0 %
Accuracy for class: Violin_Mode is 100.0 %
Accuracy for class: Wandering_Line is 100.0 %
Accuracy for class: Whistle is 100.0 %


In [None]:
accuracy_classes(model,transform,test_loader,labels)

Accuracy for class: Blip  is 33.3 %


ZeroDivisionError: ignored

In [None]:
#LOGSCATTERING