<a href="https://colab.research.google.com/github/FrancescoSarandrea/Audio_GW/blob/main/GS_TorchLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Simple notebook to convert gravity spy samples to pytorch datasets, an implementation of a logistic regression is also provided.

In [3]:
from google.colab import drive
drive.mount('/content/drive')  

Mounted at /content/drive


In [4]:
try:
    from gwpy.timeseries import TimeSeries
except ModuleNotFoundError: 
    !pip install --quiet gwpy
    from gwpy.timeseries import TimeSeries

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ligo-segments (setup.py) ... [?25l[?25hdone


In [5]:
import os
import h5py as h5
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
import sys
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from torchsummary import summary

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [18]:
def construct_dataframe(path,t_delta=None):
  name_data=[]
  starting_times_data=[]
  timeseries_data=[]
  type_data=[]
  for file in listdir(path):
    fout=h5.File(path+file,'r') 
    type_of_glitch=list(fout.keys())[0]
    dataset=fout[type_of_glitch]
    t = TimeSeries(dataset[()])
    ln=len(t)/2
    #if no t_delta is provided, the whole the timeseries are not cut 
    if t_delta==None:
      t_delta=ln
    name_data.append(file.split('.')[0].split('_')[0])
    type_data.append(type_of_glitch)
    starting_times_data.append(dataset.attrs['t0'])
    dataset_cut=dataset[()][int(ln-t_delta):int(ln+t_delta)]
    timeseries_data.append(dataset_cut)
    # intialise data of lists.
  data_Virgo = {'ID':name_data,
        'Type_Of_Glitch':type_data,
        'Starting_Times':starting_times_data,
        'TimeSeries':timeseries_data}
    # Create DataFrame
  df= pd.DataFrame(data_Virgo)
  return df

def check_remove_nanDF(dataframe, col='TimeSeries'):
    val=dataframe[col].values
    idx=[]
    for i in range(len(val)):
        if np.isnan(val[i])[0]==True:
            idx+=[i]
    print("removed timeseries:", len(idx))
    return dataframe.drop(labels=idx,axis=0)


def build_torchDataset(Dataset, val_shape=200, norm=True):
    X,y=Dataset.T
    data=[]
    label=[]
    for element in X:
        data+= [element]
    for i in y:
        label+=[i]
    X_t=torch.tensor(np.asarray(data)).unsqueeze(1)
    y_t=torch.tensor(label).long()
    if norm:
        X_t=(X_t-X_t.mean(0))/X_t.std(0)
    X_train, X_val=X_t[:-val_shape], X_t[-val_shape:]
    y_train, y_val=y_t[:-val_shape], y_t[-val_shape:]
    return TensorDataset(X_train,y_train), TensorDataset(X_val,y_val)

def count_classes(dataset):
    classes=torch.empty(1)
    for _, label in dataset:
        classes=torch.cat((classes,label.view(1)))
    #print(classes)
    label=torch.arange(0,classes.max()+1,1)
    freq=torch.bincount(classes.long())
    counts=torch.vstack((label,freq))
    return pd.DataFrame(freq)

def check_classes(train,val,labels):
    df1=count_classes(train)
    df1.rename(columns = {0:'Train set'},inplace=True)
    df2=count_classes(val)
    df2.rename(columns = {0:'Test set'},inplace=True)
    df3=pd.concat([df1,df2],axis=1)
    df3.index = labels
    return df3

In [8]:
!unzip -q /content/drive/MyDrive/GW_ML/Gravity_Spy_Glitches_whitened_1.zip 

In [9]:
df_LIGO=construct_dataframe('/content/content/sample_data/Gravity_Spy/')

In [10]:
df_LIGO.head()

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries
0,ETeyjFLNXi,No_Glitch,1126766000.0,"[-0.07380686, 3.3789918, 2.515241, 0.77137023,..."
1,hPyYFvFLBV,No_Glitch,1135921000.0,"[-5.61079, 32.08648, -12.678058, 20.615831, -5..."
2,cuiN0cvLl3,Low_Frequency_Burst,1134997000.0,"[-0.0029804548, -5.6829243, 7.361643, 5.990745..."
3,zJ8niJL4Cy,Low_Frequency_Lines,1134565000.0,"[-77.5008, 92.88992, -75.61221, 85.37933, -65...."
4,vQLnNoG549,Scattered_Light,1134176000.0,"[0.5169164, 4.872511, 1.070817, 7.105692, 7.82..."


In [11]:
df_LIGO=check_remove_nanDF(df_LIGO)

removed timeseries: 4


In [12]:
df_LIGO.Type_Of_Glitch = pd.Categorical(df_LIGO.Type_Of_Glitch)

In [13]:
df_LIGO['code']=df_LIGO.Type_Of_Glitch.cat.codes

In [14]:
df_LIGO.head()

Unnamed: 0,ID,Type_Of_Glitch,Starting_Times,TimeSeries,code
0,ETeyjFLNXi,No_Glitch,1126766000.0,"[-0.07380686, 3.3789918, 2.515241, 0.77137023,...",7
1,hPyYFvFLBV,No_Glitch,1135921000.0,"[-5.61079, 32.08648, -12.678058, 20.615831, -5...",7
2,cuiN0cvLl3,Low_Frequency_Burst,1134997000.0,"[-0.0029804548, -5.6829243, 7.361643, 5.990745...",5
3,zJ8niJL4Cy,Low_Frequency_Lines,1134565000.0,"[-77.5008, 92.88992, -75.61221, 85.37933, -65....",6
4,vQLnNoG549,Scattered_Light,1134176000.0,"[0.5169164, 4.872511, 1.070817, 7.105692, 7.82...",12


In [15]:
labels = sorted(list(set(datapoint[1] for datapoint in df_LIGO.to_numpy())))

In [16]:
labels

['Blip',
 'Chirp',
 'Extremely_Loud',
 'Koi_Fish',
 'Light_Modulation',
 'Low_Frequency_Burst',
 'Low_Frequency_Lines',
 'No_Glitch',
 'None_of_the_Above',
 'Paired_Doves',
 'Power_Line',
 'Repeating_Blips',
 'Scattered_Light',
 'Scratchy',
 'Tomte',
 'Violin_Mode',
 'Wandering_Line',
 'Whistle']

In [17]:
#Count of classes:
df1 = df_LIGO['Type_Of_Glitch'].value_counts()
print(df1)
del df1

Low_Frequency_Burst    193
Scattered_Light        177
Low_Frequency_Lines    155
Power_Line             117
Blip                   104
Scratchy                85
Whistle                 80
Tomte                   74
No_Glitch               44
Repeating_Blips         26
Light_Modulation        23
None_of_the_Above        5
Wandering_Line           3
Chirp                    1
Koi_Fish                 1
Extremely_Loud           1
Violin_Mode              1
Paired_Doves             1
Name: Type_Of_Glitch, dtype: int64


In [19]:
train_set, test_set = build_torchDataset(df_LIGO.drop(columns=['ID', 'Type_Of_Glitch', 'Starting_Times']).values)

In [24]:
check_classes(train_set, test_set,labels) #If this cell give an overflow error, run it again untill it works, idk why

Unnamed: 0,Train set,Test set
Blip,84,22
Chirp,1,0
Extremely_Loud,1,0
Koi_Fish,0,1
Light_Modulation,21,2
Low_Frequency_Burst,157,36
Low_Frequency_Lines,133,22
No_Glitch,32,12
None_of_the_Above,5,0
Paired_Doves,0,1


In [24]:
sample_rate=34864

In [25]:
batch_size = 32

if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    #collate_fn=collate_fn,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers,
    #collate_fn=collate_fn,
    pin_memory=pin_memory,
)

In [26]:
def evaluate_accuracy(model,transform, dataloader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data in dataloader:
            images, y = data
            if transform:
                images=transform(images.to(device))
            else:
                images=images.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)  
            total += y.size(0) 
            correct += (predicted == y.to(device)).sum().item()
    accuracy = 100 * correct / total  
    return accuracy

def accuracy_classes(net,transform,dataloader,classes): 
    correct_pred = {classname: 0 for classname in classes}
    total_pred = {classname: 0 for classname in classes}
    net.eval()
    with torch.no_grad():
        for data in dataloader:
            images, labels = data
            if transform:
                images=transform(images.to(device))
            else:
                images=images.to(device)
            outputs = net(images)
            _, predictions = torch.max(outputs, 1)
            for label, prediction in zip(labels.to(device), predictions):
                if label == prediction:
                    correct_pred[classes[label]] += 1
                total_pred[classes[label]] += 1

    for classname, correct_count in correct_pred.items():
        accuracy = 100 * float(correct_count) / total_pred[classname]
        print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
     

In [27]:
def training(net,transform,optimizer,criterion,train_loader,test_loader,num_epochs):
  history={'loss':[],'train_accuracy':[],'val_accuracy':[]}
  for epoch in tqdm(range(num_epochs)):
      running_loss = 0.0
      running_corrects = 0.0
      net.train()
      for i, data in enumerate(train_loader, 0):
          inputs, y = data
          optimizer.zero_grad()
          if transform:
            inputs=transform(inputs.to(device))
          else:
            inputs=inputs.to(device)
          outputs = net(inputs)

          loss = criterion(outputs, y.to(device))
          loss.backward()
          optimizer.step()
 
          _, predicted = torch.max(outputs.data.cpu(), 1)
          running_corrects += torch.sum(predicted == y.data).item()
        
          running_loss += loss.item() * inputs.size(0)
        

      epoch_loss = running_loss / len(train_loader.dataset)
      epoch_acc = 100.0*running_corrects / len(train_loader.dataset)
      val_acc=evaluate_accuracy(net,transform,test_loader)
      history['loss'].append(epoch_loss)
      history['train_accuracy'].append(epoch_acc)
      history['val_accuracy'].append(val_acc)
      print('Epoch {} Loss: {:.4f} Acc: {:.4f}% Val Acc: {:.4f}%'.format(epoch+1, epoch_loss, epoch_acc, val_acc))  
  return history

In [28]:
class LogisticRegression(nn.Module):
    def __init__(self, n_input, n_output=18):
        super().__init__()
        self.fc1 = nn.Linear(n_input, n_output)
    def forward(self, x):
        x=x.view(x.shape[0],-1)
        x = self.fc1(x)
        return F.log_softmax(x, dim=1).squeeze()

In [29]:
T=20000

In [30]:
transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=T).to(device)

In [36]:
model = LogisticRegression(n_input=T, n_output=18).to(device)    

In [37]:
criterion = nn.NLLLoss()

In [38]:
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=0.001)

In [39]:
summary(model,(1,T))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 18]         360,018
Total params: 360,018
Trainable params: 360,018
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.08
Forward/backward pass size (MB): 0.00
Params size (MB): 1.37
Estimated Total Size (MB): 1.45
----------------------------------------------------------------


In [40]:
num_epochs=100

In [41]:
history=training(model,transform,optimizer,criterion,train_loader,test_loader,num_epochs)

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1 Loss: 2.9160 Acc: 6.9585% Val Acc: 6.5000%
Epoch 2 Loss: 2.5393 Acc: 44.2200% Val Acc: 11.0000%
Epoch 3 Loss: 2.3061 Acc: 62.4018% Val Acc: 13.5000%
Epoch 4 Loss: 2.1284 Acc: 70.4826% Val Acc: 15.5000%
Epoch 5 Loss: 1.9826 Acc: 73.9618% Val Acc: 16.0000%
Epoch 6 Loss: 1.8610 Acc: 77.2166% Val Acc: 16.0000%
Epoch 7 Loss: 1.7524 Acc: 78.3389% Val Acc: 17.0000%
Epoch 8 Loss: 1.6574 Acc: 80.1347% Val Acc: 15.5000%
Epoch 9 Loss: 1.5741 Acc: 81.5937% Val Acc: 15.5000%
Epoch 10 Loss: 1.4976 Acc: 83.3895% Val Acc: 16.0000%
Epoch 11 Loss: 1.4286 Acc: 83.8384% Val Acc: 16.0000%
Epoch 12 Loss: 1.3659 Acc: 85.0730% Val Acc: 16.5000%
Epoch 13 Loss: 1.3093 Acc: 86.4198% Val Acc: 16.5000%
Epoch 14 Loss: 1.2562 Acc: 87.4299% Val Acc: 17.5000%
Epoch 15 Loss: 1.2059 Acc: 88.2155% Val Acc: 17.0000%
Epoch 16 Loss: 1.1606 Acc: 88.7767% Val Acc: 17.5000%
Epoch 17 Loss: 1.1191 Acc: 89.7868% Val Acc: 18.0000%
Epoch 18 Loss: 1.0786 Acc: 91.0213% Val Acc: 17.5000%
Epoch 19 Loss: 1.0406 Acc: 91.5825% Val

In [42]:
accuracy_classes(model,transform,train_loader,labels)

Accuracy for class: Blip  is 100.0 %
Accuracy for class: Chirp is 100.0 %
Accuracy for class: Extremely_Loud is 100.0 %


ZeroDivisionError: ignored

In [43]:
accuracy_classes(model,transform,test_loader,labels)

Accuracy for class: Blip  is 33.3 %


ZeroDivisionError: ignored

In [None]:
#LOGSCATTERING