# CNN implementation

In [1]:
import os

#os.chdir("/drive/MyDrive/data")

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from torchvision.transforms import Compose, ToTensor, RandomAffine, RandomHorizontalFlip, RandomVerticalFlip, ColorJitter
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

import random
import utils

plt.rcParams['figure.figsize'] = (17, 5)

sr = 22050

  "class": algorithms.Blowfish,
  warn(


In [23]:

import warnings

# Disable all warnings
warnings.filterwarnings("ignore")


In [5]:
def saveheavy(a, name, n):  #functions to save havy dataframes in multiple files
    l= len(a)
    if(n>1):
        for i in range(n-1):
            a_i = a[int((l/n)*i):int((l/n)*(i+1))]
            np.save(f'{name}_{i+1}.npy', a_i)
    a_i = a[int((l/n)*(n-1)):l]
    np.save(f'{name}_{n}.npy', a_i)
    
def readheavy(name, n, Dir):
    a = np.array([0,0])
    for i in range(n):
        new_a = np.load(f'Data/{Dir}/{name}_{i+1}.npy', allow_pickle = True)
        a = np.vstack([a, new_a])
    return a[1:]

In [6]:
#Restore Datasets

test = readheavy('test', 2, 'Audio')
training = readheavy('training', 16, 'Audio')

In [11]:
training.shape

(6394, 2)

In [7]:
def get_stft(a):
    for j in range(len(a)):
        audio = a[j, 0]
        stft = np.abs(librosa.stft(audio, n_fft=1024, hop_length=512))
        a[j ,0] = stft
    return a

def get_mel(a):
    for j in range(len(a)):
        audio = a[j,0]
        stft = np.abs(librosa.stft(audio, n_fft=1024, hop_length=512))
        mel = mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
        a[j,0] = mel
    return a

In [8]:
#Now I convert Audio into stft data

test = get_stft(test)

In [None]:
validation = get_stft(validation)
training = get_stft(training)

In [9]:
def clip_stft(a, n_samples):
    a_clip = np.array([0,0])
    for j in range(len(a)):
        full = a[j, 0].T
        n=0
        while (n<(len(full)-n_samples)):
            clip = full[n: (n+n_samples)]
            y = a[j, 1]
            new_row = np.array([clip, new_row])
            a_clip = np.vstack([a_clip, new_row)
            n+=int(n_samples/2)
    return a_clip[1:]

def clip_audio(df, n_samples):
    a_clip = np.array([0,0])
    for j in range(len(a)):
        full = a[j, 0]
        n=0
        while (n<(len(full)-n_samples)):
            clip = full[n: (n+n_samples)]
            y = a[j, 1]
            new_row = np.array([clip, new_row])
            a_clip = np.vstack([a_clip, new_row)
            n+=int(n_samples/2)
    return a_clip[1:]
          

In [10]:
test_clip = clip_stft(test, 128)

In [10]:
validation_clip = clip_stft(validation, 128)
training_clip = clip_stft(training, 128)

In [11]:
test_clip['stft'][0].shape

(128, 513)

In [12]:
from torch.utils.data import Dataset, DataLoader

#Class for the creation of torch manageble datasets, with Format one can select the desired input column 
class DataAudio(Dataset):

    def __init__(self, split, transform=None):
        self.x = split[0]
        self.y = split[1]
        self.transform = transform

    def __len__(self):
        #return min(32000, len(self.x))
        return len(self.x)

    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx, 0, 0].astype(float)
        if self.transform:
            x = self.transform(x)
        return x, y

In [13]:
test_clip['stft'][1].shape

(128, 513)

In [14]:
transforms = Compose([
    ToTensor(), #this converts numpy or Pil image to torch tensor and normalizes it in 0, 1
    #RandomAffine((0.05, 0.05)),
    #RandomHorizontalFlip(),      #Per ruotare immagini
    #RandomVerticalFlip()
])

In [15]:
#Creation of torch suited dataset classes  (change string 'mel' to select desired Format)
test_dataset = DataAudio(test_clip, 'stft',transforms)


In [None]:
validation_dataset = DataAudio(validation_clip, 'stft',transforms)
training_dataset = DataAudio(training_clip, 'stft',transforms)

In [16]:
#Creation of dataloader classes
batch_size = 10
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=os.cpu_count())


In [None]:
validation_dataloader = DataLoader(validation_dataset, batch_size, shuffle=False, num_workers=os.cpu_count())
training_dataloader = DataLoader(training_dataset, batch_size, shuffle=False, num_workers=os.cpu_count())

In [17]:
# Remember to add dropout layers 

class NNET1(nn.Module):
    
    def __init__(self):
        super(NNET1, self).__init__()
        
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=128,kernel_size=(4,513)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,1)),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(4,1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2,1)),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(4,1)),
            nn.ReLU(),
        )

        # Input of fc1 is 256

        self.fc = nn.Sequential(
            nn.Linear(256, 300),
            nn.ReLU(),
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 10),
            nn.Softmax(dim=1)
        )
        
    def forward(self, x):
        x = self.conv(x)
        max_pool = F.max_pool2d(x, kernel_size=(26,1))
        avg_pool = F.avg_pool2d(x, kernel_size=(26,1))
        x = max_pool + avg_pool
        x = self.fc(x.view(-1, 256))
        
        return x

In [18]:
class NNET2(nn.Module):
        
    def __init__(self):
        super(NNET2, self).__init__()
        
        
        self.c1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=256,kernel_size=(4,513)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.c2 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(4, 1),padding=(2,0)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.c3 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(4, 1),padding=(1,0)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        
        self.fc = nn.Sequential(
            nn.Linear(256, 300),
            nn.ReLU(),
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 10),
            nn.Softmax(dim=1)
        )

    def forward(self,x):
        
        c1 = self.c1(x)
        c2 = self.c2(c1)
        c3 = self.c3(c2)
        x = c1 + c3
        max_pool = F.max_pool2d(x, kernel_size=(125,1))
        avg_pool = F.avg_pool2d(x, kernel_size=(125,1))
        x = max_pool + avg_pool
        x = self.fc(x.view(-1, 256))
        return x 




In [19]:
# Fake spectrogram 128x513
x = torch.randn(1, 1, 128, 513)

# Create model
model = NNET2()


# Forward pass
output = model(x)

# Print output shape
print(output.shape)

torch.Size([1, 10])


In [21]:
len(test_dataloader.dataset)

38

In [54]:
test_dataloader.dataset.x

0     [[0.00021080021, 0.00020972929, 0.00020654705,...
1     [[2.1542284, 3.2158666, 22.255291, 23.587381, ...
2     [[1.5419899, 8.78172, 25.200851, 50.310005, 74...
3     [[2.275202, 2.9461339, 5.1720843, 12.862928, 1...
4     [[2.190529, 3.119165, 4.2509694, 18.54553, 37....
5     [[0.554729, 13.509325, 27.020035, 20.695541, 1...
6     [[2.057228, 4.084349, 5.652993, 8.063932, 8.70...
7     [[1.0614991, 4.8038635, 10.438138, 32.33236, 3...
8     [[0.9206511, 9.127373, 29.346125, 32.353477, 1...
9     [[2.5374625, 6.8586917, 44.346695, 50.825443, ...
10    [[0.24397269, 1.8173068, 12.106457, 21.054564,...
11    [[1.2853845, 21.224907, 61.50625, 53.425743, 2...
12    [[0.34479684, 3.5025811, 13.639755, 16.223742,...
13    [[0.22271799, 5.5677457, 3.4634311, 21.67279, ...
14    [[0.9326537, 9.896021, 32.207104, 28.57739, 32...
15    [[0.39727607, 16.838356, 37.255775, 50.28999, ...
16    [[1.1973219, 10.425424, 31.496302, 33.228195, ...
17    [[0.080669165, 21.352627, 68.46717, 56.739

In [None]:
sample = next(iter(test_dataloader))
print(sample)

In [None]:
from torch.optim import SGD, Adam, Adadelta
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from tqdm import tqdm

#iterator = tqdm(test_dataloader)
for batch_x, batch_y in test_dataloader:
#    batch_x = batch_x.to(device)
#    batch_y = batch_y.to(device)
    print(batch_x)
    print(batch_y)

In [None]:
from torch.optim import SGD, Adam, Adadelta
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from tqdm import tqdm



opt = Adadelta(model.parameters(), lr=1e-2, weight_decay = 0)
loss_fn = CrossEntropyLoss()
device = "cuda" if torch.cuda.is_available() else "cpu"

model.to(device)
epochs=2
best_val = np.inf
for epoch in range(epochs):
    model.train()
    print(f"Epoch: {epoch+1}")
    iterator = tqdm(test_dataloader)
    for batch_x, batch_y in iterator:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        y_pred = model(batch_x)

        loss = loss_fn(y_pred, batch_y)

        opt.zero_grad()
        loss.backward()
        opt.step()
        iterator.set_description(f"Train loss: {loss.detach().cpu().numpy()}")

    model.eval()
    with torch.no_grad():
        predictions = []
        true = []
        for batch_x, batch_y in tqdm(validation_dataloader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            y_pred = model(batch_x)

            predictions.append(y_pred)
            true.append(batch_y)
        predictions = torch.cat(predictions, axis=0)
        true = torch.cat(true, axis=0)
        val_loss = loss_fn(predictions, true)
        val_acc = (torch.sigmoid(predictions).round() == true).float().mean()
        print(f"loss: {val_loss}, accuracy: {val_acc}")
    
    if val_loss < best_val:
        print("Saved Model")
        torch.save(model.state_dict(), "model.pt")
        best_val = val_loss

Epoch: 1


  0%|                                                                                          | 0/190 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_curve, roc_auc_score

def evaluate_network(dataloader, model, data_split):
    model.eval()
    with torch.no_grad():
        predictions = []
        true = []
        for batch_x, batch_y in tqdm(dataloader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            y_pred = model(batch_x)

            predictions.append(y_pred)
            true.append(batch_y)
        predictions = torch.cat(predictions, axis=0)
        true = torch.cat(true, axis=0)
        loss = loss_fn(predictions, true).detach().cpu().numpy()
        predictions = torch.sigmoid(predictions).detach().cpu().numpy()
        true = true.detach().cpu().numpy()

        fpr, tpr, thresholds = roc_curve(true, predictions)
        auc = roc_auc_score(true, predictions)
        predictions = predictions.round()
        precision, recall, fscore, _= precision_recall_fscore_support(true, predictions, average='binary')
        accuracy = accuracy_score(true, predictions)

        print(f"{data_split} loss: {loss}, accuracy: {accuracy}, precision: {precision}, recall: {recall}, f1: {fscore}, roc_auc: {auc}")

        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{data_split} receiver operating characteristic (ROC)')
        plt.legend(loc="lower right")