# CNN implementation

In [1]:
import os

#os.chdir("/drive/MyDrive/data")

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from torchvision.transforms import Compose, ToTensor, RandomAffine, RandomHorizontalFlip, RandomVerticalFlip, ColorJitter
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

import random
import utils

# Default size (width, height in inches) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (17, 5)})

  "class": algorithms.Blowfish,
  warn(


In [2]:

import warnings

# Disable all warnings
warnings.filterwarnings("ignore")


In [4]:
def saveheavy(df, name, n):
    """Save a heavy DataFrame as ``n`` consecutive HDF5 chunk files.

    Chunk ``i`` (1-based) is written to ``f'{name}_{i}.h5'`` under the key
    ``'x'`` with ``mode='w'``. It holds the rows from
    ``int(len(df) / n * (i - 1))`` up to ``int(len(df) / n * i)``; the last
    chunk always runs to the end of ``df`` so rounding never drops a row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split (anything supporting ``len``, ``.iloc`` slicing and
        ``.to_hdf`` works).
    name : str
        Path prefix of the output files.
    n : int
        Number of files to write; must be at least 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    total = len(df)
    # Same float arithmetic as the historical implementation, so files written
    # by this version have exactly the same chunk boundaries.
    bounds = [int((total / n) * i) for i in range(n)] + [total]
    for i in range(n):
        # `key` is passed by keyword: newer pandas makes every to_hdf argument
        # except the path keyword-only.
        df.iloc[bounds[i]:bounds[i + 1]].to_hdf(f'{name}_{i + 1}.h5', key='x', mode='w')
    
def readheavy(name, n, column, Dir, base_dir='Data'):
    """Load a DataFrame that was stored as ``n`` HDF5 chunk files.

    Reads ``f'{base_dir}/{Dir}/{name}_{i}.h5'`` (key ``'x'``) for
    ``i = 1 .. n`` and stacks the chunks in that order with a fresh
    0..N-1 index.

    Parameters
    ----------
    name : str
        File-name prefix of the chunks.
    n : int
        Number of chunk files to read.
    column : list
        Columns the result must expose. They come first in the result and are
        created (filled with NaN) if the files do not contain them.
    Dir : str
        Sub-directory of ``base_dir`` holding the chunks.
    base_dir : str, optional
        Root data directory. Defaults to ``'Data'``, the previously
        hard-coded location.

    Returns
    -------
    pandas.DataFrame
        The concatenated chunks, or an empty frame with ``column`` as columns
        when ``n`` is not positive.
    """
    frames = [
        pd.read_hdf(f'{base_dir}/{Dir}/{name}_{i + 1}.h5', key='x')
        for i in range(n)
    ]
    if not frames:
        return pd.DataFrame(columns=column)

    # One concat over the collected chunks instead of re-copying the growing
    # result on every iteration.
    result = pd.concat(frames, ignore_index=True)

    # Requested columns first, then whatever else the files contained.
    ordered = list(column) + [c for c in result.columns if c not in column]
    if list(result.columns) != ordered:
        result = result.reindex(columns=ordered)
    return result

In [5]:
# Restore the test split from its two HDF5 chunks under Data/Audio.
test = readheavy(name='test', n=2, column=['audio', 'y'], Dir='Audio')


In [8]:
# Number of rows of the first spectrogram in the test split.
# NOTE(review): the 'stft' column only exists once the later get_stft cell has
# been run, so this cell fails on a fresh top-to-bottom execution.
len(test.loc[0, 'stft'])

513

In [7]:
# Restore the validation split (2 chunks) and the training split (16 chunks).
validation = readheavy(name='validation', n=2, column=['audio', 'y'], Dir='Audio')
training = readheavy(name='training', n=16, column=['audio', 'y'], Dir='Audio')

In [None]:
def get_stft(data, n_fft=1024, hop_length=512):
    """Replace each waveform in ``data`` by its STFT magnitude.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'audio'`` column (one waveform per row) and a
        ``'y'`` column (the label of that row).
    n_fft : int, optional
        FFT window size handed to ``librosa.stft`` (default 1024).
    hop_length : int, optional
        Hop between successive frames handed to ``librosa.stft``
        (default 512).

    Returns
    -------
    pandas.DataFrame
        Columns ``['stft', 'y']`` with a 0..N-1 index: ``'stft'`` holds
        ``np.abs(librosa.stft(audio, ...))`` for the row and ``'y'`` its
        unchanged label.
    """
    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call. Iterating the columns directly also drops the old
    # requirement that `data` be indexed 0..N-1.
    records = [
        {
            'stft': np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)),
            'y': label,
        }
        for audio, label in zip(data['audio'], data['y'])
    ]
    return pd.DataFrame(records, columns=['stft', 'y'])

def get_mel(data, n_fft=1024, hop_length=512):
    """Build the ``'mel'`` feature frame from the waveforms in ``data``.

    NOTE(review): despite its name this function stores the plain STFT
    magnitude (the same values ``get_stft`` produces) under ``'mel'``; no mel
    filterbank is applied. That behaviour is kept as-is because the sample
    rate a mel projection needs is not available here. Confirm the intent
    before relying on this column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'audio'`` column (one waveform per row) and a
        ``'y'`` column (the label of that row).
    n_fft : int, optional
        FFT window size handed to ``librosa.stft`` (default 1024).
    hop_length : int, optional
        Hop between successive frames handed to ``librosa.stft``
        (default 512).

    Returns
    -------
    pandas.DataFrame
        Columns ``['mel', 'y']`` with a 0..N-1 index.
    """
    # Build the frame once from a list of rows: DataFrame.append no longer
    # exists in pandas 2.x and was quadratic when called per row.
    records = [
        {
            'mel': np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)),
            'y': label,
        }
        for audio, label in zip(data['audio'], data['y'])
    ]
    return pd.DataFrame(records, columns=['mel', 'y'])

In [21]:
# Convert the raw audio of every split into STFT magnitude spectrograms.
# Each name is rebound to the converted frame, so this cell cannot be re-run
# without reloading the splits first.
test = test.pipe(get_stft)
validation = validation.pipe(get_stft)
training = training.pipe(get_stft)


In [10]:
def clip_stft(df, n_samples):
    """Cut every spectrogram of ``df`` into half-overlapping fixed-length clips.

    The first column of ``df`` holds the spectrograms. Each one is transposed
    (so that its second axis becomes the clip axis) and sliced into windows of
    ``n_samples`` rows that start every ``n_samples // 2`` rows. Every window
    that fits completely is kept, including the one ending exactly on the last
    row; spectrograms shorter than ``n_samples`` contribute no clip.

    Parameters
    ----------
    df : pandas.DataFrame
        First column: 2-D arrays to clip; column ``'y'``: label of each row.
    n_samples : int
        Number of rows (after transposition) in each clip; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df``, one row per clip with a 0..N-1 index; every
        clip carries the label of the spectrogram it was cut from.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be a positive integer, got {n_samples!r}")

    feature_col = df.columns[0]
    # A step of 0 (n_samples == 1) would never advance, so clamp it to 1.
    step = max(1, n_samples // 2)

    records = []
    for spectrogram, label in zip(df[feature_col], df['y']):
        frames = spectrogram.transpose()
        # "+ 1" keeps the last window that still fits entirely.
        for start in range(0, len(frames) - n_samples + 1, step):
            records.append({feature_col: frames[start:start + n_samples], 'y': label})

    # Built once from the collected rows; DataFrame.append is gone in pandas 2.x.
    return pd.DataFrame(records, columns=list(df.columns))

def clip_audio(df, n_samples):
    """Cut every waveform of ``df`` into half-overlapping fixed-length clips.

    The first column of ``df`` holds the waveforms. Each one is sliced along
    its first axis into windows of ``n_samples`` items that start every
    ``n_samples // 2`` items. Every window that fits completely is kept,
    including the one ending exactly on the last item; waveforms shorter than
    ``n_samples`` contribute no clip.

    Parameters
    ----------
    df : pandas.DataFrame
        First column: sequences to clip; column ``'y'``: label of each row.
    n_samples : int
        Length of each clip; must be >= 1.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df``, one row per clip with a 0..N-1 index; every
        clip carries the label of the waveform it was cut from.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be a positive integer, got {n_samples!r}")

    feature_col = df.columns[0]
    # A step of 0 (n_samples == 1) would never advance, so clamp it to 1.
    step = max(1, n_samples // 2)

    records = []
    for waveform, label in zip(df[feature_col], df['y']):
        # "+ 1" keeps the last window that still fits entirely.
        for start in range(0, len(waveform) - n_samples + 1, step):
            records.append({feature_col: waveform[start:start + n_samples], 'y': label})

    # Built once from the collected rows; DataFrame.append is gone in pandas 2.x.
    return pd.DataFrame(records, columns=list(df.columns))
          

In [11]:
# Cut the test spectrograms into 128-frame clips that overlap by half.
test_clip = clip_stft(test, 128)

In [12]:
# Same 128-frame, half-overlapping clipping for the validation and training spectrograms.
validation_clip = clip_stft(validation, 128)
training_clip = clip_stft(training, 128)

In [15]:
# Number of training clips (shape of the 'stft' column).
training_clip.loc[:, 'stft'].shape

(121486,)

In [16]:
from torch.utils.data import Dataset, DataLoader

#Class for the creation of torch-manageable datasets; Format selects the desired input column
class DataAudio(Dataset):
    """Torch ``Dataset`` serving ``(feature, label)`` pairs from one data split.

    Parameters
    ----------
    split : mapping
        Anything indexable by column name (e.g. a pandas DataFrame or a dict
        of arrays) that provides the feature column ``Format`` and a ``'y'``
        label column.
    Format : str
        Name of the feature column to serve (e.g. ``'stft'``).
    transform : callable, optional
        Applied to the feature of every item before it is returned.
    max_items : int or None, optional
        Upper bound on the reported dataset length. Defaults to 32000, the
        previously hard-coded cap; ``None`` exposes every sample.
    """

    def __init__(self, split, Format, transform=None, max_items=32000):
        self.x = split[Format]
        self.y = split['y']
        self.transform = transform
        self.max_items = max_items

    def __len__(self):
        if self.max_items is None:
            return len(self.x)
        return min(self.max_items, len(self.x))

    @staticmethod
    def _item_at(values, idx):
        """Return the ``idx``-th element by position for pandas and array inputs."""
        # pandas objects are read positionally, so the lookup no longer
        # depends on the frame carrying a 0..N-1 index.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __getitem__(self, idx):
        x = self._item_at(self.x, idx)
        # The label is reduced to its first element as a float. For labels
        # stored as an (N, 1, 1) array this equals the former
        # `self.y[idx, 0, 0]`; it also works when `y` is a pandas Series, where
        # that tuple indexing raised a KeyError.
        y = np.asarray(self._item_at(self.y, idx), dtype=float).reshape(-1)[0]
        if self.transform:
            x = self.transform(x)
        return x, y

In [17]:
# Shape of a single clip (row labelled 1 of the test clips).
test_clip.loc[1, 'stft'].shape

(128, 513)

In [18]:
# Augmentation pipeline applied to every item served by the datasets.
transforms = Compose([
    # ndarray / PIL image -> torch tensor. Per the torchvision docs the [0, 1]
    # rescaling only happens for uint8 / PIL input; other dtypes are converted
    # without scaling.
    ToTensor(),
    # NOTE(review): the first argument of RandomAffine is `degrees`, so this is
    # a rotation range of exactly 0.05 degrees - confirm that a translation
    # (translate=(0.05, 0.05)) was not what was meant.
    RandomAffine(degrees=(0.05, 0.05)),
    RandomHorizontalFlip(p=0.5),
    RandomVerticalFlip(p=0.5),
])

In [19]:
# Creation of torch-suited dataset objects (change the string 'stft' to select another feature column).
# Random augmentation is a training-time regulariser only: validation and test
# items get just the tensor conversion, so their metrics are measured on
# unaltered data and are repeatable.
eval_transforms = Compose([ToTensor()])
training_dataset = DataAudio(training_clip, 'stft', transforms)
validation_dataset = DataAudio(validation_clip, 'stft', eval_transforms)
test_dataset = DataAudio(test_clip, 'stft', eval_transforms)

In [20]:
# Creation of the dataloaders
batch_size = 64
# os.cpu_count() may return None; fall back to loading in the main process.
num_workers = os.cpu_count() or 0
# Training batches are shuffled so that consecutive, overlapping clips cut from
# the same recording do not end up in the same batch; evaluation order stays fixed.
training_dataloader = DataLoader(training_dataset, batch_size, shuffle=True, num_workers=num_workers)
validation_dataloader = DataLoader(validation_dataset, batch_size, shuffle=False, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=num_workers)

In [None]:
# Remember to add dropout layers 

class NNET1(nn.Module):
    """Three-layer CNN over ``(batch, 1, frames, 513)`` spectrogram clips.

    The first convolution spans all 513 columns, so after it only the frame
    axis is left to convolve and pool. The feature map is then summarised per
    channel (max + mean over the remaining positions) into a 256-vector that
    a three-layer MLP maps to 10 class probabilities.

    For a ``(B, 1, 128, 513)`` input the convolutional stack produces a
    ``(B, 256, 26, 1)`` map; the previous fixed ``(26, 1)`` pooling window
    therefore covered exactly the whole map, and the global pooling used here
    returns the same values while also accepting other clip lengths.

    NOTE(review): the network ends with ``Softmax``, i.e. it outputs
    probabilities. Make sure the training loss expects probabilities rather
    than raw logits.
    """

    def __init__(self):
        super(NNET1, self).__init__()

        self.conv = nn.Sequential(
            # Kernel width 513 collapses the second spatial axis to size 1.
            nn.Conv2d(in_channels=1, out_channels=128, kernel_size=(4, 513)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(4, 1)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(4, 1)),
            nn.ReLU(),
        )

        # The classifier input is the 256-channel pooled summary.
        self.fc = nn.Sequential(
            nn.Linear(256, 300),
            nn.ReLU(),
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 10),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        """Return class probabilities of shape ``(batch, 10)``."""
        x = self.conv(x)
        # Global max + mean over the spatial axes: always (batch, 256), so no
        # leftover positions can leak into the batch axis through a reshape.
        x = x.amax(dim=(2, 3)) + x.mean(dim=(2, 3))
        return self.fc(x)

In [None]:
class NNET2(nn.Module):
    """Residual CNN over ``(batch, 1, frames, 513)`` spectrogram clips.

    ``c1`` convolves across all 513 columns, leaving only the frame axis.
    ``c2`` (padding 2) lengthens that axis by one and ``c3`` (padding 1)
    shortens it by one again, so the output of ``c3`` has the same shape as
    the output of ``c1`` and the two are summed as a skip connection. The sum
    is summarised per channel (max + mean over the remaining positions) into
    a 256-vector that a three-layer MLP maps to 10 class probabilities.

    For a ``(B, 1, 128, 513)`` input the summed map is ``(B, 256, 125, 1)``;
    the previous fixed ``(125, 1)`` pooling window covered exactly the whole
    map, and the global pooling used here returns the same values while also
    accepting other clip lengths.

    NOTE(review): the network ends with ``Softmax``, i.e. it outputs
    probabilities. Make sure the training loss expects probabilities rather
    than raw logits.
    """

    def __init__(self):
        super(NNET2, self).__init__()

        self.c1 = nn.Sequential(
            # Kernel width 513 collapses the second spatial axis to size 1.
            nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(4, 513)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.c2 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(4, 1), padding=(2, 0)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.c3 = nn.Sequential(
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=(4, 1), padding=(1, 0)),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )

        self.fc = nn.Sequential(
            nn.Linear(256, 300),
            nn.ReLU(),
            nn.Linear(300, 150),
            nn.ReLU(),
            nn.Linear(150, 10),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        """Return class probabilities of shape ``(batch, 10)``."""
        c1 = self.c1(x)
        c3 = self.c3(self.c2(c1))
        # Skip connection around c2/c3.
        x = c1 + c3
        # Global max + mean over the spatial axes: always (batch, 256), so no
        # leftover positions can leak into the batch axis through a reshape.
        x = x.amax(dim=(2, 3)) + x.mean(dim=(2, 3))
        return self.fc(x)




In [None]:
# Smoke test: push one random clip shaped like a real input
# (batch=1, channel=1, 128 frames, 513 bins) through an untrained NNET2.
x = torch.randn((1, 1, 128, 513))

model = NNET2()
output = model(x)

# One row of 10 class scores is expected.
print(output.shape)