In [13]:
import os
import torch
import librosa
import torchaudio
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torchaudio.transforms as T
from sklearn.utils import shuffle
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split

In [2]:
torch.manual_seed(42)
Dataset_x=torch.tensor([])
Dataset_y=torch.tensor([])

mel_spec_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=44100,
    n_mels=100,
    n_fft=1024,
    hop_length=256,
    f_min=0,
    f_max=15000
)

major_path="./Audio_Files/Major/"
for filename in os.listdir(major_path):
    f = os.path.join(major_path, filename)
    waveform,sr=torchaudio.load(f)
    #waveform = torchaudio.functional.lowpass_biquad(waveform,sr,5000)
    waveform=mel_spec_transform(waveform)
    waveform=torch.nn.functional.pad(waveform, (0, 400-waveform.shape[2]), mode='constant', value=0)
    Dataset_x=torch.cat((Dataset_x,waveform))
    Dataset_y=torch.cat((Dataset_y,torch.tensor([[1]])))

minor_path="./Audio_Files/Minor/"
for filename in os.listdir(minor_path):
    f = os.path.join(minor_path, filename)
    waveform,sr=torchaudio.load(f)
    #waveform = torchaudio.functional.lowpass_biquad(waveform,sr,5000)
    waveform=mel_spec_transform(waveform)
    waveform=torch.nn.functional.pad(waveform, (0, 400-waveform.shape[2]), mode='constant', value=0)
    Dataset_x=torch.cat((Dataset_x,waveform))
    Dataset_y=torch.cat((Dataset_y,torch.tensor([[0]])))
Dataset_x,Dataset_y=shuffle(Dataset_x,Dataset_y)

In [3]:
train_x, rem_x, train_y,rem_y = train_test_split(Dataset_x,Dataset_y, train_size=0.7,shuffle=True)
val_x,test_x,val_y,test_y=train_test_split(rem_x,rem_y, train_size=0.5,shuffle=True)
train_x = train_x.unsqueeze(1)
val_x = val_x.unsqueeze(1)

Time_transform=torchaudio.transforms.TimeMasking(time_mask_param=2)
Freq_transform=torchaudio.transforms.FrequencyMasking(freq_mask_param=5)
Volume_transform=torchaudio.transforms.Vol(gain=0.5)

for i in range(len(Dataset_x)):
    a=Time_transform(Dataset_x[i].unsqueeze(0))
    b=Freq_transform(Dataset_x[i].unsqueeze(0))
    c=Volume_transform(Dataset_x[i].unsqueeze(0))

    train_x=torch.cat((train_x,a.unsqueeze(1)),dim=0)
    train_y=torch.cat((train_y,Dataset_y[i].unsqueeze(0)),dim=0)

    train_x=torch.cat((train_x,b.unsqueeze(1)),dim=0)
    train_y=torch.cat((train_y,Dataset_y[i].unsqueeze(0)),dim=0)

    train_x=torch.cat((train_x,c.unsqueeze(1)),dim=0)
    train_y=torch.cat((train_y,Dataset_y[i].unsqueeze(0)),dim=0)

train_x,train_y=shuffle(train_x,train_y)
mean = train_x.mean()
std = train_x.std()
train_x = (train_x - mean) / std
val_x = (val_x - mean) / std

train_x.shape


torch.Size([3178, 1, 100, 400])

In [19]:
def weight_init(m):
    if isinstance(m,nn.Conv2d):
        torch.nn.init.kaiming_uniform_(m.weight)
        m.bias.data.fill_(0)
    if isinstance(m,nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0)

class CNN(nn.Module):
    def __init__(self,shape):
        super(CNN,self).__init__()
        self.conv1=nn.Sequential(
            nn.Conv2d(
            in_channels=1,
            out_channels=32,
            kernel_size=5,
            padding=3,
            stride=2
            ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3),
            nn.Dropout(p=0.5)
        )
        self.conv1.apply(weight_init)
        self.conv2=nn.Sequential(
            nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=5,
            padding=3,
            stride=2
            ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3),
            nn.Dropout(p=0.5)
        )
        self.conv2.apply(weight_init)
        self.conv3=nn.Sequential(
            nn.Conv2d(
            in_channels=64,
            out_channels=128,
            kernel_size=5,
            padding=3,
            stride=2
            ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3),
            nn.Dropout(p=0.5)
        )
        self.conv4=nn.Sequential(
            nn.Conv2d(
            in_channels=128,
            out_channels=128,
            kernel_size=5,
            padding=3,
            stride=2
            ),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(p=0.5)
        )
        self.conv4.apply(weight_init)
        conv_size=self._get_conv_out(shape)
        self.linear1=nn.Linear(conv_size,128)
        self.linear1.apply(weight_init)
        self.linear2=nn.Linear(128,1)
        self.linear2.apply(weight_init)

    def _get_conv_out(self,shape):
        out = self.conv1(torch.zeros(1,*shape))
        out=self.conv2(out)
        out=self.conv3(out)
        out=self.conv4(out)
        result = int(np.prod(out.size()))
        return result
    
    def forward(self,x):
        x=self.conv1(x)
        x=self.conv2(x)
        x=self.conv3(x)
        x=self.conv4(x)
        x=x.view(x.size(0),-1)
        x=self.linear1(x)
        x=self.linear2(x)
        x=torch.sigmoid(x)
        return x
    
    


In [23]:
epochs=20
batch=227
device=torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
model=CNN(train_x[0].shape)
model=model.to(device)
criterion=nn.BCELoss()
optimizer=optim.Adam(model.parameters(),lr=0.0003,weight_decay=10e-4)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

train_loss=[1]
train_accu=[0]
val_loss=[1]
val_accu=[0]
for epoch in range(epochs):
    l=0
    a=0
    batch_num=0
    for i in range(0,len(train_x),batch): 
        model.train()
        batch_num+=1
        x_batch = train_x[i:i+batch] 
        y_batch = train_y[i:i+batch]
        out=model(x_batch)
        loss=criterion(out,y_batch)
        l+=loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        a+=(torch.round(out)==y_batch).sum().float()
    scheduler.step()
    train_loss.append(l/batch_num)#.detach().numpy()
    train_accu.append(a/len(train_x))#.detach().numpy()
    
    model.eval()
    with torch.no_grad():
        out=model(val_x)
        loss=criterion(out,val_y)
        val_accu.append((torch.round(out)==val_y).sum().float()/len(val_x))#.detach().numpy()
        val_loss.append(loss.item()/len(val_x))#.detach().numpy()
    print(train_loss[-1],train_accu[-1],val_loss[-1],val_accu[-1])
plt.plot(range(epochs+1),train_loss)
plt.plot(range(epochs+1),val_loss)
plt.legend(['Train Loss','Test Loss'])
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Train & Test Loss')
plt.show()

plt.plot(range(epochs+1),train_accu)
plt.plot(range(epochs+1),val_accu)
plt.legend(['Train Accuracy','Test Accuracy'])
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Train & Test Accuracy')
plt.show()


0.9605029310498919 tensor(0.5242) 0.0052399450494337455 tensor(0.5581)
0.8628515388284411 tensor(0.5321) 0.005289477895396625 tensor(0.5581)
0.786099123103278 tensor(0.5400) 0.0052813961524371955 tensor(0.5581)
0.7525622163500104 tensor(0.5387) 0.005305258802665296 tensor(0.5581)
0.7242042124271393 tensor(0.5472) 0.005314990069515022 tensor(0.5581)


KeyboardInterrupt: 

In [None]:
"""^ Run Above ^"""

In [None]:
librosa.display.specshow(Dataset_x[2].numpy(), sr=sr, x_axis='time')

In [None]:


def get_mfcc(file_path,sr=44100,fmax=5000):
    wav_data , sr = librosa.load(path=file_path,sr=sr,duration=4)
    wav_data=normalize(wav_data)
    mfcc=librosa.feature.mfcc(y=wav_data, sr=sr,fmax=fmax)
    return mfcc[0:20,0:190],sr
    

def loadedata(data_path):
    major_path=data_path+'Major/'
    minor_path=data_path+'Minor/'
    x_data=[]
    y_data=[]
    for filename in os.listdir(minor_path):
        f = os.path.join(minor_path, filename)
        if os.path.isfile(f):
            mfcc,sr=get_mfcc(f)
            print(mfcc.shape)
            #librosa.display.specshow(mfcc, sr=sr, x_axis='time')
            #plt.show()

class WAVDataset(Dataset):
    def __init__(self):
        self.data_path='./Audio_Files/'
        file_list=glob.glob(self.data_path+"*")
        print(file_list)

In [None]:
data_path='./Audio_Files/'
loadedata(data_path)

In [None]:
audio_path = './Audio_Files/Major/Major_0.wav'
x , sr = librosa.load(audio_path,sr=44100)
print(type(x), type(sr))

In [None]:
plt.figure(figsize=(14, 5))
librosa.display.waveshow(x, sr=sr)
plt.show()

In [None]:
fft = np.fft.fft(x)
spectrum = np.abs(fft)
left_spectrum = spectrum[:int(len(spectrum)/2)]
f = np.linspace(0, sr, len(spectrum))
left_f = f[:int(len(spectrum)/2)]
# plot spectrum
plt.figure(figsize=(14, 5))
plt.plot(left_f, left_spectrum, alpha=0.4)
plt.xlabel("Freq")
plt.ylabel("Mag")
plt.show()

In [None]:
#display Spectrogram
X = librosa.stft(x)
Xdb = librosa.amplitude_to_db(abs(X))
print(Xdb)
plt.figure(figsize=(14, 5))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz') 
#If to pring log of frequencies  
#librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='log')
plt.colorbar()

In [None]:
mfccs = librosa.feature.mfcc(y=x, sr=sr,fmax=5000)
print(mfccs.shape)
#Displaying  the MFCCs:
librosa.display.specshow(mfccs, sr=sr, x_axis='time')

In [None]:
from troch