In [None]:
import os
from glob import glob
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import torch
from torch import nn
from torchsummary import summary
from torch.utils.data import DataLoader
import re
import pandas as pd
import numpy as np


In [None]:
from google.colab import drive

# Make the Google Drive contents reachable under the relative mount point
# 'drive' of the Colab runtime.
drive.mount('drive')

# Gather every .mp3 that sits exactly one directory below the Data folder.
audio_files = glob(
    pathname='drive/My Drive/AI certification with Zaka/Capstone Project/Data/*/*.mp3'
)

Mounted at drive


In [None]:
# Build the annotations table (one row per audio file) and persist it as CSV.
# The language label is inferred from the first language name that occurs in
# the file path.
language_pattern = re.compile("Portuguese|Arabic|Chinese")

labels = []
for audio_path in audio_files:
  match = language_pattern.search(audio_path)
  if match is None:
    # Fail loudly instead of producing a label column whose length no longer
    # matches the number of files.
    raise ValueError(f"Cannot infer a language label from path: {audio_path!r}")
  # Exactly one label per file, even if the name appears several times in the
  # path (e.g. in both the folder and the file name).
  labels.append(match.group(0))

metadata = pd.DataFrame({'filenames': audio_files, 'labels': labels})

# index=False keeps the CSV to the two data columns. With the default
# (index=True) the row number is written as the first column, which shifts
# 'filenames' and 'labels' one position to the right for any reader that
# selects columns by position.
metadata.to_csv('metadata.csv', index=False)

In [None]:
class Language_identification(Dataset):
  """Map-style dataset yielding (transformed audio signal, class index) pairs.

  Each row of the annotations CSV describes one audio file. On access the file
  is loaded, resampled to ``target_sample_rate``, mixed down to one channel,
  cut or zero-padded on the right to exactly ``num_samples`` samples and then
  passed through ``transformation``.

  The loaded signal is treated as a 2-D tensor whose first axis is the channel
  and whose second axis is time (this is how the helpers index it).

  Args:
    annotations_file: Path or file-like object accepted by ``pd.read_csv``.
      The columns ``filenames`` and ``labels`` are used when present;
      otherwise the first and second columns are used.
    transformation: ``nn.Module``-like callable applied to the prepared signal.
      It is moved to ``device``.
    target_sample_rate: Sample rate every signal is resampled to.
    num_samples: Exact number of samples per item after cutting/padding.
    device: Device the signal and the transformation are moved to.

  Attributes:
    classes: Sorted list of distinct label strings, or ``None`` when the label
      column is already numeric.
    class_to_idx: Mapping from label string to integer class index, or
      ``None`` when the label column is already numeric.
  """

  def __init__(self, annotations_file, transformation, target_sample_rate, num_samples, device):
    self.annotations = pd.read_csv(annotations_file)
    self.device = device
    self.transformation = transformation.to(self.device)
    self.target_sample_rate = target_sample_rate
    self.num_samples = num_samples

    # String labels cannot be collated into a tensor by the DataLoader, so
    # they are encoded as integer class indices (sorted for a stable mapping).
    label_column = self._column("labels", 1)
    if pd.api.types.is_numeric_dtype(label_column):
      self.classes = None
      self.class_to_idx = None
    else:
      self.classes = sorted(label_column.astype(str).unique())
      self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}

  def __len__(self):
    """Return the number of annotated audio files."""
    return len(self.annotations)

  def __getitem__(self, index):
    """Load, normalise and transform the audio file at ``index``.

    Returns:
      Tuple ``(signal, label)`` where ``signal`` is the output of
      ``transformation`` and ``label`` is the class index of the file.
    """
    audio_sample_path = self._get_audio_sample_path(index)
    label = self._get_audio_sample_label(index)
    signal, sr = torchaudio.load(audio_sample_path)
    signal = signal.to(self.device)
    signal = self._resample_if_necessary(signal, sr)
    signal = self._mix_down_if_necessary(signal)
    signal = self._cut_if_necessary(signal)
    signal = self._right_pad_if_necessary(signal)
    signal = self.transformation(signal)
    return signal, label

  def _column(self, name, position):
    """Return the annotations column called ``name``, else the one at ``position``.

    Looking the column up by name keeps the dataset working whether or not
    the CSV was written with a leading row-index column.
    """
    if name in self.annotations.columns:
      return self.annotations[name]
    return self.annotations.iloc[:, position]

  def _cut_if_necessary(self, signal):
    """Truncate ``signal`` to ``num_samples``; shorter signals pass through."""
    if signal.shape[1] > self.num_samples:
      signal = signal[:, :self.num_samples]
    # Always return the signal: returning only inside the branch would hand
    # ``None`` to the next processing step for signals that need no cut.
    return signal

  def _right_pad_if_necessary(self, signal):
    """Zero-pad ``signal`` on the right up to ``num_samples`` samples."""
    length_signal = signal.shape[1]
    if length_signal < self.num_samples:
      num_missing_samples = self.num_samples - length_signal
      # (left, right) padding of the last dimension only.
      last_dim_padding = (0, num_missing_samples)
      signal = torch.nn.functional.pad(signal, last_dim_padding)
    return signal

  def _get_audio_sample_path(self, index):
    """Return the file path stored for row ``index``."""
    return self._column("filenames", 0).iloc[index]

  def _get_audio_sample_label(self, index):
    """Return the class index for row ``index``.

    Numeric label columns are returned unchanged; string labels are mapped
    through ``class_to_idx``.
    """
    label = self._column("labels", 1).iloc[index]
    if self.class_to_idx is None:
      return label
    return self.class_to_idx[str(label)]

  def _resample_if_necessary(self, signal, sr):
    """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
    if sr != self.target_sample_rate:
      resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
      # The signal may already live on an accelerator; the resampler has to
      # be on the same device as its input.
      resampler = resampler.to(signal.device)
      signal = resampler(signal)
    return signal

  def _mix_down_if_necessary(self, signal):
    """Average multi-channel audio into one channel, keeping the channel axis."""
    if signal.shape[0] > 1:
      signal = torch.mean(signal, dim=0, keepdim=True)
    return signal

In [None]:
class CNNNetwork(nn.Module):
    """Four-stage convolutional classifier producing scores for 3 classes.

    Every stage is Conv2d(kernel 3, stride 1, padding 2) -> ReLU ->
    MaxPool2d(2). The flattened features feed a single linear layer.

    ``forward`` returns raw, unnormalised logits. That is the input
    ``nn.CrossEntropyLoss`` expects (it applies log-softmax internally);
    feeding it already-softmaxed values normalises twice and flattens the
    gradients. To obtain class probabilities, apply ``self.softmax`` to the
    output of ``forward``.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and layer order inside each stage are unchanged, so
        # state_dict keys (e.g. "conv1.0.weight") stay the same.
        self.conv1 = self._conv_block(1, 48)
        self.conv2 = self._conv_block(48, 64)
        self.conv3 = self._conv_block(64, 128)
        self.conv4 = self._conv_block(128, 128)

        self.flatten = nn.Flatten()
        # 128 channels x 5 x 4: each stage grows a spatial size by 2 (padding 2
        # with a 3x3 kernel) and then floor-halves it, so a 64 x 44 input ends
        # up as 5 x 4. Inputs of another spatial size will not fit this layer.
        self.linear = nn.Linear(128 * 5 * 4, 3)
        # Not used by forward(); kept for converting logits to probabilities.
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv2d -> ReLU -> MaxPool2d stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels, out_channels=out_channels,
                kernel_size=3,
                stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2))

    def forward(self, input_data):
        """Return class logits of shape (batch, 3) for ``input_data``.

        Args:
            input_data: 4-D tensor (batch, 1, height, width); conv1 takes a
                single input channel.
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        return logits

In [None]:
# --- Training hyper-parameters ---
BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 5e-3

# --- Data configuration ---
# CSV with one row per audio file, read by the dataset class.
ANNOTATIONS_FILE = "drive/My Drive/AI certification with Zaka/Capstone Project/metadata.csv"
# Sample rate every clip is resampled to.
SAMPLE_RATE = 22050
# Number of samples kept per clip; equal to SAMPLE_RATE, i.e. one second.
NUM_SAMPLES = SAMPLE_RATE


def create_data_loader(train_data, batch_size, shuffle=True):
    """Wrap ``train_data`` in a DataLoader for training.

    Args:
        train_data: Dataset to draw samples from.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the samples every epoch (default). Without
            shuffling, samples are served in dataset order, so data that is
            stored grouped by class produces single-class batches. Pass
            ``False`` to keep the dataset order.

    Returns:
        The configured ``DataLoader``.
    """
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=shuffle)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module being trained; it is switched to training mode.
        data_loader: Iterable of ``(inputs, targets)`` tensor pairs.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            tensor.
        optimiser: Optimiser updating the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The loss of the last batch as a float (the value that is printed),
        or ``None`` when the loader yielded no batches.
    """
    # Make sure train-time behaviour is active even if the model was
    # previously put in eval mode.
    model.train()

    loss = None
    # Named ``inputs``/``targets`` so the builtin ``input`` is not shadowed.
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # calculate loss
        prediction = model(inputs)
        loss = loss_fn(prediction, targets)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    if loss is None:
        # An empty loader previously crashed on the unbound ``loss`` variable.
        print("loss: n/a (data loader yielded no batches)")
        return None

    last_loss = loss.item()
    print(f"loss: {last_loss}")
    return last_loss


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    """Train ``model`` for ``epochs`` passes over ``data_loader``.

    Prints a 1-based epoch header before each pass, delegates the pass itself
    to ``train_single_epoch``, prints a separator line after it and a final
    message once every epoch has run.
    """
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}")
        train_single_epoch(
            model=model,
            data_loader=data_loader,
            loss_fn=loss_fn,
            optimiser=optimiser,
            device=device,
        )
        print("---------------------------")
    print("Finished training")


In [None]:
# NOTE(review): the first argument here is the glob pattern string itself, not
# a Dataset and not the list of matched files. A str is indexable, so iterating
# this loader would presumably yield batches of single characters of the
# pattern rather than audio samples - confirm the intent. The name
# `train_dataloader` is reassigned from a Language_identification dataset in
# later cells.
train_dataloader = DataLoader('drive/My Drive/AI certification with Zaka/Capstone Project/Data/*/*.mp3', batch_size=128)

In [None]:
if __name__ == "__main__":
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device}")

    # Feature extractor applied by the dataset to every prepared signal.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
    )

    # Dataset over the annotated audio files, then the loader batching it.
    SLID = Language_identification(
        annotations_file=ANNOTATIONS_FILE,
        transformation=mel_spectrogram,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
        device=device,
    )
    train_dataloader = create_data_loader(SLID, BATCH_SIZE)

    # Build the network on the selected device and show its layers.
    cnn = CNNNetwork().to(device)
    print(cnn)

    # Loss function and optimiser used for training.
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

    # Run the full training schedule.
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # Persist only the learned parameters (the state dict), not the module.
    torch.save(cnn.state_dict(), "feedforwardnet.pth")
    print("Trained feed forward net saved at feedforwardnet.pth")

Using cpu
CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 48, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(48, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1


TypeError: ignored

In [None]:
    # instantiating our dataset object and create data loader
# Mel-spectrogram feature extractor handed to the dataset below.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

# CPU-only dataset instance for inspecting samples outside the training run.
usd = Language_identification(
    annotations_file=ANNOTATIONS_FILE,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    device='cpu',
)



In [None]:
import soundfile
train_dataloader = DataLoader(usd, batch_size=128)

# Smoke test: pull a single batch to confirm that loading and collation work.
# Only the first batch is inspected, and only the shape of the inputs is
# printed, so the cell neither walks the whole dataset nor floods the output.
# The loop variables avoid the name ``input`` so the builtin stays usable in
# later cells.
for batch_inputs, batch_targets in train_dataloader:
    print(batch_inputs.shape)
    print(batch_targets)
    break


TypeError: ignored

In [None]:
# Experiment: stream a remote resource over HTTP and hand the raw response
# stream to torchaudio for decoding as MP3, without saving it to disk first.
# NOTE(review): the URL is a Google Drive *folder* share link
# ("/drive/folders/..."), so the response body is presumably not MP3 data;
# that would be consistent with the RuntimeError recorded for this cell.
# Confirm, and point this at a direct link to a single audio file instead.
import requests 
with requests.get('https://drive.google.com/drive/folders/1yaYgffyWnbfiNbqyJnR17ZJvPIA6P0G3?usp=share_link', stream=True) as response:
     # The format is passed explicitly; presumably because a stream has no
     # file extension to infer it from - TODO confirm.
     y,sr = torchaudio.load(response.raw, format = 'mp3')

RuntimeError: ignored

In [None]:
import subprocess

# Ask the ffmpeg executable on PATH for its version banner. check=True raises
# CalledProcessError on a non-zero exit status; the captured standard output
# (bytes) is the value of the cell.
subprocess.run(['ffmpeg', '-version'], stdout=subprocess.PIPE, check=True).stdout

b'ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers\nbuilt with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)\nconfiguration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

In [None]:
# Show which dataset object the current `train_dataloader` wraps.
train_dataloader.dataset

<__main__.Language_identification at 0x7f6d474a5b10>

In [None]:
# Select the "soundfile" backend for torchaudio I/O.
# NOTE(review): this cell sits after the cells that call torchaudio.load, so on
# a fresh top-to-bottom run it cannot influence them - consider moving it next
# to the imports. Also verify that the installed torchaudio version still
# provides set_audio_backend - TODO confirm.
torchaudio.set_audio_backend("soundfile")
