<a href="https://colab.research.google.com/github/CodingTomo/TE-UrbanSound8K/blob/master/MFCC_extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install torchaudio

In [None]:
import os
import pandas as pd
import torchaudio
import torch
import json
import numpy as np
import librosa

In [None]:
# Metadata table of the dataset; the extraction loops below look up each
# clip's classID in it through the slice_file_name column.
metadata = pd.read_csv(
    '/content/drive/Shared drives/TE-UrbanSound/UrbanSound8K/metadata/UrbanSound8K.csv'
)

# Sampling rate (Hz) every clip is brought to before features are computed.
SAMPLE_RATE = 8000
# FFT size and hop length forwarded to the MFCC computations.
MELKWARGS = dict(n_fft=512, hop_length=128)
# Number of MFCC coefficients requested per frame.
N_MFCC = 13
# Number of rows in the metadata table; denominator of the progress display.
N_WAVEFORM = len(metadata)

In [None]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        NumPy integers become ``int``, NumPy floating-point scalars become
        ``float`` and arrays become (nested) lists. Every other object is
        delegated to ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

In [None]:
def stereo_to_mono(waveform):
    """Collapse the first dimension of ``waveform`` to size 1 by averaging.

    The mean is taken over dimension 0 and that dimension is kept, so an
    input of shape ``(C, N)`` yields an output of shape ``(1, N)``.
    """
    return waveform.mean(dim=0, keepdim=True)

In [None]:
def resampling_8k(waveform, sr, SAMPLE_RATE):
    """Resample ``waveform`` from ``sr`` to ``SAMPLE_RATE``.

    Despite the name, the target rate is whatever ``SAMPLE_RATE`` the caller
    passes in. A fresh ``torchaudio.transforms.Resample`` is built per call.
    """
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
    return resampler(waveform)


In [None]:
# Use the first GPU when one is available, otherwise run on the CPU. The
# former unconditional `with torch.cuda.device(0):` wrapper was dropped
# because it requires CUDA and therefore defeated this fallback.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The MFCC settings are the same for every clip, so the transform is built
# once and placed on the selected device instead of being recreated per file.
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=SAMPLE_RATE,
                                            n_mfcc=N_MFCC,
                                            melkwargs=MELKWARGS).to(device)

# Parallel lists: file name, MFCC matrix and class id of every kept clip.
data = {'name': [],
        'mfcc': [],
        'label': []
        }
i = 0  # number of clips kept so far (drives the progress display)

for root, dirs, files in os.walk('/content/drive/Shared drives/TE-UrbanSound/UrbanSound8K'):
    for name in files:
        if name.endswith(".wav"):
            complete_path = os.path.join(root, name)
            waveform, sr = torchaudio.load(complete_path)
            # Down-mix and resample on the CPU: resampling_8k builds its own
            # transform there, so the waveform must not be moved before it.
            if waveform.shape[0] != 1:
                waveform = stereo_to_mono(waveform)
            if sr != SAMPLE_RATE:
                waveform = resampling_8k(waveform, sr, SAMPLE_RATE)
            # Only clips that are exactly 4 seconds long after resampling are
            # kept; all others are skipped.
            if waveform.shape[1] == SAMPLE_RATE * 4:
                # Tensor.to returns a new tensor rather than moving in place;
                # the original code discarded that result, so the selected
                # device was never actually used.
                waveform = waveform.to(device)
                feature = mfcc_transform(waveform)
                # Bring the result back to host memory before converting it:
                # .numpy() is only valid for CPU tensors.
                feature = feature.squeeze().cpu().numpy()
                data['name'].append(name)
                data['mfcc'].append(feature)
                # The label is the classID of the metadata row whose
                # slice_file_name matches this file.
                data['label'].append(metadata.loc[metadata['slice_file_name'] == name]['classID'].iloc[0])
                i = i + 1
                if i % 100 == 0:
                    print('-'*40)
                    print('Processed {:.0%}'.format(i/N_WAVEFORM))

In [None]:
# Write the collected names, MFCC matrices and labels to MFCC.json on the
# shared drive; NpEncoder converts the NumPy values to plain JSON types.
with open('/content/drive/Shared drives/TE-UrbanSound/UrbanSound8K/MFCC.json', 'w') as out_file:
    json.dump(data, out_file, indent=4, cls=NpEncoder)

In [None]:
# Second extraction pass, this time with librosa: for every kept clip it
# stores the MFCCs stacked with their first- and second-order deltas.
# NOTE: this rebinds `data`, replacing the dictionary built by the previous
# cell.
data = {'name': [],
        'MFCC': [],
        'label': []
        }
i = 0  # number of clips kept so far (drives the progress display)

for root, dirs, files in os.walk('/content/drive/Shared drives/TE-UrbanSound/UrbanSound8K'):
    for name in files:
        if name.endswith(".wav"):
            complete_path = os.path.join(root, name)
            # librosa down-mixes to mono and resamples to SAMPLE_RATE on load.
            waveform, sr = librosa.load(complete_path, sr=SAMPLE_RATE, mono=True)
            # Only clips that are exactly 4 seconds long are kept.
            if waveform.shape[0] == SAMPLE_RATE * 4:
                # The signal is passed as `y=`: librosa >= 0.10 made the audio
                # argument keyword-only, so the former positional call raises
                # TypeError there. The keyword form works on older releases too.
                feature = librosa.feature.mfcc(y=waveform,
                                               sr=sr,
                                               n_fft=MELKWARGS['n_fft'],
                                               hop_length=MELKWARGS['hop_length'],
                                               n_mfcc=N_MFCC)
                delta_feature = librosa.feature.delta(feature)
                delta_delta_feature = librosa.feature.delta(feature, order=2)
                # Stack along the first axis: MFCC rows, then delta rows, then
                # delta-delta rows.
                all_feature = np.concatenate((feature, delta_feature, delta_delta_feature))
                data['name'].append(name)
                data['MFCC'].append(all_feature)
                # The label is the classID of the metadata row whose
                # slice_file_name matches this file.
                data['label'].append(metadata.loc[metadata['slice_file_name'] == name]['classID'].iloc[0])
                i = i + 1
                if i % 100 == 0:
                    print('-'*40)
                    print('Processed {:.0%}'.format(i/N_WAVEFORM))