In [39]:
import numpy as np
import os
import librosa
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

from sklearn.preprocessing import StandardScaler

# Installing librosa in a conda environment:
# open the Anaconda prompt for the target environment and run the following commands in order.

# First:  conda install -c numba numba

# Then:   conda install -c conda-forge librosa

In [40]:
# List the audio files of each split.
# The directories are built with os.path.join so the path works on every OS and
# no longer relies on the invalid "\S" escape sequence in 'Train\Sound'.
filelist_train = os.listdir(os.path.join('Train', 'Sound'))
filelist_test = os.listdir(os.path.join('Test', 'Sound'))

# Read the file names into pandas (one unnamed column, labelled 0)
train_df = pd.DataFrame(filelist_train)
test_df = pd.DataFrame(filelist_test)
# NOTE(review): the validation frame is built from the *training* file list, so it
# holds the same files as train_df — confirm whether a separate validation split was intended.
val_df = pd.DataFrame(filelist_train)

In [41]:
# Rename the single auto-generated column (0) to 'file'
train_df = train_df.rename(columns={0: 'file'})
test_df = test_df.rename(columns={0: 'file'})
val_df = val_df.rename(columns={0: 'file'})

# Remove the macOS '.DS_Store' entry (if any), then shuffle and reset the index.
# The entry is filtered by name: dropping a hardcoded row label (16) removed whatever
# audio file happened to sit there and raised KeyError on listings shorter than 17 rows.
# NOTE: sample(frac=1) shuffles without a fixed seed, so the row order differs between runs.
train_df = train_df[train_df['file'] != '.DS_Store'].sample(frac=1).reset_index(drop=True)

test_df = test_df[test_df['file'] != '.DS_Store'].sample(frac=1).reset_index(drop=True)

val_df = val_df[val_df['file'] != '.DS_Store'].sample(frac=1).reset_index(drop=True)

In [42]:
# The speaker id is the part of the file name before the first hyphen
# (e.g. '4831-18525-0002.flac' -> '4831'); store it in a new 'speaker' column
# of each of the three frames.
for frame in (train_df, test_df, val_df):
    frame['speaker'] = [name.split('-')[0] for name in frame['file']]

In [43]:
# Print the first rows of each frame: train, then test, then validation
for frame in (train_df, test_df, val_df):
    print(frame.head())



                    file speaker
0   4831-18525-0002.flac    4831
1   6123-59150-0003.flac    6123
2  3663-172005-0000.flac    3663
3   4570-14911-0001.flac    4570
4   116-288045-0000.flac     116
                    file speaker
0   5442-32873-0001.flac    5442
1   7902-96591-0000.flac    7902
2  8188-269288-0002.flac    8188
3  3997-180294-0000.flac    3997
4   6432-63722-0002.flac    6432
                    file speaker
0   4831-18525-0002.flac    4831
1  1686-142278-0003.flac    1686
2   5543-27761-0000.flac    5543
3   5849-50873-0002.flac    5849
4   4570-14911-0003.flac    4570


In [35]:
def extract_features(files):
    # Sets the name to be the path to where the file is in my computer
    file_name = os.path.join(os.path.abspath('Train\Sound')+'/'+str(files.file))
    # Loads the audio file as a floating point time series and assigns the default sample rate
    # Sample rate is set to 22050 by default
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    # Generate Mel-frequency cepstral coefficients (MFCCs) from a time series 
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    # Generates a Short-time Fourier transform (STFT) to use in the chroma_stft
    stft = np.abs(librosa.stft(X))
        # Computes a chromagram from a waveform or power spectrogram.
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    # Computes a mel-scaled spectrogram.
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    # Computes spectral contrast
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    # Computes the tonal centroid features (tonnetz)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
    sr=sample_rate).T,axis=0)
    return (mfccs, chroma, mel, contrast, tonnetz)

In [36]:
# Run the extractor on every training row; each result is the 5-tuple
# (mfccs, chroma, mel, contrast, tonnetz) returned by extract_features
train_features = train_df.apply(extract_features, axis=1)

# Join the five feature arrays of each file, in tuple order, into one flat vector
features_train = [np.concatenate(feats, axis=0) for feats in train_features]

# Stack the per-file vectors into a single array
X_train = np.array(features_train)

 -0.00015948] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.00079711] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 -0.00027445] as keyword args. From version 0.10 passing these as

In [37]:
# Run the extractor on every validation row; each result is the 5-tuple
# (mfccs, chroma, mel, contrast, tonnetz) returned by extract_features
val_features = val_df.apply(extract_features, axis=1)

# Join the five feature arrays of each file, in tuple order, into one flat vector
features_val = [np.concatenate(feats, axis=0) for feats in val_features]

# Stack the per-file vectors into a single array
X_val = np.array(features_val)

  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.        ] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 -7.3263953e-07  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  6.7570909e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  1.1889282e-04  1.1200479e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  0.       

In [44]:
# Run the extractor on every test row; each result is the 5-tuple
# (mfccs, chroma, mel, contrast, tonnetz) returned by extract_features
test_features = test_df.apply(extract_features, axis=1)

# Join the five feature arrays of each file, in tuple order, into one flat vector
features_test = [np.concatenate(feats, axis=0) for feats in test_features]

# Stack the per-file vectors into a single array
X_test = np.array(features_test)

  return f(*args, **kwargs)


FileNotFoundError: [Errno 2] No such file or directory: 'e:\\skola\\2021-2022\\LP3\\D7041E\\D7041E\\project\\Train\\Sound/5442-32873-0001.flac'

In [None]:
# Feature matrix for training, built from the per-file feature vectors
X_train = np.array(features_train)
# Targets are the speaker ids of the train and validation frames
y_train, y_val = (np.array(frame['speaker']) for frame in (train_df, val_df))

In [None]:

# One-hot encode the speaker labels.
# The encoder is fitted on the training labels only and that same mapping is
# reused for validation: refitting on y_val could assign different class indices
# (and a different one-hot width) whenever the two label sets differ.
# lb.transform raises ValueError if y_val contains a speaker unseen in training.
lb = LabelEncoder()
y_train = to_categorical(lb.fit_transform(y_train))
y_val = to_categorical(lb.transform(y_val), num_classes=len(lb.classes_))

In [None]:
# Standardize the features: the scaler is fitted on the training matrix only,
# then the same fitted scaler is applied to the train, validation and test matrices.
# NOTE: the matrices are overwritten, so re-running this cell scales them again.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_val, X_test = ss.transform(X_val), ss.transform(X_test)