In [2]:
# general imports
import os
import librosa
import numpy as np
import pandas as pd
import IPython.display as ipd 

# encoding import
from sklearn.preprocessing import LabelEncoder

# model imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense , Activation , Dropout

# evaluation metrics imports
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# models imports
import xgboost
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import neural_network

# visualization imports
import wave
from tqdm import tqdm
import seaborn as sns
import librosa.display
import matplotlib.pyplot as plt

In [4]:
# Locations of the speaker-metadata table and of the audio corpus
text_file= "../input/newerageset/AUDIOAZE/metadata.txt"
data_dir='../input/newerageset/AUDIOAZE/data'

# Parse the metadata table with pandas (default CSV settings) and echo it
f = pd.read_csv(filepath_or_buffer=text_file)
print(f)

In [5]:
# Pair the ids 1..364 with the Age value found at the same position of the
# metadata table (zip stops at the shorter of the two sequences).
age = f['Age']
folder_ids = [*range(1, 365)]
folder_dict_age = {
    speaker_id: speaker_age
    for speaker_id, speaker_age in zip(folder_ids, age)
}
print(folder_dict_age)

In [6]:
# Pair the ids 1..364 with the Gender value found at the same position of the
# metadata table (zip stops at the shorter of the two sequences).
gender = f['Gender']
folder_id = [*range(1, 365)]
folder_dict = {
    speaker_id: speaker_gender
    for speaker_id, speaker_gender in zip(folder_id, gender)
}
print(folder_dict)

In [7]:
# Count the metadata rows per gender and per age group and print one line each
print("---Age and Gender statistics---")
print("\n".join(
    caption + str(len(f[f[column] == wanted]))
    for caption, column, wanted in (
        ("Number of Female speakers: ", 'Gender', 'F'),
        ("Number of Male speakers: ", 'Gender', 'M'),
        ("Number of Teenager speakers: ", 'Age', 'teenager'),
        ("Number of Young aged speakers: ", 'Age', 'young aged'),
        ("Number of Middle aged speakers: ", 'Age', 'middle aged'),
        ("Number of Senior speakers: ", 'Age', 'senior'),
    )
))

In [9]:
# Collect every audio file whose name carries a known numeric speaker id,
# together with that speaker's gender and age label.

# names of the accepted files
file_names=[]

# gender label of each accepted file (parallel to file_names)
gender_vals=[]

# age label of each accepted file (parallel to file_names)
age_vals=[]

for dirname, _, filenames in os.walk(data_dir):
    for filename in filenames:
        # The speaker id is the second "_"-separated token of the file name.
        name_parts = filename.split("_")
        if len(name_parts) < 2:
            # No second token at all: not one of the corpus recordings.
            continue
        try:
            speaker_id = int(name_parts[1])
        except ValueError:
            # Non-numeric token ('sk', 'ar', 'c0', 'mp', ...): skip the file
            # instead of maintaining a hand-written list of such tokens.
            continue
        if speaker_id in folder_dict:
            file_names.append(filename)
            gender_vals.append(folder_dict[speaker_id])
            age_vals.append(folder_dict_age[speaker_id])

In [11]:
# Build a two-column table: gender label ('class') and file name
meta_data_gender = pd.DataFrame({
    'class': gender_vals,
    "file_name": file_names,
})

In [13]:
# Build a two-column table: age label ('class') and file name
meta_data_age = pd.DataFrame({
    'class': age_vals,
    "file_name": file_names,
})

In [14]:
# MFCC-based feature extraction with librosa
def features_extract(file):
    """Return the time-averaged MFCC vector of one audio file.

    The file is loaded with librosa (``res_type='kaiser_fast'``), 50 MFCCs
    are computed, and each coefficient is averaged over the transposed
    matrix's first axis. Despite the name of the intermediate step in
    earlier revisions, this is an average, not a rescaling.

    Parameters
    ----------
    file : path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    The 1-D array of per-coefficient means.
    """
    samples, rate = librosa.load(file, res_type='kaiser_fast')
    mfcc_matrix = librosa.feature.mfcc(y=samples, sr=rate, n_mfcc=50)
    return mfcc_matrix.T.mean(axis=0)


In [15]:
def gender_estimation():
    """Extract features for every listed file, labelled with the speaker's gender.

    Iterates over ``meta_data_gender`` (the table whose 'class' column holds
    gender labels). For each row, the speaker folder is taken from the second
    "_"-separated token of the file name, the file is located under
    ``data_dir/<folder>/`` and passed to ``features_extract``.

    Returns
    -------
    list of ``[feature_vector, gender_label]`` pairs, one per table row.
    """
    extracted = []
    # for each row of the gender table (not the age table)
    for index_num,row in tqdm(meta_data_gender.iterrows()):
        ids = row['file_name'].split("_")[1]
        # build the path data_dir/<speaker folder>/<file name>
        file_name = os.path.join(os.path.abspath(data_dir),ids +'/',str(row['file_name']))
        # gender label of this file
        final_class_labels = row['class']
        # averaged MFCC vector of this file
        data= features_extract(file_name)
        # keep feature and label together
        extracted.append([data,final_class_labels])
    return extracted
    

In [16]:
def age_estimation():
    """Extract features for every listed file, labelled with the speaker's age group.

    Walks the rows of ``meta_data_age``; each file is looked up under
    ``data_dir/<second "_" token of the file name>/`` and converted with
    ``features_extract``.

    Returns
    -------
    list of ``[feature_vector, age_label]`` pairs, one per table row.
    """
    corpus_root = os.path.abspath(data_dir)
    extracted = []
    for _, record in tqdm(meta_data_age.iterrows()):
        wav_name = record['file_name']
        speaker_folder = wav_name.split("_")[1] + '/'
        wav_path = os.path.join(corpus_root, speaker_folder, str(wav_name))
        extracted.append([features_extract(wav_path), record['class']])
    return extracted

In [17]:
# Run the extraction and tabulate the (feature, class) pairs it returns
extracted_gender = gender_estimation()
extracted_df_gen = pd.DataFrame(
    data=extracted_gender,
    columns=['feature', 'class'],
)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
# Run the extraction and tabulate the (feature, class) pairs it returns
extracted_age = age_estimation()
extracted_df_age = pd.DataFrame(
    data=extracted_age,
    columns=['feature', 'class'],
)

In [19]:
# Preview the first five rows of the age table
extracted_df_age.head(n=5)

In [22]:
# Preview the first five rows of the gender table
extracted_df_gen.head(n=5)

In [23]:
# Turn both columns of the gender table into NumPy arrays: the labels, and the
# stacked per-file feature vectors
y = np.array(extracted_df_gen.loc[:, 'class'].tolist())
x = np.array(extracted_df_gen.loc[:, 'feature'].tolist())

In [24]:
# Turn both columns of the age table into NumPy arrays: the labels, and the
# stacked per-file feature vectors
y1 = np.array(extracted_df_age.loc[:, 'class'].tolist())
x1 = np.array(extracted_df_age.loc[:, 'feature'].tolist())

In [28]:
# Fit an encoder on the labels in `y`, then expand the resulting integer
# codes into one-hot rows
le = LabelEncoder()
le.fit(y)
Y = to_categorical(le.transform(y))

In [29]:
# Encoder for the age labels. It gets its own name so that `le` -- the
# encoder fitted on `y`, which print_prediction uses to decode the network's
# output -- is not replaced by an encoder fitted on a different label set.
le_age = LabelEncoder()

# integer-encode the age labels, then expand them into one-hot rows
Y1 = to_categorical(le_age.fit_transform(y1))

In [30]:
# Hold out 20% of the (x, Y) samples for testing; the seed fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    x, Y, random_state=42, test_size=0.2
)

# Report how many samples landed on each side
print("Number of training samples = ", len(x_train))
print("Number of testing samples = ", len(x_test))

In [31]:
# Split the age data into train and test sets. The features must come from
# `x1` (built from the age table alongside `Y1`), not from `x`, so that every
# feature row stays paired with its own age label.
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, Y1, test_size=0.2, random_state = 42)

# print the details
print("Number of training samples = ", x_train1.shape[0])
print("Number of testing samples = ",x_test1.shape[0])

# MODELS

In [32]:
# The output layer is as wide as the one-hot label matrix
num_labels = Y.shape[1]

# Fully connected classifier over 50-value input vectors: three
# Dense(256) + ReLU + Dropout(0.5) stages, a Dense(128) layer, and a softmax
# output layer.
# NOTE(review): Dense(128) has no activation of its own -- confirm that this
# is intended.
model = Sequential([
    Dense(256, input_shape=(50,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    Activation('relu'),
    Dropout(0.5),
    Dense(128),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

In [33]:
# Categorical cross-entropy loss, Adam optimizer (constructor defaults),
# accuracy reported during training
model.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

In [34]:
# Training schedule
num_batch_size = 32
num_epochs = 15

# Train on the training split; the held-out split is only used to report
# validation metrics after each epoch
model.fit(
    x=x_train,
    y=y_train,
    epochs=num_epochs,
    batch_size=num_batch_size,
    validation_data=(x_test, y_test),
)

In [35]:
# function to extract the model input for a single audio file
def extract_feature(file_name):
    """Return the features of one audio file as a single-row 2-D array.

    Delegates to ``features_extract`` -- the same routine used to build the
    training data -- so that the features fed to the model at prediction
    time cannot drift away from the ones it was trained on.

    Parameters
    ----------
    file_name : path of the audio file.

    Returns
    -------
    ``np.ndarray`` holding the averaged MFCC vector as its only row, i.e. a
    batch of one sample.
    """
    return np.array([features_extract(file_name)])

In [36]:
# Load a sample recording. The samples go into their own variable so the
# label array `y` used by the encoding cell is not overwritten.
sample_audio, sr = librosa.load('../input/farah25/25 female.wav')
rmse = librosa.feature.rms(y=sample_audio)
# Display the frame-wise RMS energy (this is not an MFCC plot)
plt.figure(figsize=(15, 3))
librosa.display.specshow(rmse, sr=sr, x_axis='time')

In [None]:
def extract_features(file_name, label=None, output_path='dataset.csv'):
    """Append one row of summary audio features for ``file_name`` to a CSV file.

    The row holds, in order: the file name, the means of chroma-STFT, RMS
    energy, spectral centroid, spectral bandwidth, spectral roll-off and
    zero-crossing rate, the mean of each MFCC, and finally ``label`` when
    one is given.

    Parameters
    ----------
    file_name : path of the audio file; at most its first 30 seconds are
        loaded, down-mixed to mono.
    label : optional class label written as the last column.
    output_path : CSV file to append to.

    Returns
    -------
    None. The function only appends to ``output_path``.
    """
    # Local import: the notebook only imports csv in a later cell.
    import csv

    y, sr = librosa.load(file_name, mono=True, duration=30)
    # librosa.feature.rmse was renamed to librosa.feature.rms
    rms = librosa.feature.rms(y=y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    # Build the row as a list rather than a whitespace-joined string, so a
    # file name containing spaces stays in a single column.
    row = [str(file_name)]
    row += [
        f'{np.mean(values)}'
        for values in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr)
    ]
    row += [f'{np.mean(coefficient)}' for coefficient in mfcc]
    if label is not None:
        row.append(str(label))

    with open(output_path, 'a', newline='') as csv_file:
        csv.writer(csv_file).writerow(row)

In [None]:
# Chromagram of a sample track. The samples go into their own variable so the
# feature matrix `x` is not overwritten, and they are passed by keyword
# (`y=`) as current librosa releases require.
hop_length = 512
track_audio, sr = librosa.load('Downloads/Action-Rock.wav')
chromagram = librosa.feature.chroma_stft(y=track_audio, sr=sr, hop_length=hop_length)
fig, ax = plt.subplots(figsize=(15, 3))
img = librosa.display.specshow(chromagram, sr=sr, x_axis='time', y_axis='chroma',
                               hop_length=hop_length, cmap='coolwarm', ax=ax)
fig.colorbar(img, ax=ax)

In [38]:
# predict and print the class of one audio file
def print_prediction(file_name):
    """Print the class predicted for ``file_name``.

    Uses whatever objects the notebook-level names ``model`` and ``le`` are
    bound to when the function is called: ``model.predict`` scores the
    features returned by ``extract_feature``, the highest-scoring column is
    taken per row, and ``le.inverse_transform`` maps that index to a label.

    Parameters
    ----------
    file_name : path of the audio file to classify.

    Returns
    -------
    None. The label is printed.
    """
    features = extract_feature(file_name)
    class_scores = model.predict(features)
    class_ids = np.argmax(class_scores, axis=-1)
    class_labels = le.inverse_transform(class_ids)
    print("The predicted class is:", class_labels[0], '\n')

In [42]:
# Recording to classify
file_name = "../input/ushkok/kamranaushko.wav"

# Print the predicted class for it
print_prediction(file_name)

# Embed an audio player for the same file as the cell output
ipd.Audio(data=file_name)

In [57]:
def classify(model,x_train,y_train,x_test,y_test,target_names=None):
    """Fit ``model``, then print a classification report and the accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit(x, y)`` and ``predict(x)``.
    x_train, y_train : training features and targets.
    x_test, y_test : held-out features and targets used for the report.
    target_names : optional display names, one per class, given in the order
        of the encoded classes (for a ``LabelEncoder`` that is the sorted
        order of ``classes_``). When omitted, the report shows the raw class
        ids. No names are hard-coded, so they cannot be paired with the
        wrong classes and the function works for any number of classes.

    Returns
    -------
    None. The report and accuracy are printed.
    """
    from sklearn.metrics import classification_report
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(classification_report(y_test, y_pred, target_names=target_names, digits=4))
    print("Accuracy: "+ str(accuracy_score(y_test, y_pred)))

In [58]:
def knn_error(k,x_train,y_train,x_test,y_test):
    """Sweep the neighbour count of a k-NN classifier and plot the error.

    For every neighbour count in ``range(1, k)`` a ``KNeighborsClassifier``
    is fitted on the training data and scored on the test data; the error is
    the mean of the element-wise ``prediction != y_test`` comparison.

    Returns
    -------
    The neighbour count with the lowest error (the first one on ties).
    """
    K = range(1, k)
    error_rate = []
    for n_neighbors in K:
        fitted = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors).fit(x_train, y_train)
        mismatches = fitted.predict(x_test) != y_test
        error_rate.append(np.mean(mismatches))

    # position of the first minimum
    lowest = min(error_rate)
    kloc = error_rate.index(lowest)
    print("Lowest error is %s occurs at k=%s." % (lowest, K[kloc]))

    plt.plot(K, error_rate, marker='o', linestyle='dashed', color='blue',
             markersize=10, markerfacecolor='red')
    plt.title('Error Rate vs. K Value')
    plt.xlabel('K')
    plt.ylabel('Error Rate')
    plt.show()
    return K[kloc]

In [59]:
# Try neighbour counts 1..20 and keep the one with the lowest test error
k = knn_error(k=21, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [60]:
# Fit and report a k-NN classifier with the selected neighbour count.
# Note: this rebinds `model`, the name print_prediction reads at call time.
model = neighbors.KNeighborsClassifier(n_neighbors=k)
classify(model=model, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [61]:
#Find the best parameter to prune the tree
def dt_error(n,x_train,y_train,x_test,y_test):
    """Sweep ``max_leaf_nodes`` of a decision tree and plot the error.

    For every leaf budget in ``range(2, n)`` a ``DecisionTreeClassifier`` is
    fitted on the training data and scored on the test data; the error is
    the mean of the element-wise ``prediction != y_test`` comparison.

    Returns
    -------
    The leaf budget with the lowest error (the first one on ties).
    """
    nodes = range(2, n)
    error_rate = []
    for k in nodes:
        model = tree.DecisionTreeClassifier(max_leaf_nodes=k)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        error_rate.append(np.mean(y_pred != y_test))
    kloc = error_rate.index(min(error_rate))
    print("Lowest error is %s occurs at n=%s." % (error_rate[kloc], nodes[kloc]))
    plt.plot(nodes, error_rate, color='blue', linestyle='dashed', marker='o',
             markerfacecolor='red', markersize=10)
    # The curve is the held-out misclassification rate computed above, not a
    # cross-validated MSE, so it is labelled accordingly.
    plt.title('Error Rate vs. Tree Size')
    plt.xlabel('Tree Size')
    plt.ylabel('Error Rate')
    plt.show()
    return nodes[kloc]

In [62]:
# Try leaf budgets 2..9 and keep the one with the lowest test error
n = dt_error(n=10, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [63]:
# Fit and report a Gini decision tree limited to the selected number of leaves
pruned_tree = tree.DecisionTreeClassifier(max_leaf_nodes=n, criterion='gini')
classify(model=pruned_tree, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [64]:
def rf_error(n,x_train,y_train,x_test,y_test):
    """Sweep the tree count of a random forest and plot the error.

    For every tree count in ``range(1, n, 20)`` a ``RandomForestClassifier``
    is fitted on the training data and scored on the test data; the error is
    the mean of the element-wise ``prediction != y_test`` comparison.

    Returns
    -------
    The tree count with the lowest error (the first one on ties).
    """
    e = range(1, n, 20)
    error_rate = []
    for n_estimators in e:
        forest = ensemble.RandomForestClassifier(n_estimators=n_estimators).fit(x_train, y_train)
        mismatches = forest.predict(x_test) != y_test
        error_rate.append(np.mean(mismatches))

    # position of the first minimum
    lowest = min(error_rate)
    nloc = error_rate.index(lowest)
    print("Lowest error is %s occurs at n=%s." % (lowest, e[nloc]))

    plt.plot(e, error_rate, marker='o', linestyle='dashed', color='blue',
             markersize=10, markerfacecolor='red')
    plt.title('Error Rate vs. n Value')
    plt.xlabel('n')
    plt.ylabel('Error Rate')
    plt.show()
    return e[nloc]

In [65]:
# Try forests of 1, 21, 41, 61 and 81 trees and keep the best tree count
e = rf_error(n=100, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [66]:
# Fit and report a random forest with the selected number of trees.
# Note: this rebinds `model`, the name print_prediction reads at call time.
model = ensemble.RandomForestClassifier(n_estimators=e)
classify(model=model, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [67]:
def nn_error(n,x_train,y_train,x_test,y_test):
    """Sweep the width of a one-hidden-layer MLP and plot the error.

    For every width in ``range(1, n)`` an ``MLPClassifier`` (adam solver,
    logistic activation, ``alpha=1e-5``, ``random_state=17``, up to 2000
    iterations) is fitted on the training data and scored on the test data;
    the error is the mean of the element-wise ``prediction != y_test``
    comparison.

    Returns
    -------
    The hidden-layer width with the lowest error (the first one on ties).
    """
    error_rate = []
    hidden_layer=range(1,n)
    for i in hidden_layer:
        model = neural_network.MLPClassifier(solver='adam', alpha=1e-5,
                                       hidden_layer_sizes=(i,),
                                       activation='logistic',random_state=17,
                                       max_iter=2000)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        error_rate.append(np.mean(y_pred != y_test))
    kloc = error_rate.index(min(error_rate))
    # The swept quantity is the hidden-layer size, so report it as such
    # (the earlier message called it "C").
    print("Lowest error is %s occurs at size=%s." % (error_rate[kloc], hidden_layer[kloc]))

    plt.plot(hidden_layer, error_rate, color='blue', linestyle='dashed', marker='o',
             markerfacecolor='red', markersize=10)
    plt.title('Error Rate vs. Hidden Layer Size')
    plt.xlabel('Size')
    plt.ylabel('Error Rate')
    plt.show()
    return hidden_layer[kloc]

In [68]:
# Try hidden-layer widths 1..19 and keep the one with the lowest test error
h = nn_error(n=20, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [69]:
# Fit and report an MLP with the selected hidden-layer width.
# Note: this rebinds `model`, the name print_prediction reads at call time.
model = neural_network.MLPClassifier(
    hidden_layer_sizes=h,
    activation='logistic',
    solver='adam',
    alpha=1e-5,
    max_iter=2000,
    random_state=17,
)
classify(model=model, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)

In [71]:
# Recording to classify
file_name = "../input/ushkok/kamranaushko.wav"

# Print the predicted class for it
print_prediction(file_name)

# Embed an audio player for the same file as the cell output
ipd.Audio(data=file_name)

In [None]:
# Read all frames of the wav file as bytes; the context manager closes the
# file once the data has been read
with wave.open('../input/kamran/kamranushka.wav', 'r') as sf_filewave:
    signal_sf = sf_filewave.readframes(-1)
    # Frame rate of the recording
    framerate_sf = sf_filewave.getframerate()
# Convert audio bytes to integers
# NOTE(review): assumes 16-bit mono PCM samples -- confirm for this file
soundwave_sf = np.frombuffer(signal_sf, dtype='int16')
# Timestamp of every sample, in seconds
time_sf = np.linspace(start=0,
                      stop=len(soundwave_sf)/framerate_sf,
                      num=len(soundwave_sf))
# Set up plot. plt.subplots() returns a single Axes here, so it is used
# directly (it cannot be indexed); the figure is named `fig` so that the
# metadata DataFrame `f` is not overwritten.
fig, ax = plt.subplots(figsize=(15, 3))
# Setup the title and axis titles
ax.set_title('Amplitude over Time')
ax.set_ylabel('Amplitude')
ax.set_xlabel('Time (seconds)')
# Add the audio data to the plot
ax.plot(time_sf, soundwave_sf, label='Warm Memories', alpha=0.5)
ax.legend()
plt.show()

In [None]:
# Read all frames of the wav file as bytes; the context manager closes the
# file once the data has been read
with wave.open('../input/farida/farida.wav', 'r') as sf_filewave:
    signal_sf = sf_filewave.readframes(-1)
    # Frame rate of the recording
    framerate_sf = sf_filewave.getframerate()
# Convert audio bytes to integers
# NOTE(review): assumes 16-bit mono PCM samples -- confirm for this file
soundwave_sf = np.frombuffer(signal_sf, dtype='int16')
# Timestamp of every sample, in seconds
time_sf = np.linspace(start=0,
                      stop=len(soundwave_sf)/framerate_sf,
                      num=len(soundwave_sf))
# Set up plot. plt.subplots() returns a single Axes here, so it is used
# directly (it cannot be indexed); the figure is named `fig` so that the
# metadata DataFrame `f` is not overwritten.
fig, ax = plt.subplots(figsize=(15, 3))
# Setup the title and axis titles
ax.set_title('Amplitude over Time')
ax.set_ylabel('Amplitude')
ax.set_xlabel('Time (seconds)')
# Add the audio data to the plot
ax.plot(time_sf, soundwave_sf, label='Warm Memories', alpha=0.5)
ax.legend()
plt.show()

In [None]:
# Load the recording into its own variable so the feature matrix `x` is not
# overwritten
spec_audio, sr = librosa.load('../input/farida/farida.wav')
# Spectrogram: STFT magnitude converted to decibels
X = librosa.stft(spec_audio)
Xdb = librosa.amplitude_to_db(abs(X))
plt.figure(figsize=(15, 3))
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='hz')
plt.title('Spectrogram (dB)')
plt.colorbar()

In [None]:
# Load the recording into its own variable so the label array `y` is not
# overwritten
rms_audio, sr = librosa.load('../input/farida/farida.wav')
# Get RMS value from each frame's magnitude value
S, phase = librosa.magphase(librosa.stft(rms_audio))
rms = librosa.feature.rms(S=S)
# Two stacked panels sharing the time axis: RMS energy on top (log scale),
# spectrogram below
fig, ax = plt.subplots(figsize=(15, 6), nrows=2, sharex=True)
times = librosa.times_like(rms)
ax[0].semilogy(times, rms[0], label='RMS Energy')
ax[0].set(xticks=[])
ax[0].legend()
ax[0].label_outer()
librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
                         y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='log Power spectrogram')

# ANN MODEL

In [None]:
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from PIL import Image
import pathlib
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import keras
from keras import layers
from keras import layers
import keras
from keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')