## 1. Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import cv2
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import librosa
from scipy.io import wavfile

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D,BatchNormalization, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [3]:
dir25 = "D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio"
data25 = os.listdir(dir25)
file_dir25 = []
count = 0

for mooddir25 in data25:
    if(not r'.' in mooddir25):
        moodDirName25 = os.path.join(dir25,mooddir25)
        mooddir25=np.array(os.listdir(moodDirName25))
        for audio in mooddir25:
            if(audio[-3:]=='mp3'):
                count+=1
                fn=os.path.join(moodDirName25,audio)
                file_dir25.append(fn)
                print(str(count),fn)

1 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\adana_kopru_basi_murat_kursun.mp3
2 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\ah_nerede_vah_nerede_fusun_onal.mp3
3 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\altun_kardesler_erik_dali.mp3
4 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\antebin_hamamlar_cesitli_sanatci.mp3
5 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\antebin_kalesine_ibrahim_tatlises.mp3
6 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\arap_kizi_ibrahim_tatlises.mp3
7 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\arpa_bugday_daneler_muazzez_ersoy.mp3
8 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\arzu_kus_adana_kopru_basi.mp3
9 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\askimiz_bitecek_cansu_koc.mp3
10 D:/UM/Project/Mozartify/project/Datasets/audio/trimmed_audio\happy\ayas_

## 2. Data Augmentation

In [4]:
#add noise
def add_noise(data):
    wn = np.random.normal(0, 1, len(data))
    return np.where(data != 0.0, data.astype('float64') + 0.02 * wn, 0.0).astype(np.float32)

In [5]:
def time_shift(data, shift):
    # shift：shifting lenth
    return np.roll(data, int(shift))

In [6]:
for file in file_dir25:
    m = os.path.basename(os.path.dirname(file))
    f_noise_dir = os.path.join("D:/UM/Project/Mozartify/project/Datasets/audio/noise", m)
    f_shift_dir = os.path.join("D:/UM/Project/Mozartify/project/Datasets/audio/TimeShift", m)

    # Create directories if they do not exist
    os.makedirs(f_noise_dir, exist_ok=True)
    os.makedirs(f_shift_dir, exist_ok=True)

    f_noise = os.path.join(f_noise_dir, 'noise_' + os.path.basename(file))
    f_shift = os.path.join(f_shift_dir, 'shift_' + os.path.basename(file))

    data, sr = librosa.load(file)
    data_noise = add_noise(data)
    data_shift = time_shift(data, shift=sr)

    wavfile.write(f_noise, sr, data_noise)
    wavfile.write(f_shift, sr, data_shift)

In [7]:
dir_noise = "D:/UM/Project/Mozartify/project/Datasets/audio/Noise/"
data_noise = os.listdir(dir_noise)
file_noise = []
count = 0

for mooddir in data_noise:
    if(not r'.' in mooddir):
        moodDirName = os.path.join(dir_noise,mooddir)
        mooddir=np.array(os.listdir(moodDirName))
        for audio in mooddir:
            if(audio[-3:]=='mp3'):
                count+=1
                fn=os.path.join(moodDirName,audio)
                file_noise.append(fn)

In [8]:
duration_noise = []
for i in range(len(file_noise)):
    y,sr = librosa.load(file_noise[i])
    d = librosa.get_duration(y=y,sr=sr)
    duration_noise.append(d)

In [9]:
for i in range(len(duration_noise)):
    if duration_noise[i]!=25.5:
        print(i,duration_noise[i])

235 10.08
648 9.783990929705215


In [10]:
dir_shift = "D:/UM/Project/Mozartify/project/Datasets/audio/TimeShift/"
data_shift = os.listdir(dir_shift)
file_shift = []

for mooddir in data_shift:
    if(not r'.' in mooddir):
        moodDirName = os.path.join(dir_shift,mooddir)
        mooddir=np.array(os.listdir(moodDirName))
        for audio in mooddir:
            if(audio[-3:]=='mp3'):
                fn=os.path.join(moodDirName,audio)
                file_shift.append(fn)

In [11]:
duration_shift = []
for i in range(len(file_shift)):
    y,sr = librosa.load(file_shift[i])
    d = librosa.get_duration(y=y,sr=sr)
    duration_shift.append(d)

In [12]:
for i in range(len(duration_shift)):
    if duration_shift[i]!=25.5:
        print(i,duration_shift[i])

235 10.08
648 9.783990929705215


## 4. 1D Feature Extraction

In [13]:
import re
def key_scale_finder(file):
    pitches = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
    y,sr = librosa.load(file)
    key = librosa.feature.chroma_stft(y=y,sr=sr).sum(axis=1).argmax()
    if re.search("\#$",pitches[key]):
        scale = "minor"
    else:
        scale = "major"
    return scale,pitches[key]

def feature_1d(file):
    try:
        y,sr = librosa.load(file)
    except:
        print('No such file')
        quit()
    
    f = [] 
    
    # tempo
    tempo = librosa.beat.tempo(y=y,sr=sr)[0]
    f.append(tempo)  
    
    # RMS
    #S, phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rms(y=y)
    rms_mean = np.mean(rms)
    rms_var = np.var(rms)
    f.append(rms_mean)
    f.append(rms_var)
    
    # chroma features
    chroma = librosa.feature.chroma_stft(y=y,sr=sr)
    chroma_mean = np.mean(chroma)
    chroma_var = np.var(chroma)
    f.append(chroma_mean)
    f.append(chroma_var)
    
    # spectral centroid
    centroid = librosa.feature.spectral_centroid(y=y)
    centroid_mean = np.mean(centroid)
    centroid_var = np.var(centroid)
    f.append(centroid_mean)
    f.append(centroid_var)
    
    # spectral rolloff
    rolloff = librosa.feature.spectral_rolloff(y=y+0.01, sr=sr)
    rolloff_mean = np.mean(rolloff)
    rolloff_var = np.var(rolloff)
    f.append(rolloff_mean)
    f.append(rolloff_var)
    
    # zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(y=y)
    zcr_mean = np.mean(zcr)
    zcr_var = np.var(zcr)
    f.append(zcr_mean)
    f.append(zcr_var)
    
    # tonnetz
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    tonnetz_mean = np.mean(tonnetz)
    tonnetz_var  = np.var(tonnetz)
    f.append(tonnetz_mean)
    f.append(tonnetz_var)
    
    #mel
    s = librosa.feature.melspectrogram(y=y, sr=sr)
    mel = librosa.amplitude_to_db(s, ref=np.max)
    mel_mean = np.mean(mel)
    mel_var = np.var(mel)
    f.append(mel_mean)
    f.append(mel_var)
    
    #mfcc
    mfcc = librosa.feature.mfcc(y=y,sr=sr,n_mfcc=20)
    m = len(mfcc)
    mfcc_mean = np.zeros((m,))
    mfcc_var = np.zeros((m,))
    for i in range(len(mfcc)):
        mfcc_mean[i] = np.mean(mfcc[i])
        mfcc_var[i]= np.var(mfcc[i])
        f.append(mfcc_mean[i])
        f.append(mfcc_var[i])
    
    return np.array(f)

In [14]:
#Extract Key and Scale
noise_key = []
noise_scale = []
for i in range(len(file_noise)):
    noise_key.append(key_scale_finder(file_noise[i])[1])
    noise_scale.append(key_scale_finder(file_noise[i])[0])

In [15]:
#Feature Vectors
noise_features = np.zeros((len(file_noise),55))
for i in range(0,len(file_noise)):
    noise_features[i] = feature_1d(file_noise[i])

In [16]:
# Label the mood
mood_noise = []
for i in range(len(file_noise)):
    m = os.path.basename(os.path.dirname(file_noise[i]))
    mood_noise.append(m)

In [17]:
shift_key = []
shift_scale = []
for i in range(len(file_shift)):
    shift_key.append(key_scale_finder(file_shift[i])[1])
    shift_scale.append(key_scale_finder(file_shift[i])[0])

In [18]:
#Feature Vectors
shift_features = np.zeros((len(file_shift),55))
for i in range(0,len(file_shift)):
    shift_features[i] = feature_1d(file_shift[i])

In [19]:
# Label the mood
mood_shift = []
for i in range(len(file_shift)):
    m = os.path.basename(os.path.dirname(file_shift[i]))
    mood_shift.append(m)

In [20]:
# Store 15 features to data frame
col_name = ['tempo','rms_mean','rms_var','chroma_mean','chroma_var',
            'centroid_mean','centroid_var','rolloff_mean','roll_off_var',
            'zcr_mean','zcr_var','tonnetz_mean', 'tonnetz_var', 'mel_mean', 'mel_var']
# Store 20 mfccs mean and var to data frame
mfccs_col_name = []
for i in range(0,20):
    mfccs_col_name.append("mfcc_mean_"+str(i+1))
    mfccs_col_name.append("mfcc_var_"+str(i+1))

In [21]:
# Create a data frame to store all the data
features_noise = pd.DataFrame()

In [22]:
# Store file name
features_noise['file'] = file_noise
# Store key and scale
features_noise['scale'] = noise_scale
features_noise['key'] = noise_key
# Store 13 features and 20 mfccs
for i in range(len(col_name)):
    features_noise[col_name[i]] = noise_features[:,i]
for i in range(len(mfccs_col_name)):
    features_noise[mfccs_col_name[i]] = noise_features[:,i+15]
# Store mood
features_noise['mood'] = mood_noise

In [23]:
features_noise

Unnamed: 0,file,scale,key,tempo,rms_mean,rms_var,chroma_mean,chroma_var,centroid_mean,centroid_var,...,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20,mood
0,D:/UM/Project/Mozartify/project/Datasets/audio...,major,D,135.999178,0.172551,0.003279,0.346737,0.097540,2966.634004,269376.017250,...,85.576714,-1.930645,56.159031,-0.892971,51.551453,0.633405,47.186474,0.253610,38.859661,happy
1,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,107.666016,0.108430,0.000697,0.425311,0.088677,3245.481036,184365.866922,...,27.379467,-5.753342,32.791855,-4.232408,35.934994,-1.325320,49.086235,-1.749682,67.460434,happy
2,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,99.384014,0.242929,0.010846,0.438916,0.083680,3349.734719,592307.581520,...,59.253696,-4.593424,57.105923,-2.326811,42.335182,-3.384727,42.804070,-0.737544,61.920296,happy
3,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,107.666016,0.122461,0.002094,0.429079,0.077916,3441.083045,165997.534587,...,29.263926,-2.092536,28.378841,-3.750346,26.634878,-5.963063,27.701960,-6.120047,26.094809,happy
4,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,129.199219,0.194006,0.003910,0.276141,0.085043,2714.529319,122088.557930,...,22.606628,-7.715428,26.561018,-3.081796,38.821686,-1.843145,54.758949,4.226146,58.505424,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,151.999081,0.099292,0.000553,0.302301,0.086560,3586.618854,118513.526362,...,25.648636,-11.969786,31.615791,-8.434127,28.832972,-11.089932,23.921122,-12.584082,28.737524,sad
969,D:/UM/Project/Mozartify/project/Datasets/audio...,minor,G#,89.102909,0.086727,0.001535,0.288060,0.082760,3649.597341,268761.677383,...,24.422441,-10.237507,20.169052,-8.180056,21.755917,-9.295341,18.762970,-10.592243,30.263374,sad
970,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,99.384014,0.079354,0.000271,0.249582,0.087047,3687.129803,88172.109414,...,22.693632,-1.819579,22.148537,-2.206744,27.014732,1.092161,37.564590,2.067389,34.046097,sad
971,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,99.384014,0.074790,0.000973,0.291124,0.089676,3945.450852,272248.199393,...,26.464180,-9.677834,28.049032,-9.241619,24.549885,-9.959146,28.012188,-10.346671,40.555420,sad


In [24]:
# Create the Excel file
pd.ExcelWriter('D:/UM/Project/Mozartify/project/Datasets/Features/Features1D/Features_noise.xlsx')

<pandas.io.excel._xlsxwriter.XlsxWriter at 0x28510a51b80>

In [25]:
# Store the Feature into excel file
features_noise.to_excel('D:/UM/Project/Mozartify/project/Datasets/Features/Features1D/Features_noise.xlsx')

In [26]:
# Create a data frame to store all the data
features_shift = pd.DataFrame()

In [27]:
# Store file name
features_shift['file'] = file_shift
# Store key and scale
features_shift['scale'] = shift_scale
features_shift['key'] = shift_key
# Store 13 features and 20 mfccs
for i in range(len(col_name)):
    features_shift[col_name[i]] = shift_features[:,i]
for i in range(len(mfccs_col_name)):
    features_shift[mfccs_col_name[i]] = shift_features[:,i+15]
# Store mood
features_shift['mood'] = mood_shift

In [28]:
features_shift

Unnamed: 0,file,scale,key,tempo,rms_mean,rms_var,chroma_mean,chroma_var,centroid_mean,centroid_var,...,mfcc_var_16,mfcc_mean_17,mfcc_var_17,mfcc_mean_18,mfcc_var_18,mfcc_mean_19,mfcc_var_19,mfcc_mean_20,mfcc_var_20,mood
0,D:/UM/Project/Mozartify/project/Datasets/audio...,major,D,135.999178,0.171162,0.003351,0.340845,0.097614,2289.203108,3.396879e+05,...,97.515526,-3.539253,71.415993,-0.424872,66.437180,-0.199554,59.399525,0.886875,47.189739,happy
1,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,107.666016,0.106434,0.000724,0.414449,0.089709,2128.873013,3.900859e+05,...,37.903477,-9.216166,46.307808,-1.430186,50.143211,-3.562091,65.681473,1.210580,85.589287,happy
2,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,99.384014,0.241991,0.010835,0.435087,0.083844,2979.436280,1.126172e+06,...,67.475204,-6.002207,66.501785,-1.109421,48.929623,-4.635880,50.368835,0.668790,79.389671,happy
3,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,112.347147,0.120569,0.002160,0.421653,0.079912,2884.974018,1.293321e+05,...,33.690018,-4.149460,37.224216,-3.795806,32.814350,-8.422532,32.291790,-6.022736,30.629898,happy
4,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,129.199219,0.192869,0.003973,0.272747,0.084913,2147.226175,1.973454e+05,...,28.748222,-10.841578,36.262440,-0.469117,47.630962,-4.169499,60.776985,6.336962,64.402382,happy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,151.999081,0.097108,0.000583,0.294316,0.088250,1996.638353,5.530875e+05,...,52.284241,-18.933998,71.058098,-6.168517,61.867592,-16.978498,55.321728,-14.182650,64.533455,sad
969,D:/UM/Project/Mozartify/project/Datasets/audio...,minor,G#,89.102909,0.083857,0.001617,0.272484,0.085090,1331.675571,1.541154e+05,...,37.903763,-11.966993,43.538395,-5.346477,40.368793,-10.921440,37.538857,-15.324995,56.659206,sad
970,D:/UM/Project/Mozartify/project/Datasets/audio...,major,E,99.384014,0.076663,0.000291,0.237296,0.089185,1681.076015,2.767160e+05,...,46.474285,-5.677488,40.483074,-3.268615,47.944653,1.989569,70.061569,5.450746,69.814407,sad
971,D:/UM/Project/Mozartify/project/Datasets/audio...,major,A,99.384014,0.071239,0.001091,0.263241,0.092150,2075.791687,2.096039e+05,...,45.625221,-16.519905,45.574238,-11.216208,48.625031,-15.720529,54.754559,-15.151942,68.066071,sad


In [29]:
# Create the Excel file
pd.ExcelWriter('D:/UM/Project/Mozartify/project/Datasets/Features/Features1D/Features_shift.xlsx')

<pandas.io.excel._xlsxwriter.XlsxWriter at 0x2850f6d9400>

In [30]:
# Store the Feature into excel file
features_shift.to_excel('D:/UM/Project/Mozartify/project/Datasets/Features/Features1D/Features_shift.xlsx')

## 4. 2D Feature Extraction

In [31]:
n_noise = len(file_noise)
spec_noise = np.empty([n_noise, 1025, 1099])
mfccs_noise = np.empty([n_noise, 20, 1099])
mel_noise = np.empty([n_noise, 128, 1099])

n_shift = len(file_shift)
spec_shift = np.empty([n_shift, 1025, 1099])
mfccs_shift = np.empty([n_shift, 20, 1099])
mel_shift = np.empty([n_shift, 128, 1099])

In [38]:
def feature2d(file, max_frames=1099):
    try:
        y, sr = librosa.load(file)
    except:
        print('No such file')
        return None, None, None

    # Spectrogram
    stft = librosa.stft(y)
    stft_db = librosa.amplitude_to_db(abs(stft))
    spec = librosa.util.fix_length(stft_db, size=max_frames, axis=1)

    # MFCC
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfccs = librosa.util.fix_length(mfccs, size=max_frames, axis=1)

    # Mel
    s = librosa.feature.melspectrogram(y=y, sr=sr)
    mel = librosa.amplitude_to_db(s, ref=np.max)
    mel = librosa.util.fix_length(mel, size=max_frames, axis=1)

    return spec, mfccs, mel

# Initialize arrays for noise files
n_noise = len(file_noise)
spec_noise = np.empty([n_noise, 1025, 1099])
mfccs_noise = np.empty([n_noise, 20, 1099])
mel_noise = np.empty([n_noise, 128, 1099])

# Initialize arrays for shift files
n_shift = len(file_shift)
spec_shift = np.empty([n_shift, 1025, 1099])
mfccs_shift = np.empty([n_shift, 20, 1099])
mel_shift = np.empty([n_shift, 128, 1099])

# Process noise files
for i in range(n_noise):
    spec, mfccs, mel = feature2d(file_noise[i])
    if spec is not None:  # Ensure the file was processed correctly
        spec_noise[i], mfccs_noise[i], mel_noise[i] = spec, mfccs, mel

# Process shift files
for i in range(n_shift):
    spec, mfccs, mel = feature2d(file_shift[i])
    if spec is not None:  # Ensure the file was processed correctly
        spec_shift[i], mfccs_shift[i], mel_shift[i] = spec, mfccs, mel


In [39]:
def getMood(mood_list, file_list):
    for i in range(len(file_list)):
        m = os.path.basename(os.path.dirname(file_list[i]))
        mood_list.append(m)
    return mood_list

In [40]:
mood_noise_2d = []
mood_noise_2d = getMood(mood_noise_2d,file_noise)

In [41]:
mood_shift_2d = []
mood_shift_2d = getMood(mood_shift_2d,file_shift)

In [42]:
le = LabelEncoder()
moodtrans_noise =le.fit(np.array(mood_noise_2d))
moodtrans_noise=le.transform(np.array(mood_noise_2d))   

moodtrans_shift =le.fit(np.array(mood_shift_2d))
moodtrans_shift=le.transform(np.array(mood_shift_2d))

In [44]:
np.savez_compressed("D:/UM/Project/Mozartify/project/Datasets/Features/Features2D/Features_noise.npz", 
                    spec = spec_noise, 
                    mfcc = mfccs_noise,
                    mel = mel_noise,
                    target = moodtrans_noise)

In [45]:
np.savez_compressed("D:/UM/Project/Mozartify/project/Datasets/Features/Features2D/Features_shift.npz", 
                    spec = spec_shift, 
                    mfcc = mfccs_shift,
                    mel = mel_shift,
                    target = moodtrans_shift)