# Preprocessing steps:
0. Get a single label for each audio clip first, because the raw annotations are currently a distribution of user responses rather than one label per clip
1. Convert the audio files into log_spectrograms.
2. Parameters:
- Sample rate: 22050Hz
- Length of audio: 10 second fragments
- 149 point spectrograms
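- Final dimension per 10 second clip: 313*149 (note: the extraction code in section 1 uses n_fft = 2048 and hop_length = 512 at 22050Hz and drops one frequency bin, which should give 1024 x 431 arrays per clip, so these figures do not match the code as written)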
- n_fft = 2048
- hop_length = 512


# 0. Getting the labels
- Currently each track's annotations are a distribution of listener responses across the nine emotions, and we need to convert them into a single label per track.
- For simplification, we average the responses for each track and use the emotion with the highest mean score as the label.



In [113]:
# Mount Google Drive (Colab only) so the files under /content/drive are accessible.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
import pandas as pd
import numpy as np

# Load the raw per-listener annotations and derive each row's audio file path.
metadata = pd.read_csv("/content/drive/MyDrive/emotifymusic/Music Emotion Dataset.csv")
metadata = metadata.rename(columns={' joyful_activation': ' joyful'})

# Fold track ids into the range 1..100: a remainder of 0 stands for track 100.
track_number = (metadata["track id"] % 100).replace(0, 100)
metadata["track id"] = track_number

# Audio files are addressed as "<genre>/<track number>.mp3".
metadata["file_path"] = metadata[" genre"] + "/" + track_number.astype(str) + ".mp3"
metadata.head()

Unnamed: 0,track id,genre,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful,tension,sadness,mood,liked,disliked,age,gender,mother tongue,file_path
0,1,classical,0,1,0,0,0,0,1,1,0,3,1,0,21,1,English,classical/1.mp3
1,1,classical,0,0,0,1,0,0,0,0,0,3,0,1,41,1,Dutch,classical/1.mp3
2,1,classical,0,0,0,1,0,0,0,0,1,3,0,0,24,1,English,classical/1.mp3
3,1,classical,0,0,0,0,1,0,0,0,0,3,0,0,32,0,Spanish,classical/1.mp3
4,1,classical,0,0,0,1,1,0,0,0,0,4,0,1,21,0,English,classical/1.mp3


In [67]:
# Average the responses of all listeners per track: one mean score per emotion.
emotion_columns = [
    ' amazement', ' solemnity', ' tenderness', ' nostalgia', ' calmness',
    ' power', ' joyful', ' tension', ' sadness',
]
mean_scores = metadata.groupby('file_path')[emotion_columns].mean()
mean_scores.head()

Unnamed: 0_level_0,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful,tension,sadness
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
classical/1.mp3,0.145833,0.333333,0.208333,0.291667,0.625,0.020833,0.083333,0.0625,0.3125
classical/10.mp3,0.276596,0.212766,0.021277,0.042553,0.021277,0.510638,0.638298,0.212766,0.021277
classical/100.mp3,0.117647,0.470588,0.0,0.117647,0.176471,0.529412,0.117647,0.294118,0.117647
classical/11.mp3,0.042553,0.297872,0.234043,0.382979,0.361702,0.042553,0.06383,0.12766,0.382979
classical/12.mp3,0.313725,0.058824,0.156863,0.156863,0.156863,0.078431,0.745098,0.078431,0.039216


In [68]:
# Label each track with the name of the emotion column holding its highest mean score.
mean_scores['label'] = mean_scores.idxmax(axis=1)
mean_scores.head()

Unnamed: 0_level_0,amazement,solemnity,tenderness,nostalgia,calmness,power,joyful,tension,sadness,label
file_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
classical/1.mp3,0.145833,0.333333,0.208333,0.291667,0.625,0.020833,0.083333,0.0625,0.3125,calmness
classical/10.mp3,0.276596,0.212766,0.021277,0.042553,0.021277,0.510638,0.638298,0.212766,0.021277,joyful
classical/100.mp3,0.117647,0.470588,0.0,0.117647,0.176471,0.529412,0.117647,0.294118,0.117647,power
classical/11.mp3,0.042553,0.297872,0.234043,0.382979,0.361702,0.042553,0.06383,0.12766,0.382979,nostalgia
classical/12.mp3,0.313725,0.058824,0.156863,0.156863,0.156863,0.078431,0.745098,0.078431,0.039216,joyful


In [72]:
# One genre entry per track, 100 per genre; not attached to the table in this cell.
genre = [name for name in ("classical", "electronic", "pop", "rock") for _ in range(100)]
# Keep only the label column; file_path stays as the index.
processed_metadata = mean_scores.loc[:, ['label']]
#processed_metadata["genre"] = genre
processed_metadata.head()

Unnamed: 0_level_0,label
file_path,Unnamed: 1_level_1
classical/1.mp3,calmness
classical/10.mp3,joyful
classical/100.mp3,power
classical/11.mp3,nostalgia
classical/12.mp3,joyful


In [73]:
# Turn the file_path index back into an ordinary column with a default integer index.
processed_metadata = processed_metadata.reset_index(drop=False)
processed_metadata.head()

Unnamed: 0,file_path,label
0,classical/1.mp3,calmness
1,classical/10.mp3,joyful
2,classical/100.mp3,power
3,classical/11.mp3,nostalgia
4,classical/12.mp3,joyful


In [74]:
#processed_metadata.to_csv("/content/drive/MyDrive/emotifymusic/processed_metadata.csv")

# 1. Converting the audio files into spectrograms

In [109]:
# Reload the processed labels, using the first CSV column as the row index.
processed_csv_path = "/content/drive/MyDrive/emotifymusic/processed_metadata.csv"
metadata = pd.read_csv(processed_csv_path, index_col=0)
# Sanity check: show the file path stored under row label 0.
metadata["file_path"].loc[0]

'classical/1.mp3'

In [114]:
import librosa
# Input (raw audio) and output (spectrogram arrays) locations on the mounted drive.
root_path = "/content/drive/MyDrive/emotifymusic/"
root_output_path = "/content/drive/MyDrive/emotifymusic/spectrograms/"

# Audio is loaded at this rate and cut into fixed-length excerpts.
sample_rate = 22050
segment_seconds = 10  # The paper calls for 10 second excerpts
samples_per_segment = int(segment_seconds * sample_rate)
num_segments = 6

# The position of an emotion in this list is the integer class id stored for it.
mapping = [
    ' amazement',
    ' solemnity',
    ' tenderness',
    ' nostalgia',
    ' calmness',
    ' power',
    ' joyful',
    ' tension',
    ' sadness',
]

# Accumulator filled by prepare_audio: parallel lists with one entry per segment.
data = {key: [] for key in ("file_path", "output_path", "label")}

def split_audio(audio_path):
  """Load an audio file and cut it into equal-length, amplitude-normalised segments.

  The file is loaded at ``sample_rate``, normalised with
  ``librosa.util.normalize`` and sliced into up to ``num_segments``
  consecutive chunks of ``samples_per_segment`` samples each. A chunk that
  would run past the end of the signal is dropped, so a short file yields
  fewer segments (possibly none).

  Args:
    audio_path: Path of the audio file to load.

  Returns:
    A list of sample arrays, each exactly ``samples_per_segment`` long.
  """
  signal, _ = librosa.load(audio_path, sr=sample_rate)
  # Normalise amplitude as the paper calls for it. The segments must be cut
  # from the normalised signal; previously the normalised copy was computed
  # and then ignored.
  signal = librosa.util.normalize(signal)
  segments = []
  for i in range(num_segments):
    start = i * samples_per_segment
    segment = signal[start:start + samples_per_segment]
    # Keep only full-length segments so every spectrogram has the same shape.
    if len(segment) == samples_per_segment:
      segments.append(segment)
  return segments

def prepare_audio(segments,file_path,label):
  """Save a log-spectrogram for every segment and record it in ``data``.

  Each segment is converted to a magnitude STFT (n_fft=2048, hop_length=512,
  last frequency bin dropped), converted to decibels and written to
  ``root_output_path`` as ``<file_path with "/" replaced by "_">_<i>.npy``,
  where ``i`` is the segment's position in ``segments``. The segment index
  is part of the file name so that segments of the same track no longer
  overwrite each other on disk.

  Args:
    segments: Sequence of audio sample arrays belonging to one track.
    file_path: Relative path of the source audio file (e.g. "classical/1.mp3").
    label: Emotion name; must be an element of ``mapping``.

  Returns:
    The module-level ``data`` dict, extended in place with one entry per
    segment ("file_path", "output_path" without the ".npy" suffix, and the
    integer "label" id).

  Raises:
    ValueError: If ``label`` is not present in ``mapping``.
  """
  base_name = file_path.replace("/","_")
  for i, segment in enumerate(segments): # For each 10sec segment
    stft = librosa.stft(segment, n_fft=2048, hop_length=512)[:-1] # Get short time fourier transform frames
    log_spectrogram = librosa.amplitude_to_db(np.abs(stft))
    label_id = mapping.index(label)
    # Unique name per segment; the bare base name was shared by all segments.
    output_path = base_name + "_" + str(i)
    # Write the array first so `data` never lists a file that failed to save.
    np.save(root_output_path + output_path + ".npy", log_spectrogram)
    data["file_path"].append(file_path)
    data["output_path"].append(output_path)
    data["label"].append(label_id)

  return data

def prepare_dataset(metadata):
  """Split every track listed in ``metadata`` and save its spectrograms.

  For each row, the audio file at ``root_path + file_path`` is split with
  ``split_audio`` and the segments are handed to ``prepare_audio``, which
  writes the spectrogram files and appends the bookkeeping entries to the
  module-level ``data`` dict.

  Args:
    metadata: Table with a "file_path" column (path relative to
      ``root_path``) and a "label" column (emotion name).

  Returns:
    None. Results are accumulated in ``data`` and on disk.
  """
  for file_path, label in zip(metadata["file_path"], metadata["label"]):
    segments = split_audio(root_path + file_path)
    # prepare_audio mutates `data` in place and returns that same dict, so
    # feeding its return value back into data.update() was a no-op.
    prepare_audio(segments, file_path, label)
  return None

# Process every track listed in the metadata table loaded above.
prepare_dataset(metadata)


In [117]:
import json  # required by json.dump below; not imported by any earlier cell shown

# Persist the per-segment bookkeeping (source file, output name, label id) as JSON.
with open("/content/drive/MyDrive/emotifymusic/labels.json", 'w') as f:
    json.dump(data, f, indent=4)  # Use indent to pretty-print JSON with 4 spaces