In [None]:
from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures, MidTermFeatures
import os
import numpy as np
import json
from pydub import AudioSegment
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

# Extract Medium-Term Features

In [None]:
# Directory holding the raw audio recordings that the extraction cells scan.
audio_files_dir = 'data/Audio_Data'
# Directory that receives one .npy feature matrix per labelled recording.
feature_vectors_dir = 'medium_term_audio_vectors'

# JSON file whose keys are audio file names (extension included) and whose
# values are the labels later collected into y.
audio_labels_path = 'audio_labels_dict.json'
# JSON is UTF-8 by specification; pass the encoding explicitly so the result
# does not depend on the platform's default text encoding.
with open(audio_labels_path, 'r', encoding='utf-8') as file:
    audio_labels_dict = json.load(file)

In [None]:
# Window length and step for sub-windows, in seconds (100 ms each).
win = step = 0.1

# Mid-term feature extraction settings, in seconds: 1 s mid-term windows
# advanced by 1 s, computed over 50 ms short-term frames advanced by 50 ms.
# They are multiplied by the sampling rate where they are used.
mid_window = mid_step = 1.0
short_window = short_step = 0.050

def convert_to_wav(m4a_file_path, wav_file_path):
    """Decode an .m4a recording and export it as a WAV file.

    Args:
        m4a_file_path: Path of the source audio, read with format 'm4a'.
        wav_file_path: Destination passed to ``AudioSegment.export``.

    Returns:
        The value returned by ``AudioSegment.export`` (presumably the output
        file handle -- TODO confirm against the pydub version in use).
    """
    return AudioSegment.from_file(m4a_file_path, format='m4a').export(
        wav_file_path, format='wav'
    )

def extract_features(audio_file_path):
    """Compute the mid-term feature matrix of one audio file.

    The signal is read, passed through ``stereo_to_mono`` and handed to
    ``MidTermFeatures.mid_feature_extraction``. Window and step lengths come
    from the module-level ``mid_window``, ``mid_step``, ``short_window`` and
    ``short_step`` settings, each multiplied by the file's sampling rate.

    Args:
        audio_file_path: Path of the audio file to analyse.

    Returns:
        The transposed mid-term feature array (presumably one row per
        mid-term window -- TODO confirm). Its shape is printed as a side
        effect.
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_file_path)
    mono_signal = audioBasicIO.stereo_to_mono(signal)

    # Only the mid-term features are used; the short-term features and the
    # feature names returned alongside them are discarded.
    mid_features, _, _ = MidTermFeatures.mid_feature_extraction(
        mono_signal,
        sampling_rate,
        mid_window * sampling_rate,
        mid_step * sampling_rate,
        short_window * sampling_rate,
        short_step * sampling_rate,
    )

    feature_matrix = np.transpose(mid_features)
    print(f'shape: {feature_matrix.shape}')
    return feature_matrix

# np.save does not create missing directories, so make sure the output
# directory exists before the first feature matrix is written.
os.makedirs(feature_vectors_dir, exist_ok=True)

# sorted() gives a reproducible processing order; os.listdir's is arbitrary.
for filename in sorted(os.listdir(audio_files_dir)):
    # Only recordings that have a label are turned into feature vectors.
    if filename not in audio_labels_dict:
        continue

    stem, extension = os.path.splitext(filename)
    extension = extension.lower()
    source_path = os.path.join(audio_files_dir, filename)

    if extension == '.m4a':
        # Build the WAV path from the stem rather than str.replace: replace is
        # case-sensitive (so 'x.M4A' would map onto itself and the export
        # would target the source file) and would also rewrite '.m4a'
        # occurring anywhere else in the path.
        wav_file_path = os.path.join(audio_files_dir, stem + '.wav')
        convert_to_wav(source_path, wav_file_path)
        features = extract_features(wav_file_path)
    elif extension == '.mp3':
        # .mp3 files are passed to the extractor directly.
        features = extract_features(source_path)
    else:
        continue

    # Save the feature vectors to a file named after the recording's stem.
    feature_vector_path = os.path.join(feature_vectors_dir, stem + '.npy')
    np.save(feature_vector_path, features)
    print(f'Processed {filename}')

In [None]:
# Feature matrices are the .npy files only; any other directory entry (hidden
# OS files, checkpoint folders, ...) is ignored instead of being counted or
# passed to np.load. Sorting fixes one order that both X and y are built
# from, because os.listdir makes no ordering guarantee.
npy_filenames = sorted(
    name for name in os.listdir(feature_vectors_dir) if name.endswith('.npy')
)
if not npy_filenames:
    raise FileNotFoundError(f'No .npy feature files found in {feature_vectors_dir!r}')

# Labels are keyed by the original audio file name, so map every feature file
# back to its '.mp3' or '.m4a' key ('.mp3' is tried first).
label_keys = {}
for filename in npy_filenames:
    stem = filename[:-len('.npy')]
    for audio_name in (stem + '.mp3', stem + '.m4a'):
        if audio_name in audio_labels_dict:
            label_keys[filename] = audio_name
            break

# Sanity check: every feature file must have a label.
unlabelled = [name for name in npy_filenames if name not in label_keys]
assert not unlabelled, f'Feature files without a label: {unlabelled}'

y = torch.tensor(
    [audio_labels_dict[label_keys[name]] for name in npy_filenames],
    dtype=torch.float32,
)

# First pass: find the largest number of rows and the feature width.
max_rows = 0
embedding_size = None
for filename in npy_filenames:
    data = np.load(os.path.join(feature_vectors_dir, filename))
    max_rows = max(max_rows, data.shape[0])
    embedding_size = data.shape[1]

print(f"The maximum value of n (number of rows) among all .npy files is: {max_rows}")

# Second pass: copy each matrix into the leading rows of its slot. X starts
# as zeros, so the rows past the end of a shorter matrix are the zero padding.
X = torch.zeros(len(y), max_rows, embedding_size)
for index, filename in enumerate(npy_filenames):
    features = torch.from_numpy(np.load(os.path.join(feature_vectors_dir, filename)))
    X[index, :features.shape[0]] = features

X_np = X.numpy()
y_np = y.numpy()
np.savez('X-y-medium-term-audio.npz', X=X_np, y=y_np)

# Extract Short-Term Features

In [None]:
# Re-assigns feature_vectors_dir (previously the medium-term directory) so the
# cells below write and read the short-term feature vectors instead.
feature_vectors_dir = 'short_term_audio_vectors'

In [None]:
# Short-term feature extraction settings, in seconds: 50 ms frames advanced
# by 25 ms. These override the values used for the mid-term extraction.
short_window, short_step = 0.050, 0.025

def convert_to_wav(m4a_file_path, wav_file_path):
    """Read an .m4a file and write its audio out in WAV format.

    Same definition as the ``convert_to_wav`` that appears earlier in this
    notebook; repeated here so this section can run on its own.

    Args:
        m4a_file_path: Path of the input file, opened with format 'm4a'.
        wav_file_path: Output location given to ``AudioSegment.export``.

    Returns:
        Whatever ``AudioSegment.export`` returns (presumably the output file
        handle -- TODO confirm against the pydub version in use).
    """
    segment = AudioSegment.from_file(m4a_file_path, format='m4a')
    return segment.export(wav_file_path, format='wav')

def extract_features(audio_file_path):
    """Compute the short-term feature matrix of one audio file.

    This redefinition replaces the mid-term ``extract_features`` defined
    earlier in the notebook. The signal is read, passed through
    ``stereo_to_mono`` and handed to ``ShortTermFeatures.feature_extraction``
    with the module-level ``short_window`` and ``short_step`` settings, each
    multiplied by the file's sampling rate.

    Args:
        audio_file_path: Path of the audio file to analyse.

    Returns:
        The transposed short-term feature array (presumably one row per
        short-term frame -- TODO confirm). Its shape is printed as a side
        effect.
    """
    sampling_rate, signal = audioBasicIO.read_audio_file(audio_file_path)
    mono_signal = audioBasicIO.stereo_to_mono(signal)

    # The feature names returned alongside the features are not needed here.
    short_features, _ = ShortTermFeatures.feature_extraction(
        mono_signal,
        sampling_rate,
        short_window * sampling_rate,
        short_step * sampling_rate,
    )

    feature_matrix = np.transpose(short_features)
    print(f'shape: {feature_matrix.shape}')
    return feature_matrix

# np.save does not create missing directories, so make sure the output
# directory exists before the first feature matrix is written.
os.makedirs(feature_vectors_dir, exist_ok=True)

# sorted() gives a reproducible processing order; os.listdir's is arbitrary.
for filename in sorted(os.listdir(audio_files_dir)):
    # Only recordings that have a label are turned into feature vectors.
    if filename not in audio_labels_dict:
        continue

    stem, extension = os.path.splitext(filename)
    extension = extension.lower()
    source_path = os.path.join(audio_files_dir, filename)

    if extension == '.m4a':
        # Build the WAV path from the stem rather than str.replace: replace is
        # case-sensitive (so 'x.M4A' would map onto itself and the export
        # would target the source file) and would also rewrite '.m4a'
        # occurring anywhere else in the path.
        wav_file_path = os.path.join(audio_files_dir, stem + '.wav')
        convert_to_wav(source_path, wav_file_path)
        features = extract_features(wav_file_path)
    elif extension == '.mp3':
        # .mp3 files are passed to the extractor directly.
        features = extract_features(source_path)
    else:
        continue

    # Save the feature vectors to a file named after the recording's stem.
    feature_vector_path = os.path.join(feature_vectors_dir, stem + '.npy')
    np.save(feature_vector_path, features)
    print(f'Processed {filename}')

In [None]:
# Feature matrices are the .npy files only; any other directory entry (hidden
# OS files, checkpoint folders, ...) is ignored instead of being counted or
# passed to np.load. Sorting fixes one order that both X and y are built
# from, because os.listdir makes no ordering guarantee.
npy_filenames = sorted(
    name for name in os.listdir(feature_vectors_dir) if name.endswith('.npy')
)
if not npy_filenames:
    raise FileNotFoundError(f'No .npy feature files found in {feature_vectors_dir!r}')

# Labels are keyed by the original audio file name, so map every feature file
# back to its '.mp3' or '.m4a' key ('.mp3' is tried first).
label_keys = {}
for filename in npy_filenames:
    stem = filename[:-len('.npy')]
    for audio_name in (stem + '.mp3', stem + '.m4a'):
        if audio_name in audio_labels_dict:
            label_keys[filename] = audio_name
            break

# Sanity check: every feature file must have a label.
unlabelled = [name for name in npy_filenames if name not in label_keys]
assert not unlabelled, f'Feature files without a label: {unlabelled}'

y = torch.tensor(
    [audio_labels_dict[label_keys[name]] for name in npy_filenames],
    dtype=torch.float32,
)

# First pass: find the largest number of rows and the feature width.
max_rows = 0
embedding_size = None
for filename in npy_filenames:
    data = np.load(os.path.join(feature_vectors_dir, filename))
    max_rows = max(max_rows, data.shape[0])
    embedding_size = data.shape[1]

print(f"The maximum value of n (number of rows) among all .npy files is: {max_rows}")

# Second pass: copy each matrix into the leading rows of its slot. X starts
# as zeros, so the rows past the end of a shorter matrix are the zero padding.
X = torch.zeros(len(y), max_rows, embedding_size)
for index, filename in enumerate(npy_filenames):
    features = torch.from_numpy(np.load(os.path.join(feature_vectors_dir, filename)))
    X[index, :features.shape[0]] = features

X_np = X.numpy()
y_np = y.numpy()
np.savez('X-y-short-term-audio.npz', X=X_np, y=y_np)