In [2]:
## Run using Python 3.12.1

## Importing Libraries
import numpy as np 
import pandas as pd

#%pip install librosa
import librosa

# Read the development and evaluation tables; the first CSV column (the ID) becomes the
# index, so it never has to be dropped as a feature.
train_data = pd.read_csv('DSL_Winter_Project_2025/development.csv', index_col=0)
evaluation_data = pd.read_csv('DSL_Winter_Project_2025/evaluation.csv', index_col=0)

# The 'path' column is relative to the dataset folder: prefix it with the folder name
# so the audio files can be opened from the notebook's working directory.
for frame in (train_data, evaluation_data):
    frame['path'] = 'DSL_Winter_Project_2025/' + frame['path']

In [3]:
# Data preprocessing:
# 1. convert the 'tempo' column from string to float
# 2. encode categorical columns
# 3. extract features from the audio files
# 4. drop columns: path, sampling_rate (constant), num_characters (highly correlated with num_words), min_pitch, max_pitch

# 1.
# 'age' is not present in the evaluation data, so it is set aside here to let the two
# sets be stacked and preprocessed together.
train_preprocessing = train_data.drop(columns=['age'])
combined_data = pd.concat(
    [train_preprocessing, evaluation_data],
    axis=0,
).reset_index(drop=True)    # https://pandas.pydata.org/docs/reference/api/pandas.concat.html

# Remove the '[' and ']' characters from the 'tempo' strings, then cast the column to float
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html
tempo_text = combined_data['tempo'].str.replace('[', '').str.replace(']', '')
combined_data['tempo'] = tempo_text.astype(float)

# 2.
# Many ethnicities have few samples: every ethnicity that does not exceed 100 samples in
# the development set is merged into a single 'Other' category before one-hot encoding,
# which keeps the number of dummy columns low.
ethnicity_counts = train_data['ethnicity'].value_counts()
is_frequent = ethnicity_counts > 100
frequent_ethnicities = ethnicity_counts[is_frequent].index
combined_data['ethnicity'] = [
    ethnicity if ethnicity in frequent_ethnicities else 'Other'
    for ethnicity in combined_data['ethnicity']
]

# Fix the misspelled 'gender' value
typo_rows = combined_data['gender'] == 'famale'
combined_data.loc[typo_rows, 'gender'] = 'female'

# One-hot encoding: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
columns_to_encode = ['gender', 'ethnicity']
combined_data = pd.get_dummies(combined_data, columns=columns_to_encode)

In [4]:
# 3.

## Define a frequency filter
from scipy.signal import butter, lfilter

# Sampling rate (Hz) shared by the filter design below and by the audio loading
sr = 22050

# Low-pass cutoff in Hz (the -3 dB point); 4500 Hz is chosen to avoid losing too much information.
# NOTE: the cutoff can be given in Hz only because the sampling rate is passed to butter via `fs`.
cutoff_frequency = 4500

# Precompute the coefficients of the order-5 low-pass filter once
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.butter.html#butter
b, a = butter(5, cutoff_frequency, fs=sr)


def lowpass_filter(data, b=b, a=a):
    """Filter `data` with the precomputed low-pass coefficients.

    Parameters
    ----------
    data : array-like signal to filter.
    b, a : numerator / denominator filter coefficients; they default to the
        coefficients computed above with `butter`.

    Returns
    -------
    The filtered signal, as returned by `scipy.signal.lfilter`.
    """
    return lfilter(b, a, data)

In [None]:
## Feature extraction
def extract_audio_features(file_path, sr=sr):
    """Summarise one audio file as time-averaged MFCC and chroma features.

    The file is loaded with librosa at sampling rate `sr`, passed through
    `lowpass_filter`, and described by the mean over all frames of 13 MFCCs
    and of the first 12 chroma rows.

    Parameters
    ----------
    file_path : path of the audio file to load.
    sr : sampling rate handed to `librosa.load` (defaults to the module-level `sr`).

    Returns
    -------
    dict or None
        Keys 'mfcc_1'..'mfcc_13' and 'chroma_1'..'chroma_12'. None is returned
        (after printing a message) when the filtered signal is all zeros or
        when any exception is raised while processing the file.
    """
    # Target analysis window: 16384 samples, i.e. about 0.74 s at the default sr of 22050.
    # A power of 2 is used because it makes the FFT more efficient.
    full_window = 16384

    try:
        # https://librosa.org/doc/latest/core.html#audio-loading: audio as a floating point time series
        signal, sr = librosa.load(file_path, sr=sr)
        signal = lowpass_filter(signal)

        # Nothing to extract from an empty / all-zero signal
        if np.all(signal == 0):
            print(f"Audio vuoto: {file_path}")
            return None

        # Shrink the window when the recording has fewer samples than the target window
        n_fft = min(full_window, len(signal))
        if n_fft < full_window:
            print(f"Audio equal to {n_fft} samples < 16384: {file_path}")

        # Mel Frequency Cepstral Coefficients: array of shape (n_mfcc, frames), averaged over frames
        # https://librosa.org/doc/latest/generated/librosa.feature.mfcc.html#librosa-feature-mfcc
        mfccs_mean = np.mean(
            librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13, n_fft=n_fft),
            axis=1,
        )

        # Chromagram computed with half-window hops, averaged over frames
        # https://librosa.org/doc/latest/generated/librosa.feature.chroma_stft.html#librosa-feature-chroma-stft
        chroma_mean = np.mean(
            librosa.feature.chroma_stft(y=signal, sr=sr, n_fft=n_fft, hop_length=n_fft // 2),
            axis=1,
        )

        # Dictionary with the extracted features: mfcc_1..mfcc_13 followed by chroma_1..chroma_12
        features = {f'mfcc_{k + 1}': mfccs_mean[k] for k in range(13)}
        features.update({f'chroma_{k + 1}': chroma_mean[k] for k in range(12)})
        return features

    except Exception as e:
        # Best effort: report the failing file and let the caller decide what to do with None
        print(f"Error {file_path}: {e}")
        return None

# List to store the extracted audio features: exactly one record per row of combined_data
audio_features = []

for file_path in combined_data['path']:
    features = extract_audio_features(file_path)
    # extract_audio_features returns None for all-zero signals and for files that raise an
    # error. A None inside a list of dicts makes pd.DataFrame fail (it expects mappings),
    # so an empty dict is stored instead: it becomes an all-NaN row and keeps every
    # feature row aligned with the corresponding row of combined_data.
    audio_features.append(features if features is not None else {})

# Convert the list of dictionaries to a DataFrame (missing keys become NaN)
df_audio_features = pd.DataFrame(audio_features)

# Concatenate the extracted audio features to the DataFrame, side by side and by position
combined_data = pd.concat([combined_data.reset_index(drop=True), df_audio_features.reset_index(drop=True)], axis=1)

In [6]:
# 4.
columns_to_drop = ['path', 'sampling_rate', 'num_characters', 'min_pitch', 'max_pitch']
combined_data = combined_data.drop(columns=columns_to_drop)

# Split the data back into train and evaluation.
# The number of training rows is captured once, BEFORE train_data is overwritten, so both
# slices use the same boundary.
n_train = len(train_data)

# combined_data was re-indexed positionally (0..n-1) while train_data is still indexed by
# its ID column. pd.concat(axis=1) aligns on index labels, so 'age' is re-indexed
# positionally as well; otherwise IDs that are not exactly 0..n-1 would add or misalign rows.
age = train_data['age'].reset_index(drop=True)

train_data = pd.concat([combined_data.iloc[:n_train], age], axis=1)
evaluation_data = combined_data.iloc[n_train:].reset_index(drop=True)

## Save the dataframe after feature extraction
train_data.to_csv('train_data_features_extracted_filtered.csv')
evaluation_data.to_csv('evaluation_data_features_extracted_filtered.csv')

In [None]:
# Show every column when a DataFrame is rendered (this changes a global pandas option)
pd.set_option('display.max_columns', None)

# Preview the first rows of the train_data DataFrame, then of the evaluation_data DataFrame
for frame in (train_data, evaluation_data):
    display(frame.head())