In [14]:
import librosa
import os
import pickle
import numpy as np
from scipy.stats import kurtosis,skew,mode
from pydub import AudioSegment
import time
import torchaudio
import torch

In [5]:
def get_file_paths_by_subfolder(root_path):
    """Map each immediate subfolder of ``root_path`` to the files two levels below it.

    The layout walked is ``root_path/<subfolder>/<subsubfolder>/<file>``.
    Files sitting directly in ``root_path`` or directly in a subfolder are
    ignored, as are directories nested deeper than the sub-subfolder level.

    Directory listings are sorted (plain lexicographic order) because
    ``os.listdir`` returns entries in arbitrary order; without sorting, the
    order of the returned paths -- and therefore of anything concatenated from
    them -- could differ between runs or machines.

    Parameters
    ----------
    root_path : str
        Directory whose immediate subfolders become the dictionary keys.

    Returns
    -------
    dict[str, list[str]]
        Subfolder name -> list of file paths (each built by joining
        ``root_path`` with the intermediate folder names). A subfolder with
        no qualifying files maps to an empty list.
    """
    result = {}

    for subfolder in sorted(os.listdir(root_path)):
        subfolder_path = os.path.join(root_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        file_paths = []
        for subsubfolder in sorted(os.listdir(subfolder_path)):
            subsubfolder_path = os.path.join(subfolder_path, subsubfolder)
            if not os.path.isdir(subsubfolder_path):
                continue

            for file_name in sorted(os.listdir(subsubfolder_path)):
                file_path = os.path.join(subsubfolder_path, file_name)
                # Keep regular files only; deeper directories are skipped.
                if os.path.isfile(file_path):
                    file_paths.append(file_path)

        result[subfolder] = file_paths

    return result

def get_audio_paths(root_path):
    """Index the regular files directly inside ``root_path`` by extension-less name.

    Directories are skipped. Only the final extension is stripped, so
    ``"a.b.wav"`` is keyed as ``"a.b"``. If two files share the same stem, the
    one listed later by ``os.listdir`` overwrites the earlier entry.

    Returns a dict mapping each stem to ``os.path.join(root_path, <file name>)``.
    """
    stem_to_path = {}

    for entry in os.listdir(root_path):
        candidate = os.path.join(root_path, entry)

        # Anything that is not a regular file (e.g. a directory) is ignored.
        if not os.path.isfile(candidate):
            continue

        stem = os.path.splitext(os.path.basename(entry))[0]
        stem_to_path[stem] = candidate

    return stem_to_path

def combine_wav_files(file_dict, output_path="../../../data/cleaned_combined"):
    """Concatenate each key's WAV files and export one combined WAV per key.

    Parameters
    ----------
    file_dict : dict
        Maps a key to an iterable of WAV file paths. The files are appended
        in the order given.
    output_path : str
        Directory that receives ``<key>.wav`` for every key. It is created
        (including parents) if it does not exist yet, because writing the
        exported file would otherwise fail on a missing directory.

    Returns
    -------
    None
        The function is used for its side effect of writing files; an
        existing ``<key>.wav`` is written again on every call.
    """
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for key, file_paths in file_dict.items():
        combined_audio = AudioSegment.empty()

        for file_path in file_paths:
            combined_audio += AudioSegment.from_wav(file_path)

        combined_audio.export(f"{output_path}/{key}.wav", format="wav")

In [6]:
def save_audio_segment(segment, sample_rate, output_file_path):
    """Write a mono sample array to ``output_file_path`` as a WAV file.

    Parameters
    ----------
    segment : numpy.ndarray
        One-channel samples. Integer arrays are written as they are, using
        their item size as the sample width. Floating-point arrays are
        treated as normalised audio in ``[-1.0, 1.0]`` (the range produced by
        ``segment_audio``) and converted to 16-bit PCM first; handing float
        bytes to ``AudioSegment`` unchanged would have them read as integer
        PCM and produce noise.
    sample_rate : int
        Frame rate stored in the WAV file.
    output_file_path : str
        Destination path of the exported file.
    """
    segment = np.asarray(segment)

    if np.issubdtype(segment.dtype, np.floating):
        # NaNs become silence and out-of-range values are clipped so the
        # integer conversion cannot wrap around.
        normalised = np.clip(np.nan_to_num(segment, nan=0.0), -1.0, 1.0)
        segment = np.round(normalised * 32767).astype(np.int16)

    audio_segment = AudioSegment(
        segment.tobytes(),
        frame_rate=sample_rate,
        sample_width=segment.dtype.itemsize,
        channels=1,  # Mono channel
    )

    audio_segment.export(output_file_path, format="wav")

# def segment_audio(audio_file_path):
#     # Load the audio file
#     audio = AudioSegment.from_file(audio_file_path)
    
#     # Set duration for each segment in milliseconds and get the sample rate
#     segment_duration_ms = 4 * 1000  # 4 seconds in milliseconds
#     sample_rate = audio.frame_rate
#     samples_per_segment = segment_duration_ms * sample_rate // 1000  # Calculate samples per 4 seconds

#     # Convert the entire audio to a numpy array
#     audio_array = np.array(audio.get_array_of_samples())

#     # Calculate the number of 4-second segments and trim any extra samples
#     num_segments = len(audio_array) // samples_per_segment
#     trimmed_audio_array = audio_array[:num_segments * samples_per_segment]

#     # Reshape the array into 4-second segments
#     segments = trimmed_audio_array.reshape(num_segments, samples_per_segment)
    
#     return segments, sample_rate  # Return both segments and the sample rate

def segment_audio(audio_file_path, segment_duration_s=1):
    """Load an audio file, peak-normalise it and cut it into equal-length segments.

    Parameters
    ----------
    audio_file_path : str
        File to load with ``AudioSegment.from_file``.
    segment_duration_s : int or float, optional
        Length of each segment in seconds. Defaults to 1, the value this
        function previously had hard-coded.

    Returns
    -------
    segments : numpy.ndarray
        ``float32`` array of shape ``(num_segments, samples_per_segment)``.
        Samples that do not fill a whole final segment are dropped.
    sample_rate : int
        Frame rate reported by the loaded audio.

    Raises
    ------
    ValueError
        If ``segment_duration_s`` is too small to hold a single sample.

    Notes
    -----
    Normalisation divides by the peak of the whole recording, taken before
    the trailing samples are trimmed. Silent or empty recordings are left
    un-normalised rather than being divided by zero.
    NOTE(review): the sample array is used as one flat sequence, which
    presumably assumes mono input -- confirm for multi-channel files.
    """
    audio = AudioSegment.from_file(audio_file_path)

    sample_rate = audio.frame_rate
    segment_duration_ms = int(segment_duration_s * 1000)
    samples_per_segment = segment_duration_ms * sample_rate // 1000
    if samples_per_segment <= 0:
        raise ValueError(
            f"segment_duration_s={segment_duration_s!r} yields no samples at {sample_rate} Hz"
        )

    # Work in float32 so the samples can be scaled to [-1.0, 1.0].
    audio_array = np.array(audio.get_array_of_samples()).astype(np.float32)

    # Guard the division: an all-zero signal has no peak to scale by, and an
    # empty one has no maximum at all.
    peak = np.max(np.abs(audio_array)) if audio_array.size else 0.0
    if peak > 0:
        audio_array /= peak

    # Keep only whole segments, then view them as rows.
    num_segments = len(audio_array) // samples_per_segment
    trimmed_audio_array = audio_array[:num_segments * samples_per_segment]
    segments = trimmed_audio_array.reshape(num_segments, samples_per_segment)

    return segments, sample_rate

def segment_audio_wrapper(file_dict):
    """Run ``segment_audio`` on every path in ``file_dict``.

    ``file_dict`` maps a key to an audio file path. The result maps the same
    keys to the segment array returned by ``segment_audio``; the sample rate
    that function also returns is discarded.
    """
    return {name: segment_audio(path)[0] for name, path in file_dict.items()}

In [None]:
def extract_features(audio_segments, sample_rate=16000, sample_limit=600):
    """Compute MFCC statistics and pitch statistics for each audio segment.

    Parameters
    ----------
    audio_segments : sequence of numpy.ndarray
        Audio segments; only the first ``sample_limit`` are processed.
    sample_rate : int, optional
        Sample rate of the segments. It is passed to both the MFCC and the
        pitch estimator. Previously the pitch estimator was called without
        it and therefore fell back to librosa's own default rate (22050 Hz
        according to the librosa documentation), which does not match the
        16 kHz default used here.
    sample_limit : int, optional
        Maximum number of segments to process (default 600, the value that
        used to be hard-coded).

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per processed segment: mean, median, standard
        deviation, skewness, kurtosis, maximum and minimum of each of the 20
        MFCCs (7 x 20 values), followed by the mean and median pitch
        (142 values in total). Both pitch values are 0 when no pitch was
        detected anywhere in the segment.

    Notes
    -----
    A line such as ``"10 percent"`` is printed each time another tenth of
    the segments being processed has been reached.
    """
    segments = audio_segments[:sample_limit]
    total = len(segments)

    # Segment index -> percentage to announce. Integer arithmetic replaces
    # the float equality test (index == sample_limit * .1), which only
    # matches when the product happens to be exactly representable.
    milestones = {}
    for tenth in range(1, 10):
        mark = total * tenth // 10
        if mark > 0:
            milestones.setdefault(mark, tenth * 10)

    feature_list = []
    if total == 0:
        return feature_list

    # The pitch search range does not depend on the segment.
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C7')

    for index, segment in enumerate(segments):
        mfcc_data = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=20)

        # Summary statistics over axis 1, one value per MFCC coefficient.
        mean_mfcc = np.mean(mfcc_data, axis=1)
        median_mfcc = np.median(mfcc_data, axis=1)
        std_mfcc = np.std(mfcc_data, axis=1)
        skew_mfcc = skew(mfcc_data, axis=1)
        kurt_mfcc = kurtosis(mfcc_data, axis=1)
        maximum_mfcc = np.amax(mfcc_data, axis=1)
        minimum_mfcc = np.amin(mfcc_data, axis=1)

        # Pitch extraction at the same sample rate as the MFCCs.
        f0, _voiced_flag, _voiced_probs = librosa.pyin(
            segment, fmin=fmin, fmax=fmax, sr=sample_rate
        )

        # NaN entries in f0 are ignored; if every entry is NaN, fall back to 0.
        if np.all(np.isnan(f0)):
            mean_pitch, median_pitch = 0, 0
        else:
            mean_pitch = np.nanmean(f0)
            median_pitch = np.nanmedian(f0)

        segment_features = np.concatenate((
            mean_mfcc, median_mfcc, std_mfcc, skew_mfcc, kurt_mfcc, maximum_mfcc, minimum_mfcc,
            [mean_pitch, median_pitch]  # Add pitch statistics to the feature list
        ))
        feature_list.append(segment_features)

        if index in milestones:
            print(f"{milestones[index]} percent")

    return feature_list

def extract_features_wrapper(segment_dict, save_to):
    """Extract features for every entry of ``segment_dict`` and pickle the results.

    For each key, ``extract_features`` is called on that key's segments and
    the result is pickled on its own to
    ``../../../data/extracted_features_v2/seperate/features_20_<key>.pickle``.
    Once all keys are done, the complete key -> features dictionary is
    pickled to ``../../../data/extracted_features_v2/<save_to>.pickle``.
    Both paths are relative to the current working directory. Nothing is
    returned.
    """
    features_by_key = {}

    for key, segments in segment_dict.items():
        key_features = extract_features(segments)
        features_by_key[key] = key_features

        # Per-key file, written as soon as that key has been processed.
        per_key_path = f"../../../data/extracted_features_v2/seperate/features_20_{key}.pickle"
        with open(per_key_path, "wb") as handle:
            pickle.dump(key_features, handle)

    combined_path = f"../../../data/extracted_features_v2/{save_to}.pickle"
    with open(combined_path, "wb") as handle:
        pickle.dump(features_by_key, handle)


In [23]:
def extract_format_frequencies(audio_segments, sample_rate=16000):
    """Compute four averaged spectral descriptors for each audio segment.

    At most the first 300 segments are processed. For each one the spectral
    centroid, bandwidth, flatness and roll-off are computed with librosa and
    each is reduced to a single number with ``np.mean``.

    Returns a list with one 4-element array per processed segment, ordered
    ``[centroid, bandwidth, flatness, rolloff]``. Progress lines
    (``"10 percent"`` ... ``"90 percent"``) and the summed extraction time
    are printed along the way.
    """
    def _as_scalar(value):
        # Unwrap NumPy arrays to a plain scalar; anything else passes through.
        return value.item() if isinstance(value, np.ndarray) else value

    sample_limit = 300

    # Segment index -> progress message; the first label wins if two
    # fractions truncate to the same index.
    progress_messages = {}
    for fraction, message in (
        (.1, "10 percent"), (.2, "20 percent"), (.3, "30 percent"),
        (.4, "40 percent"), (.5, "50 percent"), (.6, "60 percent"),
        (.7, "70 percent"), (.8, "80 percent"), (.9, "90 percent"),
    ):
        progress_messages.setdefault(int(sample_limit * fraction), message)

    elapsed_times = []
    per_segment_features = []

    for position, clip in enumerate(audio_segments[0:sample_limit]):
        started_at = time.time()
        centroid = np.mean(librosa.feature.spectral_centroid(y=clip, sr=sample_rate))
        bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=clip, sr=sample_rate))
        flatness = np.mean(librosa.feature.spectral_flatness(y=clip))
        rolloff = np.mean(librosa.feature.spectral_rolloff(y=clip, sr=sample_rate))
        elapsed = time.time() - started_at

        descriptors = (centroid, bandwidth, flatness, rolloff)
        per_segment_features.append(np.array([_as_scalar(value) for value in descriptors]))

        message = progress_messages.get(position)
        if message is not None:
            print(message)

        elapsed_times.append(elapsed)

    print("Format extraction time:", sum(elapsed_times))
    return per_segment_features

def extract_format_frequencies_wrapper(segment_dict):
    """Extract spectral descriptors for every key and pickle the results.

    For each key, ``extract_format_frequencies`` is called on that key's
    segments and the result is pickled on its own to
    ``../../../data/extracted_features_v2/seperate/format_frequencies_<key>.pickle``.
    Once all keys are done, the complete key -> features dictionary is
    pickled to ``../../../data/extracted_features_v2/format_frequencies.pickle``.
    Both paths are relative to the current working directory. Nothing is
    returned.

    The per-key files used to be written as ``features_20_<key>.pickle``,
    the exact path ``extract_features_wrapper`` writes its per-key MFCC
    features to, so running both wrappers silently replaced one set of files
    with the other. They now carry their own prefix.
    """
    feature_dict = {}

    for key, segments in segment_dict.items():
        extracted_features = extract_format_frequencies(segments)
        feature_dict[key] = extracted_features

        with open(f"../../../data/extracted_features_v2/seperate/format_frequencies_{key}.pickle", "wb") as file:
            pickle.dump(extracted_features, file)

    with open("../../../data/extracted_features_v2/format_frequencies.pickle", "wb") as file:
        pickle.dump(feature_dict, file)

In [16]:
def extract_lfcc_features(audio_segments, sample_rate=16000):
    """Compute cepstral-coefficient statistics from a linear-frequency spectrogram.

    At most the first 300 segments are processed. For each one the STFT
    magnitude is taken, compressed with ``log1p`` and handed to
    ``librosa.feature.mfcc`` through its ``S`` argument with ``n_mfcc=20``.
    The mean, median, standard deviation, skewness, kurtosis, maximum and
    minimum of the result are then taken along axis 1 and concatenated.

    Returns a list with one concatenated statistics vector per processed
    segment. Progress lines (``"10 percent"`` ... ``"90 percent"``) are
    printed when the segment index equals the matching fraction of 300.
    """
    sample_limit = 300

    # Progress thresholds are kept as the raw products so that matching
    # against the integer index behaves exactly like an equality test.
    progress_messages = {
        sample_limit * fraction: message
        for fraction, message in (
            (.1, "10 percent"), (.2, "20 percent"), (.3, "30 percent"),
            (.4, "40 percent"), (.5, "50 percent"), (.6, "60 percent"),
            (.7, "70 percent"), (.8, "80 percent"), (.9, "90 percent"),
        )
    }

    # Reductions applied along axis 1, in the order they appear in the output.
    reducers = (np.mean, np.median, np.std, skew, kurtosis, np.amax, np.amin)

    transform_durations = []
    statistics_durations = []
    collected = []

    for position, clip in enumerate(audio_segments[0:sample_limit]):
        transform_started = time.time()

        # Linear-frequency spectrogram -> magnitude -> log compression.
        spectrum = librosa.stft(clip)
        magnitude, _phase = librosa.magphase(spectrum)
        compressed = np.log1p(magnitude)

        # Passing S= feeds the prepared spectrogram to the coefficient step.
        coefficients = librosa.feature.mfcc(S=compressed, sr=sample_rate, n_mfcc=20)

        transform_durations.append(time.time() - transform_started)

        statistics_started = time.time()
        statistics = [reduce(coefficients, axis=1) for reduce in reducers]
        statistics_durations.append(time.time() - statistics_started)

        collected.append(np.concatenate(tuple(statistics)))

        # Progress tracking
        message = progress_messages.get(position)
        if message is not None:
            print(message)

    return collected

def extract_lfcc_features_wrapper(segment_dict):
    """Run ``extract_lfcc_features`` for every key and pickle the combined result.

    The key -> features dictionary is written to
    ``../../../data/extracted_features_v2/lfcc_features.pickle`` (relative to
    the current working directory). No per-key files are written and nothing
    is returned.
    """
    features_by_key = {
        key: extract_lfcc_features(segments)
        for key, segments in segment_dict.items()
    }

    destination = f"../../../data/extracted_features_v2/lfcc_features.pickle"
    with open(destination, "wb") as handle:
        pickle.dump(features_by_key, handle)

In [8]:
# Collect, per top-level subfolder of the cleaned-data directory, the files found inside its sub-subfolders.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that directory exists.
file_dictionary = get_file_paths_by_subfolder(r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned")

In [None]:
# DO NOT RUN AGAIN
# Concatenates every subfolder's WAV files and exports one combined WAV per key to
# combine_wav_files' default output directory; a re-run writes those files again.
combine_wav_files(file_dictionary)

In [8]:
# Index the combined recordings by file name without extension.
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs where that directory exists.
audio_paths = get_audio_paths(r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined")

# Show the keys, then the full path stored under each key.
print(audio_paths.keys())
for key in audio_paths.keys():
    print(audio_paths[key])

dict_keys(['1069', '19', '201', '250', '254', '26', '27', '289', '298', '311', '32', '3240', '39', '40', '4297', '60', '78', '7800', '83', '87'])
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\1069.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\19.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\201.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\250.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\254.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\26.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\27.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\289.wav
C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\cleaned_combined\298.wav
C:\Computer Science Programs\Fall_2024\EE502_B

In [9]:
# Load every combined recording and split it into fixed-length segments (key -> segment array).
# The sample rate returned by segment_audio is dropped by the wrapper, so the
# extractors below rely on their own sample_rate default.
segment_dict = segment_audio_wrapper(audio_paths)

print(segment_dict.keys())

dict_keys(['1069', '19', '201', '250', '254', '26', '27', '289', '298', '311', '32', '3240', '39', '40', '4297', '60', '78', '7800', '83', '87'])


In [None]:
# Extract MFCC + pitch statistics per key and pickle them (per key and combined).
# NOTE(review): the file name says "no_pitch", but extract_features as defined in this
# notebook appends mean/median pitch to every feature vector -- confirm which is intended.
extract_features_wrapper(segment_dict, "mfcc_20_no_pitch_10min")

In [None]:
# Extract the averaged spectral descriptors per key and pickle them.
extract_format_frequencies_wrapper(segment_dict)

In [17]:
# Extract the LFCC statistics per key and pickle the combined dictionary.
# Each key prints its own "10 percent" ... "90 percent" progress lines.
extract_lfcc_features_wrapper(segment_dict)

10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
90 percent
10 percent

In [18]:
# Extract features for key '26' only and pickle them to a dedicated file.
# NOTE(review): `extraced_features` is a misspelling of "extracted"; the name is left
# unchanged here in case other cells refer to it.
extraced_features = extract_features(segment_dict['26'])

with open(f"../../../data/extracted_features_v2/speaker_26.pickle", "wb") as file:
    pickle.dump(extraced_features, file)

10 percent
20 percent
30 percent
40 percent
50 percent
60 percent
70 percent
80 percent
80 percent
