In [2]:
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
import os
import soundfile
import librosa
import shutil
from PIL import Image
import soundfile as sf
import speech_recognition as sound_rec
from utils import *


# Load data 

In [3]:
# Original (trimmed) recordings: list the folder and keep the .wav entries
audio_dir = 'data/audioMNIST/data_OG_trimmed'
files = os.listdir(audio_dir)
wav_files = list(filter(lambda entry: entry.endswith('.wav'), files))

# Reconstructed recordings: same listing + .wav filter
recon_dir = 'results_mat/reconstructed_results'
recon_files = os.listdir(recon_dir)
recon_wav_files = list(filter(lambda entry: entry.endswith('.wav'), recon_files))


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/audioMNIST/data_OG_trimmed'

# Load audio and create time-domain plots

In [116]:
# Time-domain plots of the original (trimmed) recordings.
# One PNG per .wav file is written to `output_dir`.

# Define paths explicitly so this cell does not depend on whatever value
# `audio_dir` was left with by another cell (later cells reassign it).
audio_dir = 'data/audioMNIST/data_OG_trimmed'
output_dir = "comparison/time_OG/"

# Start from an empty output directory so stale plots never survive a re-run
if os.path.exists(output_dir):
    for file in os.listdir(output_dir):
        file_path = os.path.join(output_dir, file)
        try:
            # Only plain files are removed; sub-directories are left alone
            if os.path.isfile(file_path):
                os.remove(file_path)
        except OSError as e:
            print(f"Error deleting file {file_path}: {e}")
else:
    os.makedirs(output_dir)

# Only .wav entries are audio; anything else in the folder is ignored
all_files = [f for f in os.listdir(audio_dir) if f.endswith('.wav')]

for file in all_files:
    file_path = os.path.join(audio_dir, file)

    # sr=None makes librosa keep the file's native sampling rate, so the
    # file does not have to be read a second time just to discover it
    y, sr = librosa.load(file_path, sr=None)

    fig, ax = plt.subplots()
    ax.plot(y)
    ax.set_title(f"{file} (Original)")
    ax.set_xlabel('Time (samples)')
    ax.set_ylabel('Amplitude')

    # Fixed y-range so every plot is directly comparable
    ax.set_ylim([-0.35, 0.35])

    # Save the plot; the name keeps the full audio file name plus ".png"
    plot_path = os.path.join(output_dir, f"{file}.png")
    fig.savefig(plot_path)
    plt.close(fig)  # Close the figure to free up memory



In [117]:
# Time-domain plots of the reconstructed recordings.
# One PNG per .wav file is written to `output_dir`.
folder_path = 'results_mat/reconstructed_results'
output_dir = "comparison/time_recon/"

# Start from an empty output directory so stale plots never survive a re-run
if os.path.exists(output_dir):
    for file in os.listdir(output_dir):
        file_path = os.path.join(output_dir, file)
        try:
            # Only plain files are removed; sub-directories are left alone
            if os.path.isfile(file_path):
                os.remove(file_path)
        except OSError as e:
            print(f"Error deleting file {file_path}: {e}")
else:
    os.makedirs(output_dir)

# Only .wav entries are audio; anything else in the folder is ignored
all_files = [f for f in os.listdir(folder_path) if f.endswith('.wav')]

for file in all_files:
    file_path = os.path.join(folder_path, file)

    # sr=None makes librosa keep the file's native sampling rate, so the
    # file does not have to be read a second time just to discover it
    y, sr = librosa.load(file_path, sr=None)

    # Explicit sample-index axis
    time_samples = np.arange(len(y))

    # The '.npy' fragment is stripped from the name for the title and the
    # output file name
    display_name = file.replace('.npy', '')

    fig, ax = plt.subplots()
    ax.plot(time_samples, y)
    ax.set_title(f"{display_name} (Reconstructed)")
    ax.set_xlabel('Time (samples)')
    ax.set_ylabel('Amplitude')

    # Fixed y-range so every plot is directly comparable
    ax.set_ylim([-0.35, 0.35])

    plot_path = os.path.join(output_dir, f"{display_name}.png")
    fig.savefig(plot_path)
    plt.close(fig)  # Close the figure to free up memory


# Spectrograms

### Helper functions

In [118]:
def pad_signal(S, max_length):
    """Zero-pad a 1-D signal on both sides so that it has ``max_length`` samples.

    The padding is split as evenly as possible; when the number of missing
    samples is odd, the extra zero goes on the right-hand side.

    Parameters
    ----------
    S : array-like
        1-D signal to pad.
    max_length : int
        Desired length of the returned signal.

    Returns
    -------
    numpy.ndarray
        A padded copy of ``S`` with exactly ``max_length`` samples. A signal
        that already has ``max_length`` samples or more is returned as an
        unpadded copy (it is never truncated).
    """
    # Number of samples still missing; never negative, so a signal that is
    # already long enough is not padded any further.
    dif_sample = max(max_length - len(S), 0)

    # Decide the split from the padding amount itself (not from len(S)),
    # which guarantees left + right == dif_sample for every input length.
    left = dif_sample // 2
    right = dif_sample - left

    return np.pad(S, (left, right), 'constant', constant_values=(0, 0))

def load_data(folder_path):
    """Load every ``.wav`` file found directly inside ``folder_path``.

    The sampling rate is read from the first ``.wav`` file and every file is
    then loaded with ``librosa.load`` at that rate.

    Parameters
    ----------
    folder_path : str
        Directory to scan. Entries that do not end in ``.wav`` are ignored.

    Returns
    -------
    list of (numpy.ndarray, str)
        One ``(samples, file_name)`` tuple per ``.wav`` file, in directory
        listing order. Empty when the folder holds no ``.wav`` files.
    """
    # Filter first so the sample-rate probe below never hits a non-audio
    # entry, and an empty folder yields an empty result instead of an error.
    wav_names = [name for name in os.listdir(folder_path) if name.endswith('.wav')]
    if not wav_names:
        return []

    # Determine the sampling rate from the first audio file
    sr, _ = wavfile.read(os.path.join(folder_path, wav_names[0]))

    audio_data = []
    for name in wav_names:
        y = librosa.load(os.path.join(folder_path, name), sr=sr)[0]
        audio_data.append((y, name))

    return audio_data

def trim_signals(data, max_db):
    """Trim leading and trailing low-energy samples from each signal.

    Parameters
    ----------
    data : iterable of (array-like, str)
        ``(samples, file_name)`` pairs.
    max_db : float
        Passed to ``librosa.effects.trim`` as ``top_db``. Previously this
        argument was ignored and a fixed value of 10 was used; pass 10 to
        reproduce the earlier results.

    Returns
    -------
    list of (numpy.ndarray, str)
        ``(trimmed_samples, file_name)`` pairs. Signals longer than 8000
        samples are skipped and do not appear in the result.
    """
    trimmed_audio = []
    for y, file in data:
        if len(y) > 8000:
            continue  # Skip audio files longer than 8000 samples
        # ref=np.mean: the threshold is taken relative to the mean value
        # rather than librosa's default reference
        y_trimmed, _ = librosa.effects.trim(y, ref=np.mean, top_db=max_db)
        trimmed_audio.append((y_trimmed, file))
    return trimmed_audio

def pad_segment(s, window_size):
    """Zero-pad a short segment on both sides up to ``window_size`` samples.

    The padding is split as evenly as possible; when the number of missing
    samples is odd, the extra zero goes on the right-hand side.

    Parameters
    ----------
    s : array-like
        1-D segment to pad.
    window_size : int
        Desired length of the returned segment.

    Returns
    -------
    numpy.ndarray
        A padded copy of ``s`` with exactly ``window_size`` samples. A
        segment that already has ``window_size`` samples or more is returned
        as an unpadded copy (it is never truncated).
    """
    # Number of samples still missing; clamped at zero so an already
    # long-enough segment is left at its current length.
    dif_sample = max(window_size - len(s), 0)

    # Split based on the padding amount (not on len(s)) so that
    # left + right == dif_sample for every input length.
    left = dif_sample // 2
    right = dif_sample - left

    return np.pad(s, (left, right), 'constant', constant_values=(0, 0))

def stft(x, frame_size=256, overlap=128):
    """Magnitude short-time Fourier transform using a Hamming window.

    Parameters
    ----------
    x : array-like
        1-D input signal.
    frame_size : int, optional
        Number of samples per analysis frame (also the FFT length).
    overlap : int, optional
        Step, in samples, between the starts of consecutive frames. Despite
        the name this is used as the hop size.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(frame_size // 2 + 1, num_frames)`` holding the
        magnitude of the real FFT of each windowed frame. Only frames that
        fit entirely inside ``x`` are computed, so
        ``num_frames = (len(x) - frame_size) // overlap + 1``; for a hop of
        half the frame size this equals ``len(x) // overlap - 1``. A signal
        shorter than one frame yields zero columns.
    """
    freq_bins = frame_size // 2 + 1  # Number of rfft output bins

    # Too short for even a single frame: return an empty spectrogram
    # instead of failing on a negative array dimension.
    if len(x) < frame_size:
        return np.zeros((freq_bins, 0))

    # Count every full frame, including the one that ends exactly at the
    # last sample (an exclusive range bound used to drop it, leaving an
    # all-zero final column).
    num_segments = (len(x) - frame_size) // overlap + 1

    window = np.hamming(frame_size)  # Same window for every frame
    spec = np.zeros((freq_bins, num_segments))
    for t in range(num_segments):
        start = t * overlap
        seg = window * x[start:start + frame_size]
        # Magnitude of the frequency components of this frame
        spec[:, t] = np.abs(np.fft.rfft(seg, n=frame_size))
    return spec

def gen_spectgrams(audio_data, padding_length, n_fft=128):
    """Build dB-scaled magnitude spectrograms for a collection of signals.

    Each entry of ``audio_data`` is indexed as ``entry[0]`` (samples) and
    ``entry[1]`` (label, e.g. the file name). Entries with more than 8000
    samples are skipped. The remaining signals are trimmed with
    ``librosa.effects.trim`` (``ref=np.mean``, ``top_db=10``), padded with
    ``pad_signal`` to ``padding_length`` and transformed with ``stft`` using
    ``frame_size=n_fft`` and a hop of ``n_fft // 2``.

    Returns a list of ``(spectrogram_db, label)`` tuples, where
    ``spectrogram_db = 20 * log10(magnitude + 1e-4)``.
    """
    floor = 1e-4  # offset added before the log so zero magnitudes stay finite
    results = []
    for entry in audio_data:
        samples = entry[0]
        if len(samples) > 8000:
            # Over-long recordings are left out entirely
            continue
        trimmed = librosa.effects.trim(samples, ref=np.mean, top_db=10)[0]
        padded = pad_signal(trimmed, padding_length)
        magnitude = stft(padded, frame_size=n_fft, overlap=n_fft // 2)
        results.append((np.array(20 * np.log10(magnitude + floor)), entry[1]))
    return results

In [119]:

# Spectrogram plots of the original (trimmed) recordings.

# Directory containing audio files
audio_dir = 'data/audioMNIST/data_OG_trimmed/'
files = os.listdir(audio_dir)

# Create (or empty) the output directory for the plots
output_dir = "comparison/spec_OG/"
if os.path.exists(output_dir):
    for f in os.listdir(output_dir):
        stale_path = os.path.join(output_dir, f)
        # Only plain files are removed; os.remove would fail on a directory
        if os.path.isfile(stale_path):
            os.remove(stale_path)
else:
    os.makedirs(output_dir)

# Load audio data
audio_data = load_data(audio_dir)

# Target length (in samples) every signal is padded to
padding_length = 8000

# Sampling rate used for the axes: taken from the first .wav file in the
# listing, mirroring how load_data picks its rate, instead of assuming
# 22050 Hz. NOTE(review): assumes all files share this rate - confirm.
wav_names = [f for f in files if f.endswith('.wav')]
sample_rate = wavfile.read(os.path.join(audio_dir, wav_names[0]))[0] if wav_names else None

# Generate spectrograms
for audio, name in audio_data:
    if len(audio) > padding_length:  # Skip audio files longer than the target length
        continue

    # Pad and compute the STFT
    padded_audio = pad_signal(audio, padding_length)
    spec = stft(padded_audio, frame_size=256, overlap=128)

    # Convert to decibels; epsilon keeps log10 finite for zero magnitudes
    epsilon = 1e-4
    spec_db = np.array(20 * np.log10(spec + epsilon))

    # Axis ranges: duration in seconds and the Nyquist frequency
    duration = len(padded_audio) / sample_rate
    nyquist = sample_rate / 2

    # Create the plot
    fig, ax = plt.subplots(figsize=(10, 4))
    im = ax.imshow(spec_db, aspect='auto', origin='lower',
                   extent=[0, duration, 0, nyquist], cmap='gray', interpolation='none')
    ax.set_xlim([0, duration])
    ax.set_title(f"Spectrogram for {name} (Trimmed Original)")
    ax.set_ylabel("Frequency (Hz)")
    fig.colorbar(im, ax=ax, label="Magnitude (dB)")
    ax.set_xlabel("Time (s)")

    # Save the plot
    plot_path = os.path.join(output_dir, f"{name}.png")
    fig.savefig(plot_path)
    plt.close(fig)  # Close the figure to free up memory


In [120]:
# Spectrogram plots of the reconstructed recordings.
# Requires the helper functions pad_signal, load_data and stft defined earlier
# in the notebook.

# Directory containing audio files
audio_dir = 'results_mat/reconstructed_results/'
files = os.listdir(audio_dir)

# Create (or empty) the output directory for the plots
output_dir = "comparison/spec_recon/"
if os.path.exists(output_dir):
    for f in os.listdir(output_dir):
        stale_path = os.path.join(output_dir, f)
        # Only plain files are removed; os.remove would fail on a directory
        if os.path.isfile(stale_path):
            os.remove(stale_path)
else:
    os.makedirs(output_dir)

# Load audio data
audio_data = load_data(audio_dir)

# Target length (in samples) every signal is padded to
padding_length = 8000

# Sampling rate used for the axes: taken from the first .wav file in the
# listing, mirroring how load_data picks its rate, instead of assuming
# 22050 Hz. NOTE(review): assumes all files share this rate - confirm.
wav_names = [f for f in files if f.endswith('.wav')]
sample_rate = wavfile.read(os.path.join(audio_dir, wav_names[0]))[0] if wav_names else None

# Generate spectrograms
for audio, name in audio_data:
    if len(audio) > padding_length:  # Skip audio files longer than the target length
        continue

    # Pad and compute the STFT
    padded_audio = pad_signal(audio, padding_length)
    spec = stft(padded_audio, frame_size=256, overlap=128)

    # Convert to decibels; epsilon keeps log10 finite for zero magnitudes
    epsilon = 1e-4
    spec_db = np.array(20 * np.log10(spec + epsilon))

    # Axis ranges: duration in seconds and the Nyquist frequency
    duration = len(padded_audio) / sample_rate
    nyquist = sample_rate / 2

    # The '.npy' fragment is stripped from the name for the title and the
    # output file name
    display_name = name.replace('.npy', '')

    # Create the plot ('gray' matches the original-audio spectrogram cell and
    # is the colormap name available in every Matplotlib release)
    fig, ax = plt.subplots(figsize=(10, 4))
    im = ax.imshow(spec_db, aspect='auto', origin='lower',
                   extent=[0, duration, 0, nyquist], cmap='gray', interpolation='none')
    ax.set_xlim([0, duration])
    ax.set_title(f"Spectrogram for {display_name} (Reconstructed)")
    ax.set_ylabel("Frequency (Hz)")
    fig.colorbar(im, ax=ax, label="Magnitude (dB)")
    ax.set_xlabel("Time (s)")

    # Save the plot
    plot_path = os.path.join(output_dir, f"{display_name}.png")
    fig.savefig(plot_path)
    plt.close(fig)  # Close the figure to free up memory


# Comparison plots 

In [121]:
# Time domain: place each original plot next to its reconstructed counterpart

audio_dir = 'comparison/time_OG/'
recon_dir = 'comparison/time_recon'

output_dir_com_time = "comparison/time_compare"

# Create (or empty) the output directory
if os.path.exists(output_dir_com_time):
    for f in os.listdir(output_dir_com_time):
        os.remove(os.path.join(output_dir_com_time, f))
else:
    os.makedirs(output_dir_com_time)

files_com = os.listdir(recon_dir)

for file in files_com:
    og_file = os.path.join(audio_dir, file)
    recon_file = os.path.join(recon_dir, file)

    # A reconstructed plot without a matching original cannot be compared;
    # report it and carry on instead of aborting the whole loop.
    if not os.path.isfile(og_file):
        print(f"Skipping {file}: no matching plot found in {audio_dir}")
        continue

    # Context managers close the underlying image files once pasted
    with Image.open(og_file) as og_img, Image.open(recon_file) as recon_img:
        # Get the size of each image
        (width1, height1) = og_img.size
        (width2, height2) = recon_img.size

        # New canvas: widths added together, height of the taller image
        combined = Image.new('RGB', (width1 + width2, max(height1, height2)))

        # Original on the left, reconstruction on the right
        combined.paste(og_img, (0, 0))
        combined.paste(recon_img, (width1, 0))

    # Save the side-by-side image under the same file name
    combined.save(os.path.join(output_dir_com_time, file))


## Spectrogram comparison

In [122]:
# Spectrograms: place each original plot next to its reconstructed counterpart

audio_dir = 'comparison/spec_OG/'
recon_dir = 'comparison/spec_recon'

output_dir_com_spec = "comparison/spec_compare"

# Create (or empty) the output directory
if os.path.exists(output_dir_com_spec):
    for f in os.listdir(output_dir_com_spec):
        os.remove(os.path.join(output_dir_com_spec, f))
else:
    os.makedirs(output_dir_com_spec)

files_com = os.listdir(recon_dir)

for file in files_com:
    og_file = os.path.join(audio_dir, file)
    recon_file = os.path.join(recon_dir, file)

    # A reconstructed plot without a matching original cannot be compared;
    # report it and carry on instead of aborting the whole loop.
    if not os.path.isfile(og_file):
        print(f"Skipping {file}: no matching plot found in {audio_dir}")
        continue

    # Context managers close the underlying image files once pasted
    with Image.open(og_file) as og_img, Image.open(recon_file) as recon_img:
        # Get the size of each image
        (width1, height1) = og_img.size
        (width2, height2) = recon_img.size

        # New canvas: widths added together, height of the taller image
        combined = Image.new('RGB', (width1 + width2, max(height1, height2)))

        # Original on the left, reconstruction on the right
        combined.paste(og_img, (0, 0))
        combined.paste(recon_img, (width1, 0))

    # Save the side-by-side image under the same file name
    combined.save(os.path.join(output_dir_com_spec, file))


# Audio accuracy metric

In [123]:
# Run speech recognition on every reconstructed .wav file and print the result

# Folder containing the audio files
audio_folder = "results_mat/reconstructed_results/"

# Initialize the recognizer
recognizer = sound_rec.Recognizer()

# Scratch file that each recording is re-written to before recognition
temp_wav_path = "temp_audio.wav"

try:
    for filename in os.listdir(audio_folder):
        # Only .wav files are processed
        if not filename.endswith(".wav"):
            continue

        file_path = os.path.join(audio_folder, filename)

        # Load the audio data
        audio_data, sample_rate = sf.read(file_path)

        # Re-write it through soundfile to the scratch file
        # NOTE(review): presumably this normalises the encoding for
        # AudioFile - confirm whether the round trip is still needed
        sf.write(temp_wav_path, audio_data, sample_rate)

        # Use the scratch file for recognition
        with sound_rec.AudioFile(temp_wav_path) as source:
            audio = recognizer.record(source)

        # Speech recognition via Google's Web Speech API (needs network access)
        try:
            transcription = recognizer.recognize_google(audio)
            print(f"Transcription for {filename}: {transcription}")
        except sound_rec.UnknownValueError:
            print(f"Could not understand the audio in {filename}.")
        except sound_rec.RequestError as e:
            print(f"API request error for {filename}: {e}")
finally:
    # Do not leave the scratch file behind in the working directory,
    # even when a file fails to load
    if os.path.exists(temp_wav_path):
        os.remove(temp_wav_path)

Could not understand the audio in 4_yweweler_34.npy.wav.
Could not understand the audio in 5_nicolas_6.npy.wav.
Could not understand the audio in 3_lucas_20.npy.wav.
Could not understand the audio in 7_nicolas_15.npy.wav.
Could not understand the audio in 8_yweweler_7.npy.wav.
Could not understand the audio in 0_lucas_3.npy.wav.
Could not understand the audio in 2_lucas_47.npy.wav.
