# Utilities for Thesis
### Figure generators and such

# Generic Imports

In [None]:
import os
import math
import numpy as np
import madmom
from scipy.io import wavfile
import matplotlib.pyplot as plt

# Initialization

### Debugger

In [None]:
debug = True


class Debugger:
    """Minimal switchable logger for debug output in this notebook."""

    def __init__(self, debug):
        """Store the on/off flag.

        Args:
            debug: when truthy, `log` prints its arguments; otherwise
                `log` does nothing.
        """
        self.debug = debug

    def log(self, *msg):
        """Print all positional arguments (space-separated) if enabled.

        Args:
            *msg: any number of values to print.
        """
        if self.debug:
            # Unpack the arguments so they are printed like a regular
            # print() call rather than as the repr of the argument tuple.
            print(*msg)


dbg = Debugger(debug)
dbg.log('debugger initialized')

### Global Variables and Configurations

In [None]:
# Folder the generated figures are written to.
FIG_FOLDER = 'utilities_for_thesis_figures'

# Location (relative to the working directory) and name of the input clip.
AUDIO_PATH_RELATIVE = '../../audio_for_thesis'
AUDIO_FILE_NAME = 'zztop_badtothebone_mono_5sec.wav'

# Default figure size (used as `figsize`) and resolution for saved figures.
FIG_SIZE = (9, 6)
DPI = 80

# Fail early if one of the required folders is missing.
for _required_folder in (FIG_FOLDER, AUDIO_PATH_RELATIVE):
    assert os.path.exists(_required_folder), 'Folder doesnt exist: {}'.format(_required_folder)

### Helpers

In [None]:
def get_fig_save_path(fig_name):
    """Return the path of `fig_name` inside the figure folder.

    Args:
        fig_name: file name of the figure (e.g. 'plot.png').

    Returns:
        `fig_name` joined onto the module-level FIG_FOLDER.
    """
    save_path = os.path.join(FIG_FOLDER, fig_name)
    return save_path

### Load Audio

In [None]:
audio_len = 5  # expected clip length in seconds

# Resolve the audio folder to an absolute path, starting from the cwd.
audio_path = os.path.join(os.getcwd(), AUDIO_PATH_RELATIVE)
audio_path_absolute = os.path.abspath(audio_path)
assert os.path.exists(audio_path_absolute), 'Audio file path doesnt exist: {}'.format(audio_path_absolute)

# Full path of the clip itself.
audio_path_full = os.path.join(audio_path_absolute, AUDIO_FILE_NAME)
assert os.path.exists(audio_path_full), 'Audio file doesnt exist: {}'.format(audio_path_full)

# wavfile.read returns the sampling rate and the sample array.
fs, audio_buffer = wavfile.read(audio_path_full)

# The figures below rely on a 44.1 kHz mono clip of exactly `audio_len` seconds.
expected_samples = audio_len * fs
assert fs == 44100, 'Sampling rate should be 44100 Hz'
assert audio_buffer.ndim == 1, 'Audio should be mono'
assert audio_buffer.shape[0] == expected_samples, 'Audio should be exactly %d seconds long' % audio_len

dbg.log(audio_path_full)
dbg.log('audio buffer shape:', audio_buffer.shape, 'sampling rate:', fs)

# Figures

### Time Domain Audio Signal Representation

In [None]:
# Plot the whole clip, scaled by its largest sample value.
plt.plot(audio_buffer / audio_buffer.max())

# One tick every half second; labels show the time in seconds.
audio_ticks = np.arange(0, len(audio_buffer), fs / 2)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as '45' is rejected by current matplotlib releases.
plt.xticks(audio_ticks, ["{:.1f}".format(tick / fs) for tick in audio_ticks], rotation=45)
plt.xlim([0, len(audio_buffer)])
plt.xlabel('Time (Seconds)')
plt.ylabel('Amplitude (Normalized)');

### Frequency Domain Audio Signal Representation

In [None]:
'''
# Original:

frame_size = 2048
start_frame = 2*fs

frame = audio_buffer[start_frame:start_frame+frame_size]
spectrum = np.fft.fft(frame)

abs_spec_size = int(frame_size/2+1)
abs_spec = np.abs(spectrum)[:abs_spec_size]

fft_freqs = np.fft.fftfreq(frame_size)*fs
freq_ticks = np.arange(0, abs_spec_size, 128, dtype=int)
freq_tick_labels = ["{:.1f}Hz".format(np.abs(fft_freqs[idx])) for idx in freq_ticks]
print(fft_freqs[1024])


plt.plot(abs_spec/abs_spec.max())
plt.xticks(freq_ticks, freq_tick_labels, rotation="45");
'''

# Use the whole clip as a single FFT frame (5 s * 44100 Hz = 220500 samples).
frame_size = audio_len * fs

frame = audio_buffer[0:frame_size]
spectrum = np.fft.fft(frame)  # one bin per input sample (220500 bins)

# The spectrum of a real signal is mirrored, so only the first half
# (including the Nyquist bin) is kept: 220500 // 2 + 1 = 110251 bins.
abs_spec_size = frame_size // 2 + 1
abs_spec = np.abs(spectrum)[:abs_spec_size]

# Bin centre frequencies in Hz. fftfreq returns cycles per sample, ordered
# [0 ... 0.5, -0.5 ... -0]; multiplying by the sampling rate gives Hz
# (same as passing d=1/fs). Only the first half is used for the plot.
fft_freqs = np.fft.fftfreq(frame_size) * fs
# Last bin that is plotted; for an even frame size fftfreq reports it with
# a negative sign, which is why the tick labels below take the absolute value.
dbg.log(fft_freqs[abs_spec_size - 1])

# Tick positions: split the plotted range into 7 steps. The step is rounded
# down (110251 // 7 = 15750) so that the last tick still falls inside the
# range, giving 8 ticks instead of 7.
tick_step = abs_spec_size // 7
freq_ticks = np.arange(0, abs_spec_size, tick_step, dtype=int)
# Labels: frequency in Hz at each tick index.
freq_tick_labels = ["{:.1f}".format(np.abs(fft_freqs[idx])) for idx in freq_ticks]

plt.plot(abs_spec / abs_spec.max())
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as "45" is rejected by current matplotlib releases.
plt.xticks(freq_ticks, freq_tick_labels, rotation=45)

plt.xlabel('Frequency (Hz)')
plt.ylabel('Magnitude / Power (Normalized)')

dbg.log('spectrum resolution:', abs_spec.shape)
dbg.log('fft frequencies:', fft_freqs.shape)
dbg.log('indices to be used for freq. values:', freq_ticks)
dbg.log('frequencies at above indices taken from fft frequencies:', freq_tick_labels)

### Framing (frame size, hop size)

In [None]:
fig_save_path = get_fig_save_path('signal_before_framing.png')

# Framing parameters (in samples) and the number of frames to illustrate.
frame_size = 128
hop_size = 96
frame_n = 3

start_sample = int(0.5 * fs)  # start half a second into the clip

# Segment that exactly covers all frames: (frame_n - 1) hops plus one frame.
segment_end = start_sample + (frame_n - 1) * hop_size + frame_size
audio_segment = audio_buffer[start_sample:segment_end]
dbg.log(audio_segment.size)

# Cut the overlapping frames; consecutive frames start hop_size samples apart.
frames = []
for i in range(frame_n):
    start_curr = start_sample + i * hop_size
    f = audio_buffer[start_curr:start_curr + frame_size]
    dbg.log(f.size)
    frames.append(f)

plt.figure(figsize=FIG_SIZE)
plt.plot(audio_segment / audio_segment.max())

audio_ticks = np.arange(0, audio_segment.size, audio_segment.size // 9)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as '45' is rejected by current matplotlib releases.
plt.xticks(audio_ticks, ["{:.0f}".format(tick) for tick in audio_ticks], rotation=45)
plt.xlim([0, audio_segment.size])
plt.xlabel('Sample')
plt.ylabel('Amplitude')

plt.savefig(fig_save_path, dpi=DPI)

# Remember the axis limits so the per-frame plots can reuse the same scale.
x_axis_start, x_axis_end, y_axis_start, y_axis_end = plt.axis()

In [None]:
# One stacked subplot per frame. Size and spacing are set on this figure
# itself; calling plt.figure(...) afterwards would create a new, empty figure
# and leave the plotted (and saved) one unchanged.
fig, ax = plt.subplots(frame_n, 1, figsize=(16, 12))
ax = np.atleast_1d(ax)  # keep indexing valid even for a single subplot
fig.subplots_adjust(hspace=0.25)

segment_peak = audio_segment.max()
audio_ticks = np.arange(0, audio_segment.size, audio_segment.size // 9)
last_idx = frame_n - 1

for i, f in enumerate(frames):
    f_norm = f / segment_peak

    # Place the frame at its position inside the segment; everything outside
    # the frame is NaN so that it is not drawn.
    frame_in_segment = np.full(audio_segment.size, np.nan)
    frame_start = i * hop_size
    frame_in_segment[frame_start:frame_start + f_norm.size] = f_norm

    ax[i].plot(frame_in_segment)
    # Shade the sample range [hop_size, hop_size + frame length) in every subplot.
    ax[i].axvspan(hop_size, hop_size + f.size, color='red', alpha=0.15)

    #ax[i].set_title('Frame {}'.format(i+1))

    ax[i].set_ylabel('Amplitude')

    # Same axis limits as the overview plot of the whole segment.
    ax[i].set_xlim([x_axis_start, x_axis_end])
    ax[i].set_ylim([y_axis_start, y_axis_end])

    if i == last_idx:
        # Only the bottom subplot carries the x label and tick labels.
        ax[i].set_xlabel('Sample')
        ax[i].set_xticks(audio_ticks)
        ax[i].set_xticklabels(["{:.0f}".format(tick) for tick in audio_ticks], rotation=45)
    else:
        ax[i].set_xticks([])

# Save before showing so the written file does not depend on backend
# behaviour after show().
#plt.savefig('frames_split.png')
fig.savefig('frames_split.png', dpi=200)
plt.show()

In [None]:
# First frame, scaled like the overview plot of the whole segment.
f = frames[0]
plt.plot(f / audio_segment.max())

# Tick labels are sample positions within the segment; the first frame
# starts at offset 0.
audio_ticks = np.arange(0, f.size, f.size // 8)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as '45' is rejected by current matplotlib releases.
plt.xticks(audio_ticks, ["{:.0f}".format(tick) for tick in audio_ticks], rotation=45)
plt.xlim([0, f.size])

plt.ylim([y_axis_start, y_axis_end])

plt.xlabel('Sample')
plt.ylabel('Amplitude');

In [None]:
# Second frame, scaled like the overview plot of the whole segment.
f = frames[1]
plt.plot(f / audio_segment.max())

# Shade the full width of this frame.
plt.axvspan(0, f.size, color='red', alpha=0.15)

# Tick labels are sample positions within the segment; the second frame
# starts one hop after the first.
audio_ticks = np.arange(0, f.size, f.size // 8)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as '45' is rejected by current matplotlib releases.
plt.xticks(audio_ticks, ["{:.0f}".format(tick + hop_size) for tick in audio_ticks], rotation=45)
plt.xlim([0, f.size])

plt.ylim([y_axis_start, y_axis_end])

plt.xlabel('Sample')
plt.ylabel('Amplitude');

In [None]:
# Third frame, scaled like the overview plot of the whole segment.
f = frames[2]
plt.plot(f / audio_segment.max())

# Tick labels are sample positions within the segment; the third frame
# starts two hops after the first.
audio_ticks = np.arange(0, f.size, f.size // 8)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as '45' is rejected by current matplotlib releases.
plt.xticks(audio_ticks, ["{:.0f}".format(tick + 2 * hop_size) for tick in audio_ticks], rotation=45)
plt.xlim([0, f.size])

plt.ylim([y_axis_start, y_axis_end])

plt.xlabel('Sample')
plt.ylabel('Amplitude');

### Hann Window

In [None]:
# window function

# Window length in samples and the sample indices 0 .. frame_size - 1.
frame_size = 441
sample_idxs = np.arange(frame_size)

# 0 <= n <= N-1
def hann(n, N):
    """Return the Hann window coefficient for sample `n` of an `N`-sample window.

    Computes 0.5 * (1 - cos(2*pi*n / (N - 1))), which is 0 at both ends of
    the window and 1 at its centre.

    Args:
        n: sample index, expected in 0 <= n <= N-1.
        N: window length in samples.

    Returns:
        The smoothing coefficient as a float.
    """
    if N == 1:
        # A one-sample window would divide by zero in the formula below;
        # return 1.0, the same value numpy.hanning(1) gives.
        return 1.0
    return 0.5 * (1 - math.cos((2 * math.pi * n) / (N - 1)))

# Element-wise version of hann(), evaluated for every sample index of the window.
v_hann = np.vectorize(hann)
smoothing_coefficients = v_hann(sample_idxs, frame_size)

plt.plot(smoothing_coefficients)

x_ticks = np.arange(0, frame_size, 44, dtype=int)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as "45" is rejected by current matplotlib releases.
plt.xticks(x_ticks, rotation=45)
plt.xlim([0, frame_size])

plt.xlabel('Sample')
plt.ylabel('Smoothing Coefficient')

# Coefficients at the start, quarter, centre, three-quarter and end samples.
dbg.log('sample values:', v_hann([0, 110, 220, 330, 440], frame_size))

In [None]:
# audio frame (not smoothed)

# Cut one frame of frame_size samples, starting 0.5 s into the clip.
frame_start = int(0.5 * fs)
frame_end = frame_start + frame_size
frame = audio_buffer[frame_start:frame_end]

# Scale by the largest sample value of the frame.
plt.plot(frame / frame.max())

x_ticks = np.arange(0, frame_size, 44, dtype=int)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as "45" is rejected by current matplotlib releases.
plt.xticks(x_ticks, rotation=45)
plt.xlim([0, frame_size])

plt.xlabel('Sample')
plt.ylabel('Amplitude')

# Remember the axis limits so the smoothed-frame plot can use the same scale.
x_axis_start, x_axis_end, y_axis_start, y_axis_end = plt.axis()

In [None]:
# audio frame (smoothed)

# Hann coefficient for every sample index, multiplied onto the frame.
v_hann = np.vectorize(hann)
smoothing_coefficients = v_hann(sample_idxs, frame_size)
smoothed_frame = smoothing_coefficients * frame

# Scale by the peak of the unsmoothed frame so both plots are comparable.
plt.plot(smoothed_frame / frame.max())

x_ticks = np.arange(0, frame_size, 44, dtype=int)
# `rotation` is documented as a number (or 'vertical'/'horizontal');
# a numeric string such as "45" is rejected by current matplotlib releases.
plt.xticks(x_ticks, rotation=45)
plt.xlim([x_axis_start, x_axis_end])

# Fixed y ticks from -1 to 1 in steps of 0.5; axis limits are taken from the
# unsmoothed-frame plot.
y_ticks = np.arange(-1, 1.5, 0.5)
plt.yticks(y_ticks)
plt.ylim([y_axis_start, y_axis_end])

plt.xlabel('Sample')
plt.ylabel('Amplitude');

### Time/Frequency Domain Audio Signal Representation

### Mel Scale

### Decibel Scale