In [1]:
import os
import tempfile
import logging
import json
from typing import Dict, Any, Optional, Union
from datetime import datetime

import torch
import numpy as np
import soundfile as sf
import noisereduce as nr
from scipy import signal
import whisper

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the audio clip to process. The default is the original
# machine-specific path; set the VOICE_ASSIST_AUDIO_PATH environment
# variable to run the notebook against a file elsewhere without editing it.
audio_path = os.environ.get(
    "VOICE_ASSIST_AUDIO_PATH",
    "/home/tlr4fe/git/voice_assist/data/test_audios/poor-audio.ogg",
)

In [3]:
# Decode the clip: soundfile returns the sample data together with the
# file's sample rate, which the filtering and denoising cells reuse.
audio, sr = sf.read(file=audio_path)

In [4]:
# Down-mix to mono: when the array has more than one dimension, average
# along axis 1; a 1-D signal is passed through unchanged.
audio = audio.mean(axis=1) if audio.ndim > 1 else audio

In [5]:
audio

array([ 1.76884209e-10,  1.18608362e-09, -2.15611312e-09, ...,
       -3.70647176e-04, -2.64654635e-04, -2.06871358e-04],
      shape=(4632192,))

In [6]:
# High-pass filter: design a 10th-order Butterworth filter in
# second-order-section form (cutoff given in the same units as `sr`),
# then run it over the signal in a single forward pass.
cutoff = 100
sos = signal.butter(N=10, Wn=cutoff, btype='hp', fs=sr, output='sos')
audio = signal.sosfilt(sos=sos, x=audio)

In [7]:
audio

array([ 1.69009734e-10,  1.11788904e-09, -2.17803525e-09, ...,
        7.24238202e-03,  7.91309061e-03,  8.52347647e-03],
      shape=(4632192,))

In [8]:
# Denoise: hand the current signal and its sample rate to noisereduce,
# then echo the result so the cell displays it.
audio = nr.reduce_noise(audio, sr=sr)
audio

memmap([-2.05699430e-12, -4.34362443e-12,  8.34037090e-12, ...,
        -3.71256276e-04, -7.64117077e-04,  1.90784599e-04],
       shape=(4632192,))

In [9]:
# Peak-normalize: divide by the largest absolute sample value. The 1e-8
# offset keeps the divisor non-zero even if every sample is zero.
# The division is out-of-place, so the input array itself is not modified.
peak = np.abs(audio).max() + 1e-8
audio = audio / peak
audio

array([-3.99177942e-12, -8.42918746e-12,  1.61852275e-11, ...,
       -7.20455647e-04, -1.48283678e-03,  3.70234391e-04],
      shape=(4632192,))