In [1]:
import torch
import pyaudio
import wave
import os
import time
from AudioUtil import AudioUtil
from AudioClassifier import AudioClassifier

print(torch.__version__)
model = AudioClassifier()
state_dict = torch.load("data/models/V8_model_state_dict.pth")
model.load_state_dict(state_dict)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)
model.eval()

CHUNK = 1050
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100

2.3.0+cu118
Using device: cuda:0


In [2]:
def record_audio(duration=2):
    frames = []
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    try:
        # Calculate the correct number of iterations to cover the duration
        num_frames = int((RATE / CHUNK) * duration)
        for _ in range(num_frames):
            data = stream.read(CHUNK)
            frames.append(data)
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
    return frames

def save_temp_audio(frames, filename="temp_audio.wav"):
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(pyaudio.PyAudio().get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

def process_audio_file(filename):
    class_names = {0: 'Noise', 1: 'Knock'} 
    aud = AudioUtil.open(filename)
    # os.remove(filename)
    reaud = AudioUtil.resample(aud, RATE)
    rechan = AudioUtil.rechannel(reaud, CHANNELS)
    dur_aud = AudioUtil.pad_trunc(rechan, 2000)
    sgram = AudioUtil.spectro_gram(dur_aud, n_mels=64, n_fft=1024, hop_len=None)
    sgram = sgram.to(device)

    outputs = model(sgram.unsqueeze(0))
    print(outputs)

    # Get the predicted class with the highest score
    _, prediction = torch.max(outputs, 1)
    predicted_classes = [class_names.get(p.item(), p.item()) for p in prediction][0]
    # print(prediction.item())
    print(predicted_classes)
    # if prediction > 0.5:
    #     print("Klopfen erkannt!")

In [3]:
# process_audio_file for every file in train
for file in os.listdir("data/validation"):
    print(f"Processing {file}")
    process_audio_file(f"data/validation/{file}")
# process_audio_file("data/knocks/knock_10.wav")
# process_audio_file("data/noises/noise_72.wav")

Processing knock_10.wav
tensor([[2.5305e-36, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_10_var2.wav
tensor([[0., 1.]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_10_var4.wav
tensor([[4.2039e-45, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_11_var0.wav
tensor([[9.3046e-43, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_11_var1.wav
tensor([[4.6229e-42, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_1_var0.wav
tensor([[2.2210e-36, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_1_var7.wav
tensor([[3.1091e-36, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_2_var5.wav
tensor([[1.1623e-39, 1.0000e+00]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
['Knock']
Processing knock_2_var9.wav
tensor([[5.7691e-42, 1.0000e+00]], device='c

In [18]:
print("Starte Klopfen-Erkennung...")
while True:
    start_time = time.time()
    # frames = record_audio()
    # save_temp_audio(frames)
    # process_audio_file("temp_audio.wav")
    # process_audio_file("noise_1.wav")
    process_audio_file("knock_1.wav")
    print(f"Time taken: {time.time() - start_time}")

Starte Klopfen-Erkennung...
['Knock']
Time taken: 0.006059408187866211
['Knock']
Time taken: 0.007019996643066406
['Knock']
Time taken: 0.005489826202392578
['Knock']
Time taken: 0.006508588790893555
['Knock']
Time taken: 0.0060253143310546875
['Knock']
Time taken: 0.009502649307250977
['Knock']
Time taken: 0.00699925422668457
['Knock']
Time taken: 0.006000995635986328
['Knock']
Time taken: 0.00600123405456543
['Knock']
Time taken: 0.007319927215576172
['Knock']
Time taken: 0.008533000946044922
['Knock']
Time taken: 0.005528450012207031
['Knock']
Time taken: 0.007016181945800781
['Knock']
Time taken: 0.0060007572174072266
['Knock']
Time taken: 0.007001399993896484
['Knock']
Time taken: 0.006018638610839844
['Knock']
Time taken: 0.005507230758666992
['Knock']
Time taken: 0.006014823913574219
['Knock']
Time taken: 0.006000995635986328
['Knock']
Time taken: 0.010013341903686523
['Knock']
Time taken: 0.00800180435180664
['Knock']
Time taken: 0.006023883819580078
['Knock']
Time taken: 0.005

KeyboardInterrupt: 