# Install

In [1]:
!pip install panns-inference

Collecting panns-inference
  Downloading panns_inference-0.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting torchlibrosa (from panns-inference)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Downloading panns_inference-0.1.1-py3-none-any.whl (8.3 kB)
Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: torchlibrosa, panns-inference
Successfully installed panns-inference-0.1.1 torchlibrosa-0.1.0


# Collegamento con il Drive

In [12]:
# Mount Google Drive inside the Colab VM so the demo audio files stored there
# are reachable through the local filesystem under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

# List the demo audio folder as a quick check that the mount worked and the
# expected files are visible.
!ls '/content/drive/My Drive/SonART/Audio_Matteo'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Demo_Ambienti.mp3   Demo_Guerra.mp3   'Demo_Strumenti Musicali.mp3'
 Demo_Animali.mp3    Demo_Meteo.mp3     Demo_Veicoli.mp3
 Demo_Casa.mp3	     Demo_Persone.mp3  'LISTA SAMPLES DEMO'


# Import

In [3]:
import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import panns_inference
from panns_inference import AudioTagging, SoundEventDetection, labels

# Sound event detection with PANNs

In [4]:
def print_top_classes_per_frame(framewise_output):
    """Print, for every frame, the label of the highest-scoring class.

    Args:
        framewise_output: 2-D array-like of per-frame class scores indexed as
            (time_steps, classes_num). Column ``k`` is looked up as the k-th
            entry of the module-level ``labels`` sequence.

    Returns:
        None. One line per frame is written to stdout.
    """
    index_to_label = dict(enumerate(labels))
    # Winning class index for each frame -> shape (time_steps,)
    best_per_frame = np.argmax(framewise_output, axis=1)

    for frame_no in range(len(best_per_frame)):
        name = index_to_label[int(best_per_frame[frame_no])]
        print(f"Frame {frame_no:04d}: {name}")


In [5]:
def print_top_classes_per_frame_with_threshold(framewise_output, threshold=0.3):
    """Print each frame's top class, but only when its score reaches a threshold.

    Args:
        framewise_output: 2-D array-like of per-frame class scores indexed as
            (time_steps, classes_num). Column ``k`` is looked up as the k-th
            entry of the module-level ``labels`` sequence.
        threshold: Minimum score the winning class must reach (inclusive) for
            the frame to be printed.

    Returns:
        None. One line per qualifying frame is written to stdout.
    """
    index_to_label = dict(enumerate(labels))  # class index -> label
    best_class = np.argmax(framewise_output, axis=1)  # (time_steps,)
    best_score = np.max(framewise_output, axis=1)     # (time_steps,)

    for frame_no in range(len(best_class)):
        score = best_score[frame_no]
        # Written as "not >=" (rather than "<") so NaN scores are skipped too.
        if not score >= threshold:
            continue
        name = index_to_label[int(best_class[frame_no])]
        print(f"Frame {frame_no:04d}: {name} (prob={score:.2f})")


In [6]:
def print_sed_predictions_with_repetition_threshold(framewise_output, threshold=0.4, min_repeats=2):
    """Print and collect classes that stay on top for several consecutive frames.

    For each frame the top-scoring class is determined. A class is reported
    (printed and appended to the result) at the first frame where all of the
    following hold:

    * it has been the top class for at least ``min_repeats`` consecutive
      frames, including the current one;
    * its score on the current frame is at least ``threshold``;
    * it differs from the most recently reported class, so one long run is
      reported only once.

    A class can appear in the result more than once if a different class was
    reported in between.

    Args:
        framewise_output: 2-D array-like of per-frame class scores indexed as
            (time_steps, classes_num). Column ``k`` is looked up as the k-th
            entry of the module-level ``labels`` sequence.
        threshold: Minimum score (inclusive) required on the reporting frame.
        min_repeats: Number of consecutive frames a class must win before it
            can be reported. The default of 2 matches the original behavior.

    Returns:
        list[str]: Labels of the reported classes, in order of detection.

    Raises:
        ValueError: If ``min_repeats`` is smaller than 1.
    """
    if min_repeats < 1:
        raise ValueError("min_repeats must be >= 1")

    top_classes = np.argmax(framewise_output, axis=1)  # (time_steps,)
    top_probs = np.max(framewise_output, axis=1)       # (time_steps,)
    ix_to_lb = dict(enumerate(labels))

    tag_list = []
    last_printed_word = None  # most recently reported label
    previous_word = None      # label of the run currently in progress
    run_length = 0            # consecutive frames won by previous_word

    for i, (class_idx, prob) in enumerate(zip(top_classes, top_probs)):
        predicted_class = ix_to_lb[int(class_idx)]

        if predicted_class == previous_word:
            run_length += 1
        else:
            # A different class took over: start counting a new run.
            previous_word = predicted_class
            run_length = 1

        if (
            run_length >= min_repeats
            and predicted_class != last_printed_word
            and prob >= threshold
        ):
            print(f"Frame {i:04d}: {predicted_class} (prob={prob:.3f})")
            tag_list.append(predicted_class)
            last_printed_word = predicted_class

    return tag_list



In [62]:
device = 'cpu'  # 'cuda' | 'cpu'
audio_path = "/content/drive/My Drive/SonART/Audio_Matteo/Demo_Veicoli.mp3"

# Load the clip as a mono waveform at a 32 kHz sampling rate; the returned
# sample rate is discarded because it was requested explicitly.
(audio, _) = librosa.load(audio_path, sr=32000, mono=True)
audio = audio[None, :]  # add a leading batch axis -> (batch_size, segment_samples)

print('------ Sound event detection ------')
# checkpoint_path=None makes the library fall back to its default checkpoint.
sed = SoundEventDetection(
    checkpoint_path=None,
    device=device,
    interpolate_mode='nearest',
)
# framewise_output is indexed as (batch_size, time_steps, classes_num).
# This is a comment rather than a bare string literal so that it is not
# echoed as the cell's output value.
framewise_output = sed.inference(audio)

------ Sound event detection ------
Checkpoint path: /root/panns_data/Cnn14_DecisionLevelMax.pth
Using CPU.


'(batch_size, time_steps, classes_num)'

In [57]:
# Show the winning class for every frame of the first item in the batch.
print_top_classes_per_frame(framewise_output[0])

Frame 0000: Music
Frame 0001: Music
Frame 0002: Music
Frame 0003: Music
Frame 0004: Music
Frame 0005: Music
Frame 0006: Music
Frame 0007: Music
Frame 0008: Music
Frame 0009: Music
Frame 0010: Music
Frame 0011: Music
Frame 0012: Music
Frame 0013: Music
Frame 0014: Music
Frame 0015: Music
Frame 0016: Music
Frame 0017: Music
Frame 0018: Music
Frame 0019: Music
Frame 0020: Music
Frame 0021: Music
Frame 0022: Music
Frame 0023: Music
Frame 0024: Music
Frame 0025: Music
Frame 0026: Music
Frame 0027: Music
Frame 0028: Music
Frame 0029: Music
Frame 0030: Music
Frame 0031: Music
Frame 0032: Music
Frame 0033: Music
Frame 0034: Music
Frame 0035: Music
Frame 0036: Music
Frame 0037: Music
Frame 0038: Music
Frame 0039: Music
Frame 0040: Music
Frame 0041: Music
Frame 0042: Music
Frame 0043: Music
Frame 0044: Music
Frame 0045: Music
Frame 0046: Music
Frame 0047: Music
Frame 0048: Music
Frame 0049: Music
Frame 0050: Music
Frame 0051: Music
Frame 0052: Music
Frame 0053: Music
Frame 0054: Music
Frame 0055

In [58]:
# Same per-frame listing for the first batch item, restricted to frames whose
# top score reaches the function's default threshold.
print_top_classes_per_frame_with_threshold(framewise_output[0])

Frame 0000: Music (prob=0.40)
Frame 0001: Music (prob=0.40)
Frame 0002: Music (prob=0.40)
Frame 0003: Music (prob=0.40)
Frame 0004: Music (prob=0.40)
Frame 0005: Music (prob=0.40)
Frame 0006: Music (prob=0.40)
Frame 0007: Music (prob=0.40)
Frame 0008: Music (prob=0.40)
Frame 0009: Music (prob=0.40)
Frame 0010: Music (prob=0.40)
Frame 0011: Music (prob=0.40)
Frame 0012: Music (prob=0.40)
Frame 0013: Music (prob=0.40)
Frame 0014: Music (prob=0.40)
Frame 0015: Music (prob=0.40)
Frame 0016: Music (prob=0.40)
Frame 0017: Music (prob=0.40)
Frame 0018: Music (prob=0.40)
Frame 0019: Music (prob=0.40)
Frame 0020: Music (prob=0.40)
Frame 0021: Music (prob=0.40)
Frame 0022: Music (prob=0.40)
Frame 0023: Music (prob=0.40)
Frame 0024: Music (prob=0.40)
Frame 0025: Music (prob=0.40)
Frame 0026: Music (prob=0.40)
Frame 0027: Music (prob=0.40)
Frame 0028: Music (prob=0.40)
Frame 0029: Music (prob=0.40)
Frame 0030: Music (prob=0.40)
Frame 0031: Music (prob=0.40)
Frame 0032: Music (prob=0.44)
Frame 0033

In [65]:
# A threshold of 0 lets any non-negative score pass, so only the
# consecutive-frame rule filters the detections. The returned list of labels
# is shown as the cell's output.
print_sed_predictions_with_repetition_threshold(framewise_output[0], 0)

Frame 0001: Rumble (prob=0.004)
Frame 0129: Car (prob=0.030)
Frame 0193: Whoosh, swoosh, swish (prob=0.029)
Frame 0385: Pink noise (prob=0.030)
Frame 0513: Waves, surf (prob=0.158)
Frame 0801: Explosion (prob=0.036)
Frame 0897: Sound effect (prob=0.013)
Frame 0961: Music (prob=0.001)
Frame 0993: Inside, small room (prob=0.001)
Frame 1025: Foghorn (prob=0.045)
Frame 1153: Music (prob=0.018)
Frame 1281: Foghorn (prob=0.042)
Frame 1313: Music (prob=0.044)
Frame 1441: Outside, rural or natural (prob=0.012)
Frame 1633: Vehicle (prob=0.034)
Frame 2433: Animal (prob=0.021)
Frame 2497: Whoosh, swoosh, swish (prob=0.017)
Frame 2561: Vehicle (prob=0.021)
Frame 2625: Outside, rural or natural (prob=0.001)
Frame 2657: Inside, small room (prob=0.001)
Frame 2689: Outside, urban or manmade (prob=0.012)
Frame 2721: Vehicle (prob=0.225)
Frame 4001: Outside, urban or manmade (prob=0.000)
Frame 4065: Train (prob=0.079)
Frame 4257: Vehicle (prob=0.035)
Frame 4385: Train (prob=0.320)
Frame 4769: Vehicle (p

['Rumble',
 'Car',
 'Whoosh, swoosh, swish',
 'Pink noise',
 'Waves, surf',
 'Explosion',
 'Sound effect',
 'Music',
 'Inside, small room',
 'Foghorn',
 'Music',
 'Foghorn',
 'Music',
 'Outside, rural or natural',
 'Vehicle',
 'Animal',
 'Whoosh, swoosh, swish',
 'Vehicle',
 'Outside, rural or natural',
 'Inside, small room',
 'Outside, urban or manmade',
 'Vehicle',
 'Outside, urban or manmade',
 'Train',
 'Vehicle',
 'Train',
 'Vehicle',
 'Silence']