In [1]:
import torch 
from src.models import Join_fusion
from torchinfo import summary
import pandas as pd
import numpy as np
import librosa
import sys
sys.path.append("./src")
import utils

To load the checkpoint correctly, the model must have the same architecture and parameter shapes as the model stored in the checkpoint; otherwise the model weights will not be loaded.

In [2]:
# Model hyper-parameters. The model built from them has to match the layout of
# the checkpoint loaded below, otherwise load_state_dict() rejects the weights.
model_config = {
                "inputdim" : 64,
                "outputdim" : 2
                }

model = Join_fusion(model_config, model_config['inputdim'], model_config["outputdim"])

# Alternative checkpoint from an earlier run, kept for quick switching.
#ckp_path = "/data/dilleswari/target_with_ConvNeXt_Encoder/Target-sound-event-detection/bash/experiments/Join_fusion/2025-07-15_11-06-07_b7a29f2a613d11f0bdebac1f6ba1010e/run_model_24_loss=-0.4270.pt"
ckp_path = "/data/dilleswari/target_with_ConvNeXt_Encoder/Target-sound-event-detection/bash/experiments/Join_fusion/2025-07-21_11-01-07_f99dc35e65f311f095f5ac1f6ba1010e/run_model_8_loss=-0.2439.pt"
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch supports it).
ckp = torch.load(ckp_path, map_location='cpu')

try:
    model.load_state_dict(ckp)
    print("Successfully loaded model weights")
except Exception as err:
    # Still best-effort (the notebook keeps running), but report *why* the load
    # failed instead of hiding it; Ctrl-C / SystemExit are no longer swallowed.
    # After a failure the model keeps whatever weights its constructor gave it.
    print("Unsuccessful in loading model weights")
    print(f"Reason: {type(err).__name__}: {err}")

# Switch to inference mode; as the last expression this also displays the model.
model.eval()
# Optional layer summary; the two input sizes are (mixture, reference):
#summary(model, input_size= [(1,501,64), (1,1001,104)], col_names = ["input_size","output_size","num_params"], device='cpu')

Successfully loaded model weights


Join_fusion(
  (detection): CDur_fusion(
    (gru): GRU(768, 768, batch_first=True, bidirectional=True)
    (fusion): Fusion(
      (fuse_layer1): conv1d(
        (conv): Conv1d(768, 3072, kernel_size=(1,), stride=(1,))
        (act): ReLU()
      )
      (fuse_layer2): conv1d(
        (conv): Conv1d(768, 3072, kernel_size=(1,), stride=(1,))
        (act): ReLU()
      )
      (avg_pool): AvgPool1d(kernel_size=(4,), stride=(4,), padding=(0,))
    )
    (fc): Linear(in_features=1536, out_features=1536, bias=True)
    (outputlayer): Linear(in_features=1536, out_features=2, bias=True)
  )
  (AudioEncoder): ConvNeXt(
    (spectrogram_extractor): Spectrogram(
      (stft): STFT(
        (conv_real): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
        (conv_imag): Conv1d(1, 513, kernel_size=(1024,), stride=(320,), bias=False)
      )
    )
    (logmel_extractor): LogmelFilterBank()
    (spec_augmenter): SpecAugmentation(
      (time_dropper): DropStripes()
      (freq_drop

In [3]:
import pandas as pd
# Tab-separated annotation list for the test split; the displayed columns are
# filename, onset, offset and event_label (plus a saved index column).
tsv_file = "/data/dilleswari/target_with_ConvNeXt_Encoder/Target-sound-event-detection/data/flists/urban_sed_test_strong_modified.tsv"
df = pd.read_csv(tsv_file, delimiter='\t')
df

Unnamed: 0,filename,onset,offset,event_label
0,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,4.516348,7.253393,jackhammer
1,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,5.106336,7.081667,engine_idling
2,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,5.813735,7.963874,engine_idling
3,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,6.353976,7.430690,dog_bark
4,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,8.037359,9.902325,drilling
...,...,...,...,...
9951,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,7.794474,10.000000,gun_shot
9952,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,0.016581,3.391536,dog_bark
9953,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,1.222317,2.517615,car_horn
9954,/data/dilleswari/URBAN_SED/audio2/test/soundsc...,5.517929,6.631534,gun_shot


In [4]:
# One dict per annotated event, in the same row order as `df`. The absolute
# test-audio directory is removed from each path before it is stored under 'file'.
reference_list = [
    {
        'event_label': event_label,
        'event_onset': onset,
        'event_offset': offset,
        'file': filename.replace("/data/dilleswari/URBAN_SED/audio2/test/",""),
    }
    for filename, onset, offset, event_label in zip(
        df['filename'], df['onset'], df['offset'], df['event_label']
    )
]

In [5]:
# Parse the hard (thresholded) predictions written by the experiment run into a
# list of event dicts with the same keys as the reference entries.
estimated_list = []

file_path = "/data/dilleswari/target_with_ConvNeXt_Encoder/Target-sound-event-detection/bash/experiments/Join_fusion/2025-07-21_11-01-07_f99dc35e65f311f095f5ac1f6ba1010e/hard_predictions_urban_sed_test_strong.txt"
with open(file_path, 'r') as file:
    lines = file.readlines()
    lines = lines[1:]  # the first line is skipped as a header row

# Data starts on physical line 2 of the file, hence start=2 for error messages.
for line_no, line in enumerate(lines, start=2):
    stripped = line.strip()
    if not stripped:
        # Blank lines (e.g. a trailing newline at end of file) carry no event.
        continue

    out = stripped.split('\t')
    if len(out) < 4:
        raise ValueError(
            f"{file_path}:{line_no}: expected 4 tab-separated fields "
            f"(filename, onset, offset, event_label), got {len(out)}"
        )

    filename = out[0]
    onset = float(out[1])
    offset = float(out[2])
    event_label = out[3]

    estimated_entry = {
                            'event_label': event_label,
                            'event_onset': onset,
                            'event_offset': offset,
                            'file': filename
                         }
    estimated_list.append(estimated_entry)

In [6]:
import sed_eval
import dcase_util
from tqdm import tqdm

# Wrap the plain lists of event dicts in dcase_util containers for sed_eval.
reference_event_list = dcase_util.containers.MetaDataContainer(reference_list)
estimated_event_list = dcase_util.containers.MetaDataContainer(estimated_list)

# Segment-based scorer over every label that occurs in the reference events.
# time_resolution is the segment length in seconds; change it to score on a
# finer or coarser grid.
segment_based_metrics = sed_eval.sound_event.SegmentBasedMetrics(
    event_label_list=reference_event_list.unique_event_labels,
    time_resolution=1.0,
)

# Accumulate statistics file by file, pairing each reference file's events with
# the estimated events that carry the same filename.
for filename in tqdm(reference_event_list.unique_files):
    segment_based_metrics.evaluate(
        reference_event_list=reference_event_list.filter(filename=filename),
        estimated_event_list=estimated_event_list.filter(filename=filename),
    )

# Text report of the accumulated metrics.
print(segment_based_metrics)



100%|██████████| 2000/2000 [01:51<00:00, 18.01it/s]

Segment based metrics
  Evaluated length                  : 18081.02 sec
  Evaluated files                   : 2000 
  Segment length                    : 1.00 sec

  Overall metrics (micro-average)
  F-measure
    F-measure (F1)                  : 83.36 %
    Precision                       : 81.64 %
    Recall                          : 85.16 %
  Error rate
    Error rate (ER)                 : 0.30 
    Substitution rate               : 0.04 
    Deletion rate                   : 0.10 
    Insertion rate                  : 0.15 
  Accuracy
    Sensitivity                     : 85.16 %
    Specificity                     : 96.83 %
    Balanced accuracy               : 90.99 %
    Accuracy                        : 95.17 %

  Class-wise average metrics (macro-average)
  F-measure
    F-measure (F1)                  : 83.15 %
    Precision                       : 81.92 %
    Recall                          : 84.85 %
  Error rate
    Error rate (ER)                 : 0.34 
    Deletion r


