In [1]:
import os
import torchaudio

In [5]:
# Root directory that the audio files are looked up under.
DATA_ROOT = "/data1/malto/therness/data/Hackathon"
# One recording, written relative to DATA_ROOT. It must not start with "/":
# os.path.join discards every earlier component once a later one is absolute,
# which would silently drop DATA_ROOT from the joined path.
SAMPLE = "defect-weld/excessive_penetration_weld_3_10-02-22_butt/10-02-22-0021-01/10-02-22-0021-01.flac"

In [7]:
# Recording to inspect, written relative to the dataset root (no leading
# slash, so os.path.join keeps DATA_ROOT as the prefix).
SAMPLE = "defect-weld/excessive_penetration_weld_3_10-02-22_butt/10-02-22-0021-01/10-02-22-0021-01.flac"
DATA_ROOT = "/data1/malto/therness/data/Hackathon"
sample_path = os.path.join(DATA_ROOT, SAMPLE)

# Decode the file: torchaudio.load returns the waveform tensor together with
# the sample rate of the recording.
waveform, sr = torchaudio.load(sample_path)

In [8]:
# First line: tensor shape as [channels, samples]; second line: sample rate.
print(waveform.shape, sr, sep="\n")

torch.Size([1, 608000])
16000


In [10]:
# Reinterpret the (1, 608000) waveform as 16000 rows of 38 consecutive samples
# each, i.e. shape (1, 16000, 38); view() only succeeds because 608000 is an
# exact multiple of 16000.
# NOTE(review): if the goal was 38 one-second frames of 16000 samples, the
# arguments would need to be (1, -1, 16000) instead — confirm the intent.
waveform.view(1, 16000, -1).shape

torch.Size([1, 16000, 38])

In [5]:
import glob

# Discover all .flac files — works for both flat (sampleData/sample_id/)
# and labeled (data/{good,bad}/sample_id/) structures
flac_files = sorted(glob.glob(os.path.join(DATA_ROOT, "**", "*.flac"), recursive=True))

# One record per file; the name of the directory containing the file is used
# as the sample id.
durations = []
for f in flac_files:
    wf, file_sr = torchaudio.load(f)
    num_channels, num_frames = wf.shape
    dur_s = num_frames / file_sr
    sample_id = os.path.basename(os.path.dirname(f))
    durations.append({'sample_id': sample_id, 'duration_s': dur_s,
                      'sr': file_sr, 'channels': num_channels})

print(f"Found {len(durations)} audio files\n")
for d in durations:
    print(f"  {d['sample_id']:25s}  {d['duration_s']:7.2f}s  sr={d['sr']}  ch={d['channels']}")

durs = [d['duration_s'] for d in durations]
if durs:
    print(f"\nMin: {min(durs):.2f}s  Max: {max(durs):.2f}s  Mean: {sum(durs)/len(durs):.2f}s")
    all_same = min(durs) == max(durs)
    print(f"All same length: {all_same}")
    if not all_same:
        print(f"→ Consider setting max_length_in_s={max(durs):.1f} (or smaller) in the config")
else:
    # min()/max() raise ValueError on an empty list and the mean would divide
    # by zero, so report the empty result instead of crashing.
    print(f"No .flac files found under {DATA_ROOT!r} — check DATA_ROOT")

Found 10 audio files

  08-17-22-0011-00             38.00s  sr=16000  ch=1
  08-17-22-0012-00             38.00s  sr=16000  ch=1
  08-17-22-0013-00             38.00s  sr=16000  ch=1
  08-17-22-0014-00             38.00s  sr=16000  ch=1
  08-17-22-0015-00             38.00s  sr=16000  ch=1
  08-17-22-0016-00             38.00s  sr=16000  ch=1
  08-17-22-0017-00             38.00s  sr=16000  ch=1
  08-17-22-0018-00             38.00s  sr=16000  ch=1
  08-17-22-0019-00             38.00s  sr=16000  ch=1
  08-18-22-0020-00             38.00s  sr=16000  ch=1

Min: 38.00s  Max: 38.00s  Mean: 38.00s
All same length: True


# Audio Processing Pipeline — Training Tutorial

## How `audio_processing.py` is organized

```
audio_processing.py
├── DEFAULT_AUDIO_CFG   ← dict with all tunable parameters
├── AudioTransform      ← nn.Module: raw waveform → log-mel spectrogram
└── AudioDataset        ← Dataset: discovers .flac files, applies transform, returns dicts
```

## Data flow

```
.flac file
  │
  ├─ torchaudio.load() → waveform (1, 608000)
  │
  ├─ AudioTransform.forward()
  │     resample (if sr != 16kHz)
  │     → mono mixdown (if stereo)
  │     → pad/truncate to max_length_in_s
  │     → MelSpectrogram → (1, n_mels, T)
  │     → AmplitudeToDB  → log scale
  │     → normalize      → zero-mean, unit-variance
  │
  └─ output: {'audio': (1, n_mels, T), 'sample_id': str, 'label': int, 'label_name': str}   (n_mels = 40 with DEFAULT_AUDIO_CFG)
```

## How to use in training

### Step 1: Configure — tune these for experiments
```python
from audio_processing import AudioDataset, DEFAULT_AUDIO_CFG

cfg = {**DEFAULT_AUDIO_CFG,
    'n_mels': 64,           # override any param
    'max_length_in_s': 38.0,
}
```

### Step 2: Create datasets — just point to the data root
```python
# For sampleData (flat, no labels):
ds = AudioDataset("sampleData", cfg, labeled=False)

# For real data with labels (data/{good,bad}/sample_id/):
train_ds = AudioDataset("data/train", cfg, labeled=True)
val_ds   = AudioDataset("data/val",   cfg, labeled=True)
```

### Step 3: DataLoader — standard PyTorch
```python
from torch.utils.data import DataLoader

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, num_workers=4)
val_loader   = DataLoader(val_ds,   batch_size=16, shuffle=False, num_workers=4)
```

### Step 4: Training loop
```python
for batch in train_loader:
    mel    = batch['audio']       # (B, 1, n_mels, T) — ready for CNN/transformer
    labels = batch['label']       # (B,) — integer class indices
    ids    = batch['sample_id']   # list of str — for debugging/logging

    logits = model(mel)
    loss   = criterion(logits, labels)
    ...
```

### Later: Multimodal merging
Each modality dataset returns dicts that include a `sample_id`, so the modalities can be joined on that key. Planned approach:
```python
# Each modality dataset indexed by sample_id
# → MultimodalDataset joins audio + video + tabular by matching sample_id
# → returns {'audio': ..., 'video': ..., 'tabular': ..., 'label': ..., 'sample_id': ...}
```

In [6]:
# Demo: run the first dataset item through the complete audio pipeline.
# sampleData is flat (no good/bad folders), hence labeled=False.
from audio_processing import AudioTransform, AudioDataset, DEFAULT_AUDIO_CFG

ds = AudioDataset(DATA_ROOT, DEFAULT_AUDIO_CFG, labeled=False)
sample = ds[0]

print(f"Dataset size:  {len(ds)}")
print(f"Sample ID:     {sample['sample_id']}")
print(f"Label:         {sample['label']} ({sample['label_name']})")

# Look the spectrogram shape up once and reuse it for both report lines.
mel_shape = sample['audio'].shape
print(f"Mel shape:     {mel_shape}")
print(f"  → (channels={mel_shape[0]}, n_mels={mel_shape[1]}, time_frames={mel_shape[2]})")

Dataset size:  10
Sample ID:     08-17-22-0011-00
Label:         -1 (unlabeled)
Mel shape:     torch.Size([1, 40, 1897])
  → (channels=1, n_mels=40, time_frames=1897)
