In [23]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------
import pandas as pd
from pathlib import Path

download_path  = Path('data/UrbanSound8K/UrbanSound8K')

# Read metadata file
metadata_file = download_path/'metadata'/'UrbanSound8K.csv'
df = pd.read_csv(metadata_file)
df.head()

# Construct file path by concatenating fold and file name
df['relative_path'] = '/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)

# Take relevant columns
df = df[['relative_path', 'classID']]
df.head()

Unnamed: 0,relative_path,classID
0,/fold5/100032-3-0-0.wav,3
1,/fold5/100263-2-0-117.wav,2
2,/fold5/100263-2-0-121.wav,2
3,/fold5/100263-2-0-126.wav,2
4,/fold5/100263-2-0-137.wav,2


In [None]:
import math
import random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil:
    @staticmethod
    def open(audio_file):
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel):
        sig, sr = aud
        if (sig.shape[0] == new_channel):
            return aud
        if (new_channel == 1):
            resig = sig[:1, :]
        else:
            resig = torch.cat([sig, sig])
        return ((resig, sr))
     # ----------------------------
  # Generate a Spectrogram
  # ----------------------------
@staticmethod
def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)


   


In [None]:
import os
from torch.utils.data import Dataset, random_split
import torchaudio

class SoundDS(Dataset):
    def __init__(self, df, data_path):
        self.df = df
        self.data_path = str(data_path)
        self.duration = 4000
        self.sr = 44100
        self.channel = 2
        self.shift_pct = 0.4
            
    def __len__(self):
        return len(self.df)    
    
    def __getitem__(self, idx):
        relative_path = self.df.loc[idx, 'relative_path'].lstrip('/')
        audio_file = os.path.join(self.data_path, 'audio', relative_path)

        if not os.path.exists(audio_file):
            raise FileNotFoundError(f"Audio file not found: {audio_file}")

        class_id = int(self.df.loc[idx, 'classID'])
        aud = AudioUtil.open(audio_file)
        rechan = AudioUtil.rechannel(aud, self.channel)
        sgram = AudioUtil.spectro_gram(rechan, n_mels=64, n_fft=1024, hop_len=None)
        signal, sr = sgram

        return {
            'audio': {
                'path': f'/audio/{relative_path}',
                'array': signal.squeeze().numpy(),  # Convert tensor to NumPy array
                'sampling_rate': sr
            },
            'labels': class_id
        }
    
def preprocess_audio(sample, target_sr=16000, feature_extractor=None):

    audio = sample['audio']  # Expecting tensor
    # Convert to mono if stereo (average channels)
    if audio.dim() > 1 and audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0)

    extractor_sr = getattr(feature_extractor, 'sampling_rate', 16000) if feature_extractor else 16000
    # Resample to target sampling rate
    if extractor_sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=target_sr, new_freq=feature_extractor.sampling_rate)
        audio = resampler(audio)

        # Apply feature extractor if available, otherwise return raw audio
    if feature_extractor:
        try:
            features = feature_extractor(audio.numpy(), sampling_rate=extractor_sr, return_tensors="pt")
            return {'audio': features['input_values'].squeeze(0)}  # Remove batch dimension
        except Exception as e:
            print(f"Error applying feature extractor: {e}")
            return {'audio': audio}  # Fallback to raw audio
    else:
        return {'audio': audio}

In [38]:
myds = SoundDS(df, download_path)

In [39]:
print(myds[0])

{'audio': {'path': '/audio/fold5/100032-3-0-0.wav', 'array': array([[-4.5776367e-03, -4.8828125e-03, -4.6081543e-03, ...,
        -5.7983398e-04, -4.2724609e-04,  3.0517578e-05],
       [-4.5166016e-03, -4.7912598e-03, -4.6081543e-03, ...,
        -7.3242188e-04, -5.4931641e-04, -3.0517578e-05]],
      shape=(2, 14004), dtype=float32), 'sampling_rate': 44100}, 'labels': 3}


In [40]:
from transformers import ASTFeatureExtractor

pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

model_input_name = feature_extractor.model_input_names[0]
SAMPLING_RATE = feature_extractor.sampling_rate

In [41]:
from torch.utils.data import random_split
import torch
import numpy as np

# Assuming SoundDS, df, download_path, feature_extractor, and preprocess_audio are defined
myds = SoundDS(df, download_path)

# Calculate values for normalization
feature_extractor.do_normalize = False  # Disable normalization to calculate mean and std
mean = []
std = []

# Apply preprocess_audio to the audio array
for i in range(len(myds)):
    sample = myds[i]
    audio_array = torch.from_numpy(sample['audio']['array'])  # Convert NumPy array back to tensor
    # Assuming preprocess_audio takes the audio tensor and returns a processed tensor
    processed_audio = preprocess_audio({'audio': audio_array})['audio']
    cur_mean = torch.mean(processed_audio)
    cur_std = torch.std(processed_audio)
    mean.append(cur_mean.item())
    std.append(cur_std.item())

# Set feature extractor normalization parameters
feature_extractor.mean = np.mean(mean)
feature_extractor.std = np.mean(std)
feature_extractor.do_normalize = True

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./ast_model_output",
    eval_strategy="epoch",  # Changed from evaluation_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,  # Matches your DataLoader
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Define a compute_metrics function for evaluation
from datasets import load_metric
accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Initialize Trainer
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [42]:
from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift

audio_augmentations = Compose([
    AddGaussianSNR(min_snr_db=10, max_snr_db=20),
    Gain(min_gain_db=-6, max_gain_db=6),
    GainTransition(min_gain_db=-6, max_gain_db=6, min_duration=0.01, max_duration=0.3, duration_unit="fraction"),
    ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=30, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.2),
    PitchShift(min_semitones=-4, max_semitones=4),
], p=0.8, shuffle=True)

In [43]:
from transformers import ASTConfig, ASTForAudioClassification

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)
# Update configuration with the number of labels in our dataset
config.num_labels = num_labels
config.label2id = label2id
config.id2label = {v: k for k, v in label2id.items()}
# Initialize the model with the updated configuration
model = ASTForAudioClassification.from_pretrained(pretrained_model, config=config, ignore_mismatched_sizes=True)
model.init_weights()

NameError: name 'num_labels' is not defined