In [1]:
# Load the processor and the CTC model from one shared checkpoint id
from transformers import AutoProcessor, AutoModelForCTC

# Single source of truth so the processor and the model cannot drift apart.
CHECKPOINT_ID = "facebook/wav2vec2-large-960h"

processor = AutoProcessor.from_pretrained(CHECKPOINT_ID)
model = AutoModelForCTC.from_pretrained(CHECKPOINT_ID)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

In [2]:
# Bare expression: the notebook renders the model's repr (its module tree) as this cell's output.
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [3]:
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from peft import get_peft_model, LoraConfig, PeftModel, TaskType
from torchinfo import summary

# The base model is the `model` object loaded in the first cell; it is not
# reloaded here.

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA matrices
    lora_alpha=16,  # Scaling factor
    target_modules=["lm_head"],  # Only apply LoRA to the classifier (lm_head)
    lora_dropout=0.1,  # Dropout for LoRA layers
)

# Apply PEFT (LoRA) to the model.
# A later cell rebinds `model` to the PEFT-wrapped model, so re-running this
# cell out of order would otherwise try to adapt an already-adapted model.
# Reuse the existing wrapper in that case to keep the cell idempotent.
if isinstance(model, PeftModel):
    peft_model = model
else:
    peft_model = get_peft_model(model, lora_config)

# Check the new model architecture with PEFT applied
summary(peft_model)



Layer (type:depth-idx)                                            Param #
PeftModel                                                         --
├─LoraModel: 1-1                                                  --
│    └─Wav2Vec2ForCTC: 2-1                                        --
│    │    └─Wav2Vec2Model: 3-1                                    (315,428,992)
│    │    └─Dropout: 3-2                                          --
│    │    └─Linear: 3-3                                           41,248
Total params: 315,470,240
Trainable params: 8,448
Non-trainable params: 315,461,792

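# DESCRIPTION
Experiment with UAV classification. The notebook was built around MIT's AST (Audio Spectrogram Transformer); the cells below currently run a LoRA-adapted `facebook/wav2vec2-large-960h` model instead, with the AST extractor and model lines commented out.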

In [4]:
from AST_helper.util import AudioDataset, train_test_split_custom
from AST_helper.engine import train, inference_loop
from AST_helper.model import auto_extractor, custom_AST
from AST_helper.util import save_model # noqa: F401

import torch
from torch.utils.data import DataLoader
import torch.optim
import torch.nn as nn
from torchinfo import summary

import wandb

# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
display(device)

'cuda'

In [5]:
# --- Data location and model identifier ---
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = "C:/Users/Sidewinders/Research_notebooks/Drone_classification/Research/UAV_Dataset_9"
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

# --- Training hyperparameters ---
BATCH_SIZE = 4
SEED = 42
EPOCHS = 2
ACCUMULATION_STEPS = 2  # gradient-accumulation steps; multiplies the batch size for a large-batch effect
OPTIM_LR = 0.0001
TRAIN_PATIENCE = 5

# --- DataLoader settings ---
NUM_CUDA_WORKERS = 0
PINNED_MEMORY = True
SHUFFLED = True

# --- Run switches ---
multiple_runs = False
wandb_init = False
SAVE_MODEL = True

torch.cuda.empty_cache()

# Hyperparameters recorded with the run.
config = dict(
    learning_rate=OPTIM_LR,
    batch_size=BATCH_SIZE,
    num_epochs=EPOCHS,
    random_seed=SEED,
    optimizer="AdamW",
    loss_function="CrossEntropyLoss",
)

# Arguments later forwarded to wandb.init when wandb_init is enabled.
wandb_params = dict(
    project="vanilla_AST",
    name="classifier_grad_true_lowerLR",
    reinit=False,
    notes="8457 trainable params",
    tags=["AST"],
    config=config,
)

# Seed the CPU and CUDA generators.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [6]:
# The processor loaded earlier is handed to the dataset; the AST extractor
# alternative stays disabled.
# feature_extractor = auto_extractor(model_name)
dataset_0 = AudioDataset(data_path, processor)
shape = dataset_0[0][0].shape  # shape of the first element of the first dataset item

# Split into train / test / inference subsets.
split_subsets = train_test_split_custom(dataset_0, test_size=0.2, inference_size=0.1)
train_subset, test_subset, inference_subset = split_subsets
num_classes = len(dataset_0.get_classes())

# Move the LoRA-wrapped model to the selected device; the custom AST model
# alternative stays disabled.
# model = custom_AST(model_name, num_classes, device)
model = peft_model.to(device)

# Per-layer parameter counts and trainability, labelled by attribute name.
summary_options = {
    "col_names": ["num_params", "trainable"],
    "col_width": 20,
    "row_settings": ["var_names"],
}
summary(model, **summary_options)

Layer (type (var_name))                                                          Param #              Trainable
PeftModel (PeftModel)                                                            --                   Partial
├─LoraModel (base_model)                                                         --                   Partial
│    └─Wav2Vec2ForCTC (model)                                                    --                   Partial
│    │    └─Wav2Vec2Model (wav2vec2)                                             (315,428,992)        False
│    │    └─Dropout (dropout)                                                    --                   --
│    │    └─Linear (lm_head)                                                     41,248               Partial
Total params: 315,470,240
Trainable params: 8,448
Non-trainable params: 315,461,792

In [7]:
def _build_loader(subset, shuffle):
    """Wrap ``subset`` in a DataLoader using the notebook-wide loader settings.

    Args:
        subset: dataset (or subset) to iterate over.
        shuffle: whether the loader reshuffles the data each epoch.

    Returns:
        A DataLoader configured with BATCH_SIZE, NUM_CUDA_WORKERS and
        PINNED_MEMORY.
    """
    return DataLoader(dataset=subset,
                      batch_size=BATCH_SIZE,
                      num_workers=NUM_CUDA_WORKERS,
                      pin_memory=PINNED_MEMORY,
                      shuffle=shuffle)


# Only the training data follows the SHUFFLED switch. Evaluation loaders keep
# a fixed order so per-batch results are repeatable.
train_dataloader_custom = _build_loader(train_subset, shuffle=SHUFFLED)
test_dataloader_custom = _build_loader(test_subset, shuffle=False)

# The split may yield no inference subset. Always bind the name (None when
# absent) so later cells can test it instead of hitting a NameError.
inference_dataloader_custom = (
    _build_loader(inference_subset, shuffle=False) if inference_subset else None
)

In [8]:
# Plateau-scheduler settings. TODO: experiment with different hyperparameters.
LR_DECAY_FACTOR = 0.1
LR_PLATEAU_PATIENCE = 3

# Loss, optimizer and LR scheduler handed to the training loop.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=OPTIM_LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=LR_DECAY_FACTOR,
    patience=LR_PLATEAU_PATIENCE,
)

In [9]:
if wandb_init:
    # Each wandb.init argument is read from wandb_params, falling back to the
    # value listed here when the key is missing.
    init_fallbacks = {
        "project": None,
        "config": None,
        "name": None,
        "reinit": True,
        "tags": [],
        "notes": "",
        "dir": None,
    }
    init_kwargs = {
        key: wandb_params.get(key, fallback)
        for key, fallback in init_fallbacks.items()
    }
    wandb.init(**init_kwargs)

In [10]:
# Collect the training-loop arguments in one place, then launch training.
train_settings = dict(
    train_dataloader=train_dataloader_custom,
    test_dataloader=test_dataloader_custom,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_fn=loss_fn,
    epochs=EPOCHS,
    device=device,
    accumulation_steps=ACCUMULATION_STEPS,
    patience=TRAIN_PATIENCE,
)
results = train(model, **train_settings)


  0%|          | 0/2 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
# The inference loader is only created when the split produced an inference
# subset, so look it up defensively instead of assuming the name exists.
inference_loader = globals().get("inference_dataloader_custom")

if inference_loader is not None:
    inference_loop(model=model,
                   device=device,
                   loss_fn=loss_fn,
                   inference_loader=inference_loader)
else:
    print("[INFO] No inference dataloader available; skipping inference.")

# Close the wandb run only for single-run sessions that actually started one.
if not multiple_runs and wandb_init:
    wandb.finish()

  with torch.cuda.amp.autocast():


Inference Loss: 0.4719, Accuracy: 100.00%


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
inference_accuracy,▁
inference_loss,▁
test_acc,▁▂▄▅▇▇▇▇▇▇██████████
test_f1,▁▂▄▅▇▇▇▇▇███████████
test_loss,█▇▆▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
test_precision,▁▃▄▆▇▇▇▇▇▇██████████
test_recall,▁▂▄▅▇▇▇▇▇▇██████████
train_acc,▁▂▄▅▆▇▇▇▇███████████
train_f1,▁▂▄▆▆▇▇▇▇███████████

0,1
epoch,20.0
inference_accuracy,1.0
inference_loss,0.47188
test_acc,0.91797
test_f1,0.9145
test_loss,0.56277
test_precision,0.92188
test_recall,0.91358
train_acc,0.89656
train_f1,0.89813


In [None]:
# Destination for the saved model; only used when SAVE_MODEL is enabled.
checkpoint_dir = "saved_models"
checkpoint_file = "AST_classifier_true.pt"

if SAVE_MODEL:
    save_model(model=model, target_dir=checkpoint_dir, model_name=checkpoint_file)

[INFO] Saving model to: saved_models\AST_classifier_true.pt
