In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor and the CTC model are loaded from the same pretrained checkpoint.
checkpoint = "facebook/wav2vec2-large-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

# Move the model's parameters onto the selected device.
model.to(device)

print("✅ Wav2Vec2 Model Loaded on:", device)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Wav2Vec2 Model Loaded on: cuda


In [3]:
import librosa
import json

# Function to preprocess audio
def preprocess_audio(audio_path, sampling_rate=16000):
    """Load an audio file and turn it into model input values.

    Args:
        audio_path: Path of the audio file to load.
        sampling_rate: Sample rate in Hz. The same value is used both to
            load the audio and to describe it to the processor, so the two
            can never disagree. Defaults to 16000, the value that was
            previously hard-coded.

    Returns:
        The processor's ``input_values`` tensor with its leading (batch)
        dimension squeezed out.
    """
    # The second return value of librosa.load (the sample rate) is not needed,
    # because the rate was requested explicitly.
    waveform, _ = librosa.load(audio_path, sr=sampling_rate)
    inputs = processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    return inputs.input_values.squeeze(0)  # Remove batch dim

# Load dataset
# The encoding is passed explicitly: open() otherwise falls back to a
# platform-dependent default (a legacy code page on many Windows setups),
# which can mis-decode or fail on non-ASCII transcripts. JSON text is UTF-8.
with open("D:/Speech_recognition/transcriptions.json", "r", encoding="utf-8") as f:
    transcriptions = json.load(f)


In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class ASRDataset(Dataset):
    """Dataset pairing audio files with their tokenized transcripts.

    Built from a mapping whose keys are used as audio paths and whose values
    are used as the corresponding transcript strings.
    """

    def __init__(self, transcriptions):
        # Keys and values are materialized as two parallel lists so that one
        # integer index addresses both the audio path and its transcript.
        self.audio_paths = [path for path in transcriptions.keys()]
        self.transcripts = [text for text in transcriptions.values()]

    def __len__(self):
        """Number of (audio, transcript) pairs."""
        return len(self.audio_paths)

    def __getitem__(self, idx):
        """Return the model inputs and label ids for sample ``idx``.

        Returns:
            A dict with ``"input_values"`` (the result of ``preprocess_audio``
            for the sample's audio path) and ``"labels"`` (the tokenized
            transcript as a ``torch.long`` tensor with the leading dimension
            squeezed out).
        """
        audio_inputs = preprocess_audio(self.audio_paths[idx])

        # Tokenize the transcript into numerical label ids.
        # NOTE(review): the transcript is passed to the tokenizer unchanged;
        # confirm its casing/characters match the checkpoint's vocabulary,
        # otherwise characters may map to the unknown token.
        encoded = processor(
            text=[self.transcripts[idx]],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        label_ids = encoded.input_ids.squeeze(0).to(torch.long)

        return {"input_values": audio_inputs, "labels": label_ids}



# Build the training dataset from the loaded audio-path -> transcript mapping.
train_dataset = ASRDataset(transcriptions)

# Collate function for padding
def collate_fn(batch):
    """Combine per-sample dicts into a single padded batch.

    Args:
        batch: Sequence of dicts, each holding an ``"input_values"`` tensor
            and a ``"labels"`` tensor (sequence lengths may differ).

    Returns:
        A dict with both entries stacked batch-first and right-padded to the
        longest sequence in the batch: ``"input_values"`` padded with 0 and
        ``"labels"`` padded with -100.
    """
    audio_seqs, label_seqs = [], []
    for sample in batch:
        audio_seqs.append(sample["input_values"])
        label_seqs.append(sample["labels"])

    return {
        # Shorter audio sequences are filled with zeros.
        "input_values": pad_sequence(audio_seqs, batch_first=True, padding_value=0),
        # -100 marks padded label positions (intended to be ignored when the
        # CTC loss is computed).
        "labels": pad_sequence(label_seqs, batch_first=True, padding_value=-100),
    }

# Wrap the dataset in a shuffled DataLoader: 4 samples per batch, with
# variable-length samples padded into a batch by collate_fn.
train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=4,
)

print("✅ DataLoader Ready with", len(train_loader), "batches")


✅ DataLoader Ready with 371 batches


In [None]:
from torch.optim import AdamW  


# Optimizer, plus a gradient scaler for mixed precision (created only when
# CUDA is available; None selects the plain full-precision path below).
optimizer = AdamW(model.parameters(), lr=1e-4)
scaler = None
if torch.cuda.is_available():
    scaler = torch.amp.GradScaler("cuda")

# Fine-tuning loop: 2 epochs, each limited to `max_batches_per_epoch` batches.
model.train()
batch_count = 0
max_batches_per_epoch = 200

for epoch in range(2):
    batch_count = 0  # batches trained so far in this epoch
    for step, batch in enumerate(train_loader, start=1):
        # Stop the epoch once the per-epoch batch budget has been used up.
        if step > max_batches_per_epoch:
            break

        batch_count = step
        optimizer.zero_grad()

        input_values = batch["input_values"].to(device)
        labels = batch["labels"].to(device)

        if not scaler:
            # Full-precision path: ordinary backward pass and optimizer step.
            outputs = model(input_values, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
        else:
            # Mixed-precision path: forward under autocast, then a scaled
            # backward pass; the scaler drives the optimizer step and then
            # updates its own scale factor.
            with torch.amp.autocast("cuda"):
                outputs = model(input_values, labels=labels)
                loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        print(f"Batch: {batch_count}   Loss= {loss.item()}")

    print(f"✅ Epoch {epoch+1} Completed (Trained {batch_count} batches)")

print("✅ Training Complete!")



Batch: 1   Loss= 245.63363647460938
Batch: 2   Loss= 219.0860595703125
Batch: 3   Loss= 149.3104248046875
Batch: 4   Loss= 273.1280517578125
Batch: 5   Loss= 102.82756042480469
Batch: 6   Loss= 181.08494567871094
Batch: 7   Loss= 185.3128204345703
Batch: 8   Loss= 208.5411376953125
Batch: 9   Loss= 236.97364807128906
Batch: 10   Loss= 206.27926635742188
Batch: 11   Loss= 198.99078369140625
Batch: 12   Loss= 224.82431030273438
Batch: 13   Loss= 229.7213592529297
Batch: 14   Loss= 162.8397979736328
Batch: 15   Loss= 252.51834106445312
Batch: 16   Loss= 281.41729736328125
Batch: 17   Loss= 171.29432678222656
Batch: 18   Loss= 303.90081787109375
Batch: 19   Loss= 315.43109130859375
Batch: 20   Loss= 206.69808959960938
Batch: 21   Loss= 372.7890319824219
Batch: 22   Loss= 195.49945068359375
Batch: 23   Loss= 157.0379638671875
Batch: 24   Loss= 140.6300048828125
Batch: 25   Loss= 151.03475952148438
Batch: 26   Loss= 230.36160278320312
Batch: 27   Loss= 148.67669677734375
Batch: 28   Loss= 12

In [None]:
# Move the model to the CPU before writing it to disk.
model.to("cpu")

# Save model & processor into the same directory.
for artifact in (model, processor):
    artifact.save_pretrained("D:/Speech_recognition/wav2vec2_finetuned")

print("✅ Model Saved Successfully!")


✅ Model Saved Successfully!
