In [6]:
from transformers import Wav2Vec2Processor
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, Audio
from transformers import Wav2Vec2ForCTC, Trainer, TrainingArguments
import pandas as pd

class DataCollatorCTCWithPadding:
    def __init__(self, processor: Wav2Vec2Processor, padding: bool = True):
        self.processor = processor
        self.padding = padding

    def __call__(self, features):
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        # Pad inputs
        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")

        # Pad labels
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(label_features, padding=self.padding, return_tensors="pt")
        # Replace padding with -100 for CTC loss
        labels_batch["input_ids"] = labels_batch["input_ids"].masked_fill(
            labels_batch["input_ids"] == self.processor.tokenizer.pad_token_id, -100)
        
        batch["labels"] = labels_batch["input_ids"]
        return batch


In [7]:
# Load the Excel file
df = pd.read_excel("F:/thesis/Features/Final/Updated/transcriptions_final.xlsx")

# Ensure that the 'text' column is of type str
df['text'] = df['text'].astype(str)

# Split the dataset into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrame to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Cast the 'audio' column to the Audio feature type
train_dataset = train_dataset.map(lambda batch: {"audio": batch["Link"]})
train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))

test_dataset = test_dataset.map(lambda batch: {"audio": batch["Link"]})
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))


Map:   0%|          | 0/6635 [00:00<?, ? examples/s]

Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

In [15]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Path to the local directory where the model files are stored
local_model_path = "wav2vec2-large-xls-r-300m-Urdu"  # Replace with your local directory

# Load the processor (feature extractor and tokenizer)
processor = Wav2Vec2Processor.from_pretrained(local_model_path)

# Load the Wav2Vec2 model
model = Wav2Vec2ForCTC.from_pretrained(local_model_path)


OSError: Incorrect path_or_model_id: 'F:/thesis/xlsr2_300m'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [4]:
# Preprocess the dataset
def preprocess(batch):
    audio = batch["audio"]["array"]
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values[0]

    transcription = batch["text"]
    if isinstance(transcription, str):
        with processor.as_target_processor():
            batch["labels"] = processor(transcription).input_ids
    else:
        batch["labels"] = []
    
    return batch

# Apply the preprocessing to the datasets
train_dataset = train_dataset.map(preprocess, remove_columns=["Link", "text"])
test_dataset = test_dataset.map(preprocess, remove_columns=["Link", "text"])

# Define Data Collator
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Map:   0%|          | 0/6635 [00:00<?, ? examples/s]



Map:   0%|          | 0/1659 [00:00<?, ? examples/s]

In [5]:
# Define evaluation metric (Word Error Rate)
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = torch.argmax(pred_logits, dim=-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Load the model
model = Wav2Vec2ForCTC.from_pretrained(
    "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True, 
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of the model checkpoint at kingabzpro/wav2vec2-large-xls-r-300m-Urdu were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kingabzpro/wav2vec2-large-xls-r-300m-Urdu and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You shoul

In [6]:
# Freeze the first few layers of the Wav2Vec2 model
for param in model.wav2vec2.feature_extractor.parameters():
    param.requires_grad = False  # Freezing the feature extractor layers

# Optionally, you can freeze more layers (e.g., the first transformer layers)
for i, layer in enumerate(model.wav2vec2.encoder.layers):
    if i < 6:  # Freeze the first 6 transformer layers
        for param in layer.parameters():
            param.requires_grad = False


In [8]:
# Training arguments with gradient accumulation
training_args = TrainingArguments(
    output_dir="./wav2vec2-urdu",
    group_by_length=True,
    per_device_train_batch_size=2,
    #gradient_accumulation_steps=2,  # Accumulate gradients every 2 steps
    eval_strategy="steps",
    num_train_epochs=5,
    fp16=True,  # Enable if using GPU with mixed precision
    save_steps=400,
    eval_steps=400,
    logging_steps=400,
    learning_rate=3e-4,
    warmup_steps=500,
)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Trainer setup
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # Use the training dataset
    eval_dataset=test_dataset,    # Use the testing dataset
    tokenizer=processor.feature_extractor,
)

In [10]:
# Fine-tune the model
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 184.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.16 GiB is allocated by PyTorch, and 245.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the model
trainer.save_model("./wav2vec2-urdu")
processor.save_pretrained("./wav2vec2-urdu")