In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [4]:
# 🚩 0. Setup: Install & Import
!pip install -q datasets transformers peft accelerate evaluate soundfile

import shutil
from pathlib import Path
import torch
import evaluate
import numpy as np

from datasets import load_dataset
from transformers import (
    AutoProcessor,
    AutoModelForCTC,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_model, LoraConfig, TaskType

# 🚫 Clear Corrupted Hugging Face Cache
shutil.rmtree(Path.home() / ".cache/huggingface/datasets/mozilla-foundation", ignore_errors=True)
shutil.rmtree(Path.home() / ".cache/huggingface/datasets/downloads", ignore_errors=True)

# ✅ 1. Load Common Voice Arabic (1% for speed)
dataset = load_dataset("mozilla-foundation/common_voice_14_0", "ar", split="train[:1%]")

# 🧪 2. Preprocessing
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

def preprocess(batch):
    audio = batch["audio"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    batch["input_values"] = inputs.input_values[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

dataset = dataset.cast_column("audio", dataset.features["audio"].clone())
dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# ⚙️ 3. Load Model + Apply PEFT (LoRA)
base_model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53")

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CTC
)

model = get_peft_model(base_model, peft_config)

# 📊 4. Compute WER Metric
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = torch.argmax(torch.tensor(pred_logits), dim=-1)
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# 🎯 5. Training Arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-arabic-peft",
    per_device_train_batch_size=4,
    evaluation_strategy="no",
    num_train_epochs=1,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)

# 🏋🏽 6. Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=processor,
    compute_metrics=compute_metrics
)

trainer.train()

# 💾 7. Save Model + Processor
model.save_pretrained("wav2vec2-arabic-peft")
processor.save_pretrained("wav2vec2-arabic-peft")


ConnectionError: Couldn't reach https://huggingface.co/datasets/mozilla-foundation/common_voice_14_0/resolve/main/common_voice_14_0.py (ConnectionError('Unauthorized for URL https://huggingface.co/datasets/mozilla-foundation/common_voice_14_0/resolve/main/common_voice_14_0.py. Please use the parameter `token=True` after logging in with `huggingface-cli login`'))