<a href="https://colab.research.google.com/github/1pawn0/persian-speech-to-text-via-whisper/blob/main/persian_speech_to_text_via_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip -q install torchcodec

In [None]:
import torch, torchaudio, torchcodec
from pathlib import Path
from transformers import (
    BatchFeature,
    WhisperConfig,
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperForConditionalGeneration,
)

# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs on runtimes that have no CUDA device attached.
device = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
MODEL_NAME = "nezamisafa/whisper-v3-turbo-persian-v1.0"  # @param {"type":"string","placeholder":"openai/whisper-large-v3-turbo"}

# Text and audio processing components, loaded from the same checkpoint name as the model.
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)

# Load the model with the checkpoint's own configuration, then move it to the
# device chosen earlier in the notebook.
model_config = WhisperConfig.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, config=model_config)
model = model.to(device)


In [None]:
def split_audio_into_chunks(audio_file_path: Path, chunk_duration_sec: int = 30, target_sample_rate: int = 16000) -> list[torch.Tensor]:
    """Load an audio file and cut it into equal-length, single-channel chunks.

    The audio is resampled to ``target_sample_rate`` when its native rate
    differs, multi-channel audio is averaged down to one channel, and the last
    chunk is right-padded with zeros up to the full chunk length.

    Args:
        audio_file_path: Audio file to load.
        chunk_duration_sec: Length of every chunk, in seconds.
        target_sample_rate: Sample rate, in Hz, of the returned chunks.

    Returns:
        The chunks in playback order, each a 1-D tensor holding
        ``chunk_duration_sec * target_sample_rate`` samples. The list is empty
        when the loaded audio contains no samples.
    """
    audio, native_rate = torchaudio.load_with_torchcodec(audio_file_path)

    if native_rate != target_sample_rate:
        audio = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=target_sample_rate)(audio)

    # Average the channels so that a single row remains.
    if audio.shape[0] > 1:
        audio = audio.mean(dim=0, keepdim=True)

    window = chunk_duration_sec * target_sample_rate

    def fit_to_window(piece: torch.Tensor) -> torch.Tensor:
        # Only the trailing piece can be short; zero-fill it on the right.
        missing = window - piece.shape[0]
        if missing <= 0:
            return piece
        return torch.nn.functional.pad(piece, (0, missing), mode="constant", value=0.0)

    return [
        fit_to_window(audio[0, offset : offset + window])
        for offset in range(0, audio.shape[1], window)
    ]


In [None]:
def generate_transcription(audio_chunks: list[torch.Tensor], generated_file_path: Path) -> None:
    """Transcribe audio chunks in order and write the text to a UTF-8 file.

    Uses the module-level ``feature_extractor``, ``model``, ``tokenizer`` and
    ``device``. The decoded text of each chunk is written as soon as it is
    produced, with no separator added between chunks.

    Args:
        audio_chunks: Waveform chunks; they are passed to the feature
            extractor with a sampling rate of 16000 Hz.
        generated_file_path: Destination text file; created or overwritten.
    """
    # The context manager guarantees the file is flushed and closed even when
    # feature extraction or generation raises part-way through the audio.
    with open(generated_file_path, "w", encoding="utf-8") as transcript_file:
        for chunk in audio_chunks:
            features: BatchFeature = feature_extractor(
                raw_speech=chunk,
                sampling_rate=16000,
                return_tensors="pt",
                do_normalize=True,
                return_attention_mask=True,
                device=device,
            ).to(device)

            # ``features`` was moved to ``device`` above, so its tensors need
            # no further per-tensor transfer.
            pred_ids: torch.Tensor = model.generate(
                input_features=features.input_features,
                attention_mask=features.attention_mask,
                language="fa",
                task="transcribe",
                do_sample=False,
                temperature=0.0,
            )
            # One chunk is processed per call, so take the single decoded string.
            text = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0]
            transcript_file.write(text)


In [None]:
def transcribe_speech(audio_file_path: Path) -> Path:
    """Transcribe an audio file into a text file and return that file's path.

    The transcript is written to ``<audio file stem>.txt`` as a relative path,
    i.e. resolved against the current working directory.

    Args:
        audio_file_path: Audio file to transcribe.

    Returns:
        Path of the transcript file that was written.
    """
    chunks = split_audio_into_chunks(audio_file_path)
    output_path = Path(f"{audio_file_path.stem}.txt")
    generate_transcription(chunks, output_path)
    return output_path


# Path of the audio file to transcribe; the trailing `@param` comment is a
# notebook form annotation and must stay on the same line as the assignment.
speech_audio_file_path: str = "path/to/speech/audio.mp3"  # @param {"type":"string","placeholder":"path/to/speech/audio.mp3"}
# Run the full pipeline. The call is left as the final expression so that a
# notebook cell displays the returned transcript path.
transcribe_speech(Path(speech_audio_file_path))
