In [None]:
# Install the faster-whisper package through uv's pip interface (shell escape).
# NOTE(review): the version is not pinned; consider pinning it so the notebook is reproducible.
!uv pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel

# Name of the Whisper model variant passed to WhisperModel below.
model_size = "small"

# Alternative GPU set-ups, kept for reference (uncomment one to use it
# instead of the CPU configuration):
#   GPU with FP16:
#     model = WhisperModel(model_size, device="cuda", compute_type="float16")
#   GPU with INT8:
#     model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")

# Active configuration: run on CPU with INT8.
model = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)

In [None]:
from faster_whisper import BatchedInferencePipeline
# Wrap the loaded model in the batched inference pipeline.
pipe = BatchedInferencePipeline(model)

# `transcribe` hands back the segments lazily (a one-shot generator): the
# actual decoding happens while iterating, and the iterator can be consumed
# only once. Keep it under a separate name so `segments` can hold a reusable
# list for the cells that follow.
segment_stream, info = pipe.transcribe(
    "sample.wav",
    beam_size=5,
    task='transcribe',
    batch_size=8,
    language='zh',
)

print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# Collect every segment while printing it. Printing straight from the
# generator would exhaust it and leave later cells (e.g. the
# `parse_transcription(segments)` call) with nothing to iterate over.
segments = []
for segment in segment_stream:
    segments.append(segment)
    print("id=%s [%s -> %s] %s" % (segment.id, segment.start, segment.end, segment.text))
    # `words` may be empty/None (presumably unless word-level timestamps were
    # requested in `transcribe` — confirm against the faster-whisper docs).
    if segment.words:
        for word in segment.words:
            print("  [%s -> %s] %s" % (word.start, word.end, word.word))

In [None]:
from pydantic import BaseModel
from typing import Optional, Tuple, Literal, List, Generator
class TranscriptionSegment(BaseModel):
    """Model for a single segment of transcription."""
    # Optional integer identifier of the segment; None when not provided.
    id: Optional[int] = None
    # Text of this segment.
    text: str
    # Two-element tuple: the first value is a required float, the second may be
    # None. parse_transcription fills it with (segment.start, segment.end);
    # presumably both are offsets in seconds — TODO confirm.
    timestamp: Tuple[float, Optional[float]]


# NOTE(review): this model is not referenced by any of the cells visible here.
class TranscriptionRequest(BaseModel):
    """Model for transcription request parameters."""
    # Identifier of the model to use (required); the expected format is not shown here.
    model_id: str
    # Requested operation; only "transcribe" or "translate" pass validation.
    task: Literal["transcribe", "translate"]
    # Language setting; the default string is presumably a sentinel meaning
    # "do not force a language" — confirm with whatever consumes this model.
    language: str = "Automatic Detection"
    # Chunk length, default 30 (presumably seconds of audio — TODO confirm).
    chunk_length: int = 30
    # Batch size for inference, default 24.
    batch_size: int = 24
    # Path of the audio file to transcribe (required).
    filepath: str


class TranscriptionResponse(BaseModel):
    """Model for transcription API response."""
    # Full transcript; parse_transcription builds it by joining the segment
    # texts with a single space.
    text: str
    # The individual segments, in the order they were produced.
    segments: List[TranscriptionSegment]

In [None]:
def parse_transcription(data: Generator) -> TranscriptionResponse:
    """Turn transcription segments into a ``TranscriptionResponse``.

    ``data`` is iterated exactly once, so a one-shot generator is fine; the
    function only loops over it, so any iterable of segment objects works.
    Each item must expose ``id``, ``text``, ``start`` and ``end`` attributes.

    Args:
        data: Iterable of segment objects.

    Returns:
        A ``TranscriptionResponse`` whose ``segments`` hold one
        ``TranscriptionSegment`` per input item (in input order) and whose
        ``text`` is all segment texts joined by a single space.
    """
    # Single pass: pair each validated segment model with its raw text.
    pairs = [
        (
            TranscriptionSegment(
                id=item.id,
                text=item.text,
                timestamp=(item.start, item.end),
            ),
            item.text,
        )
        for item in data
    ]

    return TranscriptionResponse(
        text=" ".join(raw_text for _, raw_text in pairs),
        segments=[parsed for parsed, _ in pairs],
    )

In [None]:
# Build the structured TranscriptionResponse from the transcription segments.
results = parse_transcription(segments)

In [None]:
# Bare expression on the last line so the notebook displays the parsed response.
results