## Dataset preparation

In [23]:
import datasets
from datasets import load_dataset

from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor

import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union, Optional,Tuple

from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperForConditionalGeneration

In [24]:
 # Load json files for test split
# Map each split name to its JSON manifest; only the test split is used here.
data_files = dict(test="dataset/CodeSwitched_Data/test.json")
dataset = load_dataset("json", data_files=data_files)

# Update the audio paths to include appropriate folder-name
def prepend_folder_name(row):
    """Prefix the row's ``audio`` path with the dataset folder and return the same row."""
    relative_path = row["audio"]
    row["audio"] = 'dataset/CodeSwitched_Data/' + relative_path
    return row
# Rewrite the audio path of every example in every split.
for key in dataset.keys():
    dataset[key] = dataset[key].map(function=prepend_folder_name)

# Cast columns to appropriate features
features = datasets.Features(
    id=datasets.Value("string"),
    transcription=datasets.Value("string"),
    audio=datasets.Audio(sampling_rate=16000),
)
dataset = dataset.map(function=features.encode_example, features=features)

In [25]:
# Load necessary processors
# All three come from the "openai/whisper-small" checkpoint, configured for Hindi transcription.
# `tokenizer` is the one used by `prepare_dataset` below; `feature_extractor` and `processor`
# are not referenced by the other cells visible in this notebook.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Hindi", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [26]:
def prepare_dataset(batch):
    """Add a ``labels`` entry holding the token ids of the example's transcription.

    The example is modified in place and returned. Relies on the module-level
    `tokenizer`.
    """
    # The audio entry is read first (as before); its value is not used in this function.
    _audio = batch["audio"]
    # encode target text to label ids
    encoded = tokenizer(batch["transcription"])
    batch["labels"] = encoded.input_ids
    return batch
# Tokenize every transcription; `num_proc=2` runs the map with two worker processes.
dataset = dataset.map(prepare_dataset, num_proc=2)

In [27]:
# Select CUDA device index 4 as the current CUDA device.
# NOTE(review): the index is hard-coded; this will fail on a host without a GPU at index 4.
torch.cuda.set_device(4) 

## Task 2.1: Zero-shot Whisper greedy decoding

> In this task, you will run a standard greedy decoding of the test utterances using the pretrained Whisper small model.



In [28]:
import json
import numpy as np
from evaluate import load
import whisper
import tqdm

In [29]:
# Device handle used by `sample_batch` to place the sampled token tensor.
# NOTE(review): hard-coded to GPU index 4, matching `torch.cuda.set_device(4)` above.
device = torch.device("cuda:4")

## Task 2.2: Constrained Filtering-based Greedy Decoding

In [30]:
from whisper.decoding import DecodingOptions, DecodingResult, DecodingTask, GreedyDecoder, TokenDecoder, Inference
from dataclasses import dataclass, field, replace
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import numpy as np
import torch.nn.functional as F
from torch import Tensor
from torch.distributions import Categorical

from whisper.audio import CHUNK_LENGTH
from whisper.tokenizer import Tokenizer, get_tokenizer
from whisper.utils import compression_ratio

if TYPE_CHECKING:
    from whisper.model import Whisper

In [31]:
"""
TODO: Define helper functions to perform constrained filtering here, if needed.
"""
import torch
from typing import Tuple

def sample_batch(logits: torch.Tensor, N: int = 10, p: float = 0.9) -> torch.Tensor:
    """
    Sample the next token for every example within a batch, using constrained filtering.

    For each row of `logits`, the candidates are restricted to the `N` most probable
    tokens (top-k) and then to the shortest prefix of those whose cumulative
    probability reaches `p` (nucleus). One token is sampled per row from the
    remaining candidates, in proportion to their probabilities.

    Args:
        logits: Unnormalised scores of shape (batch, vocab). A 1-D tensor is treated
            as a batch of one.
        N: Maximum number of candidates kept per row (clamped to the vocabulary size).
        p: Cumulative probability mass at which the candidate list is cut off.

    Returns:
        An int64 tensor of shape (batch,) with the sampled token ids, on the same
        device as `logits`.
    """
    if logits.dim() == 1:
        logits = logits.unsqueeze(0)

    # Normalise over the vocabulary axis, in float32 so fp16 logits do not lose mass.
    probs = torch.softmax(logits.float(), dim=-1)

    k = max(1, min(N, probs.shape[-1]))
    top_probs, top_indices = torch.topk(probs, k, dim=-1)  # sorted in descending order

    # Probability mass accumulated *before* each candidate. A candidate is kept only
    # while that mass is still below `p`, so the candidate that crosses the
    # threshold is the last one kept.
    cumulative = torch.cumsum(top_probs, dim=-1)
    mass_before = torch.cat(
        [torch.zeros_like(cumulative[..., :1]), cumulative[..., :-1]], dim=-1
    )
    keep = mass_before < p
    keep[..., 0] = True  # always keep the most probable token, even if p <= 0

    filtered = top_probs * keep
    # Categorical renormalises the remaining probabilities row by row.
    choice = torch.distributions.Categorical(probs=filtered).sample()

    return top_indices.gather(-1, choice.unsqueeze(-1)).squeeze(-1)


In [32]:
import os
import traceback
import warnings

from whisper.audio import (
    FRAMES_PER_SECOND,
    HOP_LENGTH,
    N_FRAMES,
    N_SAMPLES,
    SAMPLE_RATE,
    log_mel_spectrogram,
    pad_or_trim,
)
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
from whisper.utils import (
    exact_div,
    format_timestamp,
    get_writer,
    make_safe,
    optional_float,
    optional_int,
    str2bool,
)

In [33]:
"""
Custom classes to integrate constrained filtering within greedy decoding.
"""

class CustomGreedyDecoder(GreedyDecoder):
    """
    Variant of whisper's `GreedyDecoder` that picks the next token with
    `sample_batch` (constrained filtering).
    """

    def __init__(self, temperature: float, eot: int):
        super().__init__(temperature, eot)

    def update(
        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Tensor, bool]:
        """
        Append one token per sequence to `tokens`, chosen by constrained filtering.

        `sum_logprobs` is updated in place with the log-probability of the chosen
        token for sequences that have not yet ended. Returns the extended token
        tensor and whether every sequence now ends with the end-of-text token.
        """
        last_tokens = tokens[:, -1]
        picked = sample_batch(logits)

        # Log-probability of the picked token in each row.
        all_logprobs = F.log_softmax(logits.float(), dim=-1)
        rows = torch.arange(all_logprobs.shape[0])
        picked_logprobs = all_logprobs[rows, picked]
        # Sequences that already ended contribute nothing further to their score.
        sum_logprobs += picked_logprobs * (last_tokens != self.eot)

        # Once a sequence has emitted end-of-text it keeps emitting end-of-text.
        picked[last_tokens == self.eot] = self.eot
        tokens = torch.cat([tokens, picked[:, None]], dim=-1)

        return tokens, (tokens[:, -1] == self.eot).all()

class CustomDecodingTask(DecodingTask):
    """
    `DecodingTask` from whisper with its decoder swapped for `CustomGreedyDecoder`.

    NOTE: a later cell of this notebook defines another class with this same name;
    whichever definition ran last is the one in effect.
    """

    def __init__(self, model: "Whisper", options: DecodingOptions):
        super().__init__(model, options)
        # Replace the decoder chosen by the base class with the constrained-filtering one.
        temperature, eot = self.options.temperature, self.tokenizer.eot
        self.decoder = CustomGreedyDecoder(temperature, eot)
@torch.no_grad()
def custom_decode(
    model: torch.nn.Module,
    mel: torch.Tensor,
    options: DecodingOptions = DecodingOptions(),

    **kwargs,
) -> Union[DecodingResult, List[DecodingResult]]:
    """
    decode function to perform constrained_filtering based greedy decoding

    Args:
        model: The Whisper model handed to `CustomDecodingTask`.
        mel: Mel spectrogram, either 2-D (a single segment) or already batched (3-D).
        options: Decoding options; any `kwargs` override individual fields.

    Returns:
        A single `DecodingResult` when `mel` was 2-D, otherwise the list of results
        for the whole batch.
    """
    # Remember whether a single (unbatched) segment was passed in. Storing `mel.ndim`
    # itself here would always be truthy and drop all but the first batched result.
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    if kwargs:
        options = replace(options, **kwargs)

    result = CustomDecodingTask(model, options).run(mel)

    return result[0] if single else result

def custom_transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    condition_on_previous_text: bool = True,
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
    **decode_options,
):
    """
    Transcribe `audio` window by window using `custom_decode`.

    The audio is converted to a log-mel spectrogram, walked in windows of
    `N_FRAMES` frames, and each window is decoded once at temperature 0.0.
    Decoded tokens are split into segments at timestamp tokens, and the tokens
    accumulated so far are passed as the prompt for the next window.

    Args:
        model: Whisper model used for decoding.
        audio: Input accepted by `log_mel_spectrogram`.
        compression_ratio_threshold, logprob_threshold, prepend_punctuations,
            append_punctuations: accepted in the signature but not referenced
            in this function's body.
        condition_on_previous_text: When False, the prompt is reset after every window.
        **decode_options: Forwarded to `DecodingOptions`. Must contain "language"
            (it is read with `decode_options["language"]`); "task" defaults to
            "transcribe"; "fp16" defaults to True; "best_of" is discarded.

    Returns:
        dict with keys "text" (decoded from all kept tokens), "segments" (list of
        per-segment dicts) and "language".
    """
    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
    if model.device == torch.device("cpu"):
        if torch.cuda.is_available():
            warnings.warn("Performing inference on CPU when CUDA is available")
        if dtype == torch.float16:
            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
            dtype = torch.float32

    if dtype == torch.float32:
        decode_options["fp16"] = False

    # Pad 30-seconds of silence to the input audio, for slicing
    mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)
    # Number of frames excluding one N_FRAMES window's worth at the end.
    content_frames = mel.shape[-1] - N_FRAMES

    # "language" is mandatory here: a missing key raises KeyError.
    language: str = decode_options["language"]
    task: str = decode_options.get("task", "transcribe")
    tokenizer = get_tokenizer(
        model.is_multilingual,
        num_languages=model.num_languages,
        language=language,
        task=task,
    )
    
    def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
        # Despite the name, this performs a single decode at temperature 0.0;
        # there is no retry at other temperatures.
        decode_result = None

        temp = 0.0
        kwargs = {**decode_options}
        kwargs.pop("best_of", None)

        options = DecodingOptions(**kwargs, temperature=temp)
        decode_result = custom_decode(model, segment, options)

        return decode_result

    seek = 0
    input_stride = exact_div(
        N_FRAMES, model.dims.n_audio_ctx
    )  # mel frames per output token: 2
    time_precision = (
        input_stride * HOP_LENGTH / SAMPLE_RATE
    )  # time per output token: 0.02 (seconds)
    all_tokens = []
    all_segments = []

    prompt_reset_since = 0
    # Not referenced again in this function.
    initial_prompt_tokens = []

    def new_segment(
        *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
    ):
        # Build one segment record; "seek" is read from the enclosing loop's current value.
        tokens = tokens.tolist()
        # Only ids below `tokenizer.eot` are decoded into the segment text.
        text_tokens = [token for token in tokens if token < tokenizer.eot]
        return {
            "seek": seek,
            "start": start,
            "end": end,
            "text": tokenizer.decode(text_tokens),
            "tokens": tokens,
            "temperature": result.temperature,
            "avg_logprob": result.avg_logprob,
            "compression_ratio": result.compression_ratio,
            "no_speech_prob": result.no_speech_prob,
        }

    # Not referenced again in this function.
    last_speech_timestamp = 0.0
    while seek < content_frames:
        time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
        mel_segment = mel[:, seek : seek + N_FRAMES]
        segment_size = min(N_FRAMES, content_frames - seek)
        segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
        mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)

        # Condition this window on the tokens kept since the last prompt reset.
        decode_options["prompt"] = all_tokens[prompt_reset_since:]
        result: DecodingResult = decode_with_fallback(mel_segment)
        tokens = torch.tensor(result.tokens)

        # Not referenced again in this function.
        previous_seek = seek
        current_segments = []

        # Boolean mask of positions holding timestamp tokens.
        timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
        single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]

        # Indices just after the first token of each pair of adjacent timestamp tokens.
        consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
        consecutive.add_(1)
        if len(consecutive) > 0:
            # if the output contains two consecutive timestamp tokens
            slices = consecutive.tolist()
            if single_timestamp_ending:
                slices.append(len(tokens))

            last_slice = 0
            for current_slice in slices:
                sliced_tokens = tokens[last_slice:current_slice]
                start_timestamp_pos = (
                    sliced_tokens[0].item() - tokenizer.timestamp_begin
                )
                end_timestamp_pos = (
                    sliced_tokens[-1].item() - tokenizer.timestamp_begin
                )
                current_segments.append(
                    new_segment(
                        start=time_offset + start_timestamp_pos * time_precision,
                        end=time_offset + end_timestamp_pos * time_precision,
                        tokens=sliced_tokens,
                        result=result,
                    )
                )
                last_slice = current_slice

            if single_timestamp_ending:
                # single timestamp at the end means no speech after the last timestamp.
                seek += segment_size
            else:
                # otherwise, ignore the unfinished segment and seek to the last timestamp
                last_timestamp_pos = (
                    tokens[last_slice - 1].item() - tokenizer.timestamp_begin
                )
                seek += last_timestamp_pos * input_stride
        else:
            duration = segment_duration
            timestamps = tokens[timestamp_tokens.nonzero().flatten()]
            if (
                len(timestamps) > 0
                and timestamps[-1].item() != tokenizer.timestamp_begin
            ):
                # no consecutive timestamps but it has a timestamp; use the last one.
                last_timestamp_pos = (
                    timestamps[-1].item() - tokenizer.timestamp_begin
                )
                duration = last_timestamp_pos * time_precision

            current_segments.append(
                new_segment(
                    start=time_offset,
                    end=time_offset + duration,
                    tokens=tokens,
                    result=result,
                )
            )
            seek += segment_size

        # if a segment is instantaneous or does not contain text, clear it
        for i, segment in enumerate(current_segments):
            if segment["start"] == segment["end"] or segment["text"].strip() == "":
                segment["text"] = ""
                segment["tokens"] = []
                segment["words"] = []

        # Number segments continuously across windows.
        all_segments.extend(
            [
                {"id": i, **segment}
                for i, segment in enumerate(
                    current_segments, start=len(all_segments)
                )
            ]
        )
        all_tokens.extend(
            [token for segment in current_segments for token in segment["tokens"]]
        )

        # NOTE(review): decoding above always requests temperature 0.0, so the temperature
        # test presumably never triggers; confirm against `DecodingResult.temperature`.
        if not condition_on_previous_text or result.temperature > 0.5:
              # do not feed the prompt tokens if a high temperature was used
              prompt_reset_since = len(all_tokens)

    return dict(
        text=tokenizer.decode(all_tokens),
        segments=all_segments,
        language=language,
    )

In [34]:
import re

# Function to ignore tags in transcripts
def remove_tags(input_string):
    """Return `input_string` with every ``<...>`` tag (at least one character inside) removed."""
    return re.compile(r'<[^>]+>').sub('', input_string)

## Task 2.4: Constrained Beam Search Decoding on finetuned Whisper model



In [35]:
# Load the fine-tuned checkpoint from a local file in the working directory.
# `model` is read as a global by `is_english_token` and passed to `inferencer` below.
model = whisper.load_model("whisper-small-finetuned.pt")

In [36]:
# def decode_sequence(sequence):
#     tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages, language="en",task="transcribe",)
#     decoded = tokenizer.decode(sequence)
    
    # if decoded==None:

In [37]:
def is_english_token(token): # token -> index of the token
    """
    Return True when the token decodes to a non-empty string of ASCII letters only.

    The token id is decoded on its own with the tokenizer built from the global
    `model`. Any other character in the decoded text (a space, digit, punctuation
    or non-ASCII letter) makes the token non-English; so does an empty or missing
    decode result.
    """
    tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages, language="en",task="transcribe",)
    decoded = tokenizer.decode([token])
    # Covers both None and the empty string, and must run before any string method is used.
    if not decoded:
        return False
    # isascii() + isalpha() together accept exactly the characters A-Z and a-z.
    return decoded.isascii() and decoded.isalpha()

In [38]:
def is_english_list(seq):
    """Return True if any token after the first three entries of `seq` is an English token."""
    return any(is_english_token(token) for token in seq[3:])

In [39]:
# Ids in range(51865) that `is_english_token` accepts, in ascending order.
english_tokens = list(filter(is_english_token, range(51865)))

In [40]:
from whisper.decoding import BeamSearchDecoder

"""
Custom classes to integrate constraints within beam search decoding.
"""
class CustomBeamSearchDecoder(BeamSearchDecoder):
    """
    Updates the existing `BeamSearchDecoder` class from whisper to use constrained beam search.

    The beams are split in two halves:
      * beams ``[0, beam_size // 2)`` are "general" beams, extended with the best tokens overall;
      * beams ``[beam_size // 2, beam_size)`` are "English" beams, whose hypotheses contain
        at least one token from the global `english_tokens` list.

    Only English hypotheses are ever recorded as finished.

    NOTE: `update` handles a single audio per call: it indexes the rows of `tokens`
    and `logits` directly by beam index and tracks one finished-sequence dict.
    """
    def __init__(
        self,
        beam_size: int,
        eot: int,
        inference: Inference,
        cutoff: int = 3,
        patience: Optional[float] = None,
    ):
        """
        Args:
            beam_size: Total number of beams (general + English halves).
            eot: End-of-text token id.
            inference: Inference object whose KV cache is rearranged after each update.
            cutoff: Accepted for interface compatibility; not used by this class.
            patience: Forwarded to the base class.
        """
        super().__init__(beam_size, eot, inference, patience)
        # Stop once this many English hypotheses have finished (overrides the base-class value).
        self.max_candidates = beam_size // 2
        # False until the first `update` call has run; on that first call the English
        # half of the beams is not expanded.
        self.hahaha=False

    def reset(self):
        """Clear the per-utterance state so the decoder can be reused for another run."""
        super().reset()
        self.hahaha = False

    def update(
        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
    ) -> Tuple[Tensor, bool]:
        """
        Extend every beam by one token using constrained beam search.

        Candidates are collected in two pools: `scores` (general continuations of the
        general beams) and `eng_scores` (English-token continuations of the general
        beams plus any continuation of the English beams). The best general candidates
        fill the first half of the new beams and the best English candidates fill the
        second half. `sum_logprobs` is overwritten in place with the new beam scores.

        Refer to https://github.com/openai/whisper/blob/main/whisper/decoding.py#L301-L404
        for the original update function.

        Returns:
            The new token tensor (one row per kept beam) and whether enough English
            hypotheses have finished.
        """
        if self.finished_sequences is None:  # for the first update
            self.finished_sequences = [{}]

        half = self.beam_size // 2
        logprobs = F.log_softmax(logits.float(), dim=-1)
        next_tokens, source_indices, finished_sequences = [], [], []

        scores, sources, finished = {}, {}, {}  # general candidates
        eng_scores, eng_sources = {}, {}  # English candidates

        ## For the case of general_beam
        for idx in range(half):
            prefix = tokens[idx].tolist()

            # Best continuations restricted to English tokens. `topk` runs over the
            # English subset, so its indices are mapped back through `english_tokens`.
            for logprob, token in zip(*(logprobs[idx][english_tokens]).topk(half + 1)):
                new_logprob = (sum_logprobs[idx] + logprob).item()
                eng_sequence = tuple(prefix + [english_tokens[token.item()]])
                eng_scores[eng_sequence] = new_logprob
                eng_sources[eng_sequence] = idx

            # Best unrestricted continuations.
            for logprob, token in zip(*logprobs[idx].topk(half + 1)):
                new_logprob = (sum_logprobs[idx] + logprob).item()
                sequence = tuple(prefix + [token.item()])
                scores[sequence] = new_logprob
                sources[sequence] = idx

        ## For the case of english beam
        for idx in range(half, self.beam_size):
            prefix = tokens[idx].tolist()

            # On the very first update the English half is skipped entirely.
            if not self.hahaha:
                self.hahaha=True
                break
            for logprob, token in zip(*logprobs[idx].topk(half + 1)):
                new_logprob = (sum_logprobs[idx] + logprob).item()
                eng_sequence = tuple(prefix + [token.item()])
                eng_scores[eng_sequence] = new_logprob
                eng_sources[eng_sequence] = idx

        # STEP 2: rank the candidates and keep the top beam_size sequences for each audio
        # Sanity check: every English candidate must contain an English token.
        for k in eng_scores.keys():
            assert(is_english_list(k))

        # First half of the new beams: best general candidates. Candidates ending in
        # end-of-text are dropped here, so general beams never finish.
        saved = 0
        for sequence in sorted(scores, key=scores.get, reverse=True):
            if sequence[-1]==self.eot:
                continue
            sum_logprobs[saved] = scores[sequence]
            next_tokens.append(sequence)
            source_indices.append(sources[sequence])

            saved += 1
            if saved == half:
                break

        # Second half: best English candidates; those ending in end-of-text are finished.
        for eng_sequence in sorted(eng_scores, key=eng_scores.get, reverse=True):
            if eng_sequence[-1] == self.eot:
                finished[eng_sequence] = eng_scores[eng_sequence]
            else:
                sum_logprobs[saved] = eng_scores[eng_sequence]
                next_tokens.append(eng_sequence)
                source_indices.append(eng_sources[eng_sequence])

                saved += 1
                if saved == self.beam_size :
                    break

        finished_sequences.append(finished)

        tokens = torch.tensor(next_tokens, device=tokens.device)
        self.inference.rearrange_kv_cache(source_indices)

        # add newly finished sequences to self.finished_sequences
        assert len(self.finished_sequences) == len(finished_sequences)
        for previously_finished, newly_finished in zip(
            self.finished_sequences, finished_sequences
        ):
            for seq in sorted(newly_finished, key=newly_finished.get, reverse=True):
                if len(previously_finished) >= self.max_candidates:
                    break  # the candidate list is full
                previously_finished[seq] = newly_finished[seq]

        # mark as completed if all audio has enough number of samples
        completed = all(
            len(sequences) >= self.max_candidates
            for sequences in self.finished_sequences
        )
        return tokens, completed

    def finalize(self, preceding_tokens, sum_logprobs):
        """
        Collect the finished hypotheses, topping up from unfinished English beams.

        When fewer than ``beam_size // 2`` hypotheses have finished for an audio, the
        highest-scoring beams from the English half are closed with end-of-text and
        added until that many are available.

        Refer to https://github.com/openai/whisper/blob/main/whisper/decoding.py#L301-L404
        for the original finalize function.

        Returns:
            A pair ``(tokens, sum_logprobs)``: per audio, the list of hypothesis token
            tensors and the list of their scores.
        """
        sum_logprobs = sum_logprobs.cpu()
        english_start = self.beam_size // 2

        for i, sequences in enumerate(self.finished_sequences):
            if (
                len(sequences) < english_start
            ):  # when not enough sequences are finished
                english_scores = sum_logprobs[i][english_start:]
                for j in list(np.argsort(english_scores))[::-1]:
                    # `j` is relative to the English half; shift it back to an absolute
                    # beam index before looking up tokens and scores.
                    beam = english_start + int(j)
                    sequence = preceding_tokens[i, beam].tolist() + [self.eot]
                    sequences[tuple(sequence)] = sum_logprobs[i][beam].item()
                    if len(sequences) >= english_start:
                        break

        tokens: List[List[Tensor]] = [
            [torch.tensor(seq) for seq in sequences.keys()]
            for sequences in self.finished_sequences
        ]
        sum_logprobs: List[List[float]] = [
            list(sequences.values()) for sequences in self.finished_sequences
        ]
        return tokens, sum_logprobs

class CustomDecodingTask(DecodingTask):
    """
    Updates the existing `DecodingTask` class from whisper to use constrained beam search.

    The requested beam size is doubled so that `CustomBeamSearchDecoder` can split
    the beams into a general half and an English half.
    """
    def __init__(self, model: "Whisper", options: DecodingOptions):
        """
        Raises:
            ValueError: if `options.beam_size` is not set; constrained beam search
                cannot run without a beam size.
        """
        if options.beam_size is None:
            raise ValueError(
                "CustomDecodingTask requires `beam_size` to be set for constrained beam search"
            )
        options = replace(options, beam_size=2 * options.beam_size)
        super().__init__(model, options)
        # `patience` is passed by keyword: the decoder's fourth positional
        # parameter is `cutoff`, not `patience`.
        self.decoder = CustomBeamSearchDecoder(
            self.options.beam_size,
            self.tokenizer.eot,
            self.inference,
            patience=self.options.patience,
        )

@torch.no_grad()
def custom_decode(
    model: torch.nn.Module,
    mel: torch.Tensor,
    options: DecodingOptions = DecodingOptions(),
    **kwargs,
) -> Union[DecodingResult, List[DecodingResult]]:
    """
    decode function to perform constrained beam search decoding

    Args:
        model: The Whisper model handed to `CustomDecodingTask`.
        mel: Mel spectrogram, either 2-D (a single segment) or already batched (3-D).
        options: Decoding options; any `kwargs` override individual fields.

    Returns:
        A single `DecodingResult` when `mel` was 2-D, otherwise the list of results
        for the whole batch.
    """
    # Remember whether a single (unbatched) segment was passed in. Storing `mel.ndim`
    # itself here would always be truthy and drop all but the first batched result.
    single = mel.ndim == 2
    if single:
        mel = mel.unsqueeze(0)

    if kwargs:
        options = replace(options, **kwargs)

    result = CustomDecodingTask(model, options).run(mel)

    return result[0] if single else result


In [41]:
def inferencer(json_file_path, model, dataset, beam_size):
    """
    Perform inference on every example within `dataset` using `model` and save it in `json_file_path`.

    Args:
        json_file_path: Destination of the JSON report.
        model: Whisper model passed to `custom_transcribe`.
        dataset: Iterable of examples with "id", "transcription" and an "audio" dict
            holding an "array" entry.
        beam_size: Beam size forwarded to the decoder.

    The report is a list whose first element holds the corpus-level "cer" and "wer",
    followed by one record per example (id, ground truth, prediction).
    """
    # Load the metrics here, before the long decoding loop, so the function does not
    # depend on `cer`/`wer` globals left behind by some other cell and a missing
    # metric fails immediately instead of after all the inference work.
    cer_metric = load("cer")
    wer_metric = load("wer")

    predictions = []
    ground_truths = []
    to_json = []
    for item in tqdm.tqdm(dataset):
        ground_truth = item['transcription']
        waveform = torch.tensor(item['audio']['array'].astype(np.float32))
        prediction = custom_transcribe(model=model, audio=waveform, language="hi", beam_size=beam_size)['text']
        print(prediction)

        ground_truths.append(ground_truth)
        predictions.append(prediction)

        to_json.append({
            'id': item['id'],
            'ground_truth': ground_truth,
            'prediction': prediction
        })

    cer_ = cer_metric.compute(predictions=predictions, references=ground_truths)
    wer_ = wer_metric.compute(predictions=predictions, references=ground_truths)

    to_json = [{'cer':cer_, 'wer': wer_}] + to_json

    # Write the report as UTF-8 and keep non-ASCII text unescaped so the Devanagari
    # transcripts stay readable in the file.
    with open(json_file_path, 'w', encoding="utf-8") as json_file:
        json.dump(to_json, json_file, indent=4, ensure_ascii=False)

In [42]:
# Run constrained beam search over the test split and write the report next to the notebook.
# `beam_size=4` is the requested size; the beam-search `CustomDecodingTask` doubles it internally.
json_file_path = "./fine_tuned_whisper_with_constrained_beam_search.json"
inferencer(json_file_path, model, dataset['test'], beam_size=4)

  2%|▏         | 1/61 [00:06<06:37,  6.62s/it]

सुसमा सलोल कर अम्रता तीवारी अपन्ना तीवारी पंकस देस्पांडे वाशकेट बाल रेखा चौद्री लख्चमी जाता व सहीद अलीnaudible


  3%|▎         | 2/61 [00:10<04:55,  5.01s/it]

हां श्वर वाठ नौर्ट में पूचूंगी उनसे के ऐसा क्यों नो नहीं का हांnaudible


  5%|▍         | 3/61 [00:15<05:02,  5.22s/it]

हाँ मुझे office हे time न मिल पाता है। अपस में इतना काम चला भी नया project चला है तो उसके चकर में मुझे time भी पाता है।pread


  7%|▋         | 4/61 [00:23<06:00,  6.32s/it]

उठक बैठक कर रहे है और जमीन से कुछ उठा रहे हैं। याप को लबस लगा रहे हैं तो याठ मैटर थी है नाल तो वो जो कहीं चढ़डन है अगै आपने हूँphr


  8%|▊         | 5/61 [00:33<07:00,  7.51s/it]

headventer थीम भ्या आप वहापे कर सकते हैं तो वो भी बहुत अच्छा है देखने के लिए और उसके बाद मेरा जो अभी का जो plan चल रहा हैं वो पौंडी चरिका प्लेन�


 10%|▉         | 6/61 [00:49<09:22, 10.22s/it]

फिर हमने एक राम अंदिर है वहाँ पे भी गये थे वहाँ पे भी अच्छा मंदिर हैं औड़ो कीचने के लिए देखने के लिए शान पेलेस हैं अच्छी जगहलागी है वो भी हमने चार पाच जगह है वहाँ पे अम्रसर में visit कीfal वी जिट करने के बाद हम्से�


 11%|█▏        | 7/61 [00:53<07:29,  8.33s/it]

और जो दिली free कर रहे हैं शूज अगर वो रही सरी के सारेलेवंडishूज फैव कर रहे हैं अःःँःँःँग


 13%|█▎        | 8/61 [01:04<08:13,  9.31s/it]

वारकिये दर्वाई है तो सब लोग market से लेकरे खालते हैं क्या कितना time न आप वहां आप बादत करने गया वो तो आप करोगे वहाँ फिर होतेल हैं से आपका मदीने जाओगे फिर वहां आप time pass पेन करोगे। वहाँarrator


 15%|█▍        | 9/61 [01:13<07:58,  9.19s/it]

हाँ उस्घी जो सुम चंधा में जो आरसत की महार है नहती है क्या तो अच्छी अटी करी मैं मैं कुछ लगने मैं मिलू है उनको एक हाक दूं अच्छी से कास की घुम पसर कर करarrator


 16%|█▋        | 10/61 [01:25<08:28,  9.98s/it]

तो था l अपनेप्टे प्रेप्पेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेपेquels


 18%|█▊        | 11/61 [01:30<06:57,  8.35s/it]

हाँ हाँ दिल तो पागल है picture का नाम है आप चाहों तो देखो देखो में को याआँ आ गया नाम larrator


 20%|█▉        | 12/61 [01:40<07:16,  8.91s/it]

 वहाँ पाए जाने के बास एक कमतलब अजी वजैसे वो होता है जैसे कि कहाम हाँ गय कितने वड़े सक्से हमें किसी मतलब आजा के सम्राज में आगया ऐसा मतलब फेलेंग आती है कि कि समवाज जम मेंarrator


 21%|██▏       | 13/61 [01:53<08:08, 10.18s/it]

और अगर कोई और चीजो की जरुध है जैसे कुछ चाइए है आपको खाना बनाई ये वो दूसरा आपको boat के अंदर कुछ घख दिख रही हैं boat के अंदर गगु काम है तो एक boat के साथ एक आपनी होता है जो boat की साब सपायराक रखा का काम दरता है।uetooth


 23%|██▎       | 14/61 [02:01<07:32,  9.63s/it]

मैं यागा उसक अच्छिन pilot को ही लोग सा रही थे की वो ज्युवा नेता है अच्छा काम करेंगे ज्युवा है मतलब गराम पून है तो उस से अपसे काम करेंगेarrator


 25%|██▍       | 15/61 [02:08<06:41,  8.73s/it]

जाँडी लिवा वो भी अच्छा वो क्या ना में उसका वो जो चुप के चुपके movie में था राजपार्यादा वो बहुत ही Comedy करता है।iffany


 26%|██▌       | 16/61 [02:14<05:51,  7.81s/it]

 कया किं ये है भो पाल के प्राणे खरे घुर सवारी ये रहने वाले भो पाल के अनाम बासित shooting में ये भो पाल केarrator


 28%|██▊       | 17/61 [02:21<05:37,  7.67s/it]

उसमे उज्रुक आद्मी भी है बच्चे भी है औरager अपनी ठी नीजर भी है अपने को समझ में आ रहा है क्या उनके लाय्तना कैसर रहे है वो�


 30%|██▉       | 18/61 [02:29<05:34,  7.79s/it]

सही हाँ सही हाँ पारे लेकिन हमारे यहाँ भी कुछ university भी अच्छी है l पारे लिए किन हमारे यहाँ भी कुछ university भी अच्छी है l पारे लिए कुचाँ कुचाँarrator


 31%|███       | 19/61 [02:34<04:55,  7.04s/it]

वैसें private में हम private बांको में भी लग सकते हैं वहाँ पे interview देखर या कुछ भी करकर हम private बांक में भी लग सकते हैं।arrator



 33%|███▎      | 20/61 [02:42<05:33,  8.13s/it]


KeyboardInterrupt: 

In [None]:
# Show the first five lines of the saved report (shell command run through IPython's `!`).
!head -n 5 ./fine_tuned_whisper_with_constrained_beam_search.json