In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
from huggingface_hub import login

In [3]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from datasets import load_dataset

ds = load_dataset("iTzMiNOS/voice-orders-small-clean-12k")

In [23]:
ds

DatasetDict({
    train: Dataset({
        features: ['transcribed_text', 'speaker', 'items', 'count', 'audio'],
        num_rows: 12000
    })
})

In [24]:
ds["train"][0]

{'transcribed_text': "I'll have a Cream of Mushroom with truffle oil and chopped parsley, a Greek salad without protein, add grilled chicken and falafel, a vegan burger with mustard, ketchup, no cheese, vegan cheese, and vegan mayo, plus extra cheese on the side.",
 'speaker': 'af_bella',
 'items': [{'customizations': ['Truffle Oil', 'Chopped Parsley'],
   'name': 'Cream of Mushroom'},
  {'customizations': ['No Protein', 'Grilled Chicken', 'Falafel'],
   'name': 'Greek Salad'},
  {'customizations': ['Mustard',
    'Ketchup',
    'No Cheese',
    'Vegan Cheese',
    'Vegan Mayo'],
   'name': 'Vegan Burger'},
  {'customizations': [], 'name': 'Extra Cheese'}],
 'count': 4,
 'audio': {'path': 'response_0.wav',
  'array': array([-3.05175781e-05,  0.00000000e+00, -3.05175781e-05, ...,
         -3.05175781e-05,  0.00000000e+00,  0.00000000e+00]),
  'sampling_rate': 24000}}

In [25]:
!pip install torchaudio accelerate jiwer



In [26]:
from datasets import Audio

dataset = ds.remove_columns([col for col in ds["train"].column_names if col not in ["audio", "speaker", "transcribed_text"]])
dataset = dataset.rename_column("transcribed_text", "text")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'speaker', 'audio'],
        num_rows: 12000
    })
})

In [27]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

processor.tokenizer.set_prefix_tokens(language="english", task="transcribe")

In [11]:
def prepare_batch(batch):
    audio = batch["audio"]

    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt"
    )

    labels = processor.tokenizer(
        batch["text"],
        return_tensors="pt",
        padding="longest",
        truncation=True
    ).input_ids

    batch["input_features"] = inputs.input_features[0]
    batch["labels"] = labels[0]
    return batch

dataset = dataset.map(prepare_batch, remove_columns=dataset["train"].column_names, num_proc=1)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [28]:
# Assume 'dataset' is your initial DatasetDict, e.g., loaded via load_dataset
# Assume it contains a 'train' split like dataset['train']
# Assume 'processor' is already loaded (WhisperProcessor)
from datasets import DatasetDict

# --- Configuration ---
SPEAKER_COLUMN_NAME = "speaker"  # <<<--- IMPORTANT: Replace "speaker" with the actual name of your speaker column!
TEST_SET_SIZE = 0.2
RANDOM_SEED = 42
NUM_PROCESSING_CORES = 4 # Adjust based on your machine

# --- Step 1: Perform Stratified Split FIRST ---
print(f"Starting stratified split using column: '{SPEAKER_COLUMN_NAME}'...")
try:
    # Ensure the dataset is a DatasetDict and has a 'train' key
    if not isinstance(dataset, DatasetDict) or 'train' not in dataset:
        raise TypeError("Expected 'dataset' to be a DatasetDict with a 'train' split.")

    # Ensure the speaker column exists
    if SPEAKER_COLUMN_NAME not in dataset['train'].column_names:
        raise ValueError(
            f"Speaker column '{SPEAKER_COLUMN_NAME}' not found in dataset['train']. "
            f"Available columns: {dataset['train'].column_names}"
        )

    # Perform the split on the raw training data
    split_dataset = dataset['train'].train_test_split(
        test_size=TEST_SET_SIZE,
        seed=RANDOM_SEED,
        stratify_by_column=SPEAKER_COLUMN_NAME # Stratify using your speaker column
    )

    print("Stratified split successful.")
    print(f"  New Training set size: {len(split_dataset['train'])}")
    print(f"  New Test set size: {len(split_dataset['test'])}")

except Exception as e:
    print(f"Error during stratified split: {e}")
    # Handle the error appropriately, maybe exit or fall back to random split
    split_dataset = None # Or however you want to signal failure

# --- Step 2: Define your prepare_batch function (exactly as you provided) ---
def prepare_batch(batch):
    # Process audio - make sure 'audio' column contains {'array': ..., 'sampling_rate': ...}
    # Add error handling if your audio isn't pre-loaded this way
    try:
        audio = batch["audio"]
        if not isinstance(audio, dict) or "array" not in audio or "sampling_rate" not in audio:
             # If audio needs loading from path, do it here. Example placeholder:
             # raise ValueError(f"Audio data structure incorrect: {audio}")
             pass # Assuming format is correct for now

        inputs = processor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
            return_tensors="pt"
        )
    except Exception as e:
        print(f"Error processing audio in batch: {e}")
        # Decide how to handle: skip batch item, return None, raise error?
        # For now, let's add placeholders and return the batch potentially unmodified or partially processed
        batch["input_features"] = None # Indicate failure
        return batch


    # Process text - make sure 'text' is your transcript column name
    try:
        labels = processor.tokenizer(
            batch["text"], # Replace "text" if your transcript column is named differently
            return_tensors="pt",
            padding="longest", # Pad within the batch
            truncation=True,   # Truncate long sequences
            # max_length=processor.tokenizer.model_max_length # Optional: Explicit max length
        ).input_ids
    except Exception as e:
        print(f"Error processing text in batch: {e}")
        batch["labels"] = None # Indicate failure
        return batch

    # Assign processed features, removing the outer batch dimension added by processor/tokenizer
    batch["input_features"] = inputs.input_features[0] if inputs.input_features is not None else None
    batch["labels"] = labels[0] if labels is not None else None
    return batch


# --- Step 3: Apply prepare_batch to the *new* train and test splits ---
if split_dataset: # Only proceed if the split was successful
    print(f"\nApplying processing and removing columns from the new 'train' split...")
    # Important: Get column names *from the split* before mapping/removing them
    train_columns_to_remove = split_dataset['train'].column_names
    split_dataset['train'] = split_dataset['train'].map(
        prepare_batch,
        remove_columns=train_columns_to_remove,
        num_proc=NUM_PROCESSING_CORES
    )
    # Optional: Filter out samples where processing failed
    split_dataset['train'] = split_dataset['train'].filter(lambda example: example['input_features'] is not None and example['labels'] is not None)


    print(f"Applying processing and removing columns from the new 'test' split...")
    test_columns_to_remove = split_dataset['test'].column_names
    split_dataset['test'] = split_dataset['test'].map(
        prepare_batch,
        remove_columns=test_columns_to_remove,
        num_proc=NUM_PROCESSING_CORES
    )
    # Optional: Filter out samples where processing failed
    split_dataset['test'] = split_dataset['test'].filter(lambda example: example['input_features'] is not None and example['labels'] is not None)

    print("\nProcessing complete.")
    print("Final columns in train split:", split_dataset['train'].column_names)
    print("Final columns in test split:", split_dataset['test'].column_names)
    print("Example processed train sample:", split_dataset['train'][0])

else:
    print("\nSkipping processing because stratified split failed.")

Starting stratified split using column: 'speaker'...
Error during stratified split: Stratifying by column is only supported for ClassLabel column, and column speaker is Value.

Skipping processing because stratified split failed.


In [12]:
from dataclasses import dataclass
import torch
from typing import Any

@dataclass
class DataCollatorSpeechSeq2Seq:
    processor: Any

    def __call__(self, features):
        # Convert to tensor if not already
        input_features = torch.stack([
            torch.tensor(f["input_features"]) for f in features
        ])

        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(f["labels"]) for f in features],
            batch_first=True,
            padding_value=self.processor.tokenizer.pad_token_id
        )
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        return {"input_features": input_features, "labels": labels}

In [18]:
split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2400
    })
})

In [14]:
!pip install tensorboard
!pip install wandb



In [15]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

''' training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    warmup_steps=500,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    eval_strategy="no",
    fp16=False,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=DataCollatorSpeechSeq2Seq(processor),
)


trainer.train() '''

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=500,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    eval_strategy="no",
    fp16=True,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=DataCollatorSpeechSeq2Seq(processor),
)

trainer.train()


  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,1.3754
200,0.3233
300,0.0981
400,0.0379
500,0.0331
600,0.0347
700,0.0319
800,0.0223
900,0.019
1000,0.0186




KeyboardInterrupt: 

In [17]:
import evaluate # Import the evaluate library
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset # Assuming you load your dataset like this
from transformers.models.whisper.english_normalizer import BasicTextNormalizer # Optional: For text normalization
from dataclasses import dataclass # For the data collator
from typing import Any, Dict, List, Union # For the data collator
import torch # For the data collator

# --- Assume these are already loaded and prepared ---
# model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="<language>", task="transcribe")
# split_dataset = load_dataset(...) # Your preprocessed train/test splits
# -----------------------------------------------------

# --- Step 1: Load the WER metric ---
wer_metric = evaluate.load("wer")
# Optional: Load CER metric as well if needed
cer_metric = evaluate.load("cer")

# Optional: Normalizer for consistent text comparison (removes punctuation, lowers case)
# Adjust normalization based on your specific needs and dataset characteristics
normalizer = BasicTextNormalizer()

# --- Step 2: Define the compute_metrics function ---
def compute_metrics(pred):
    # pred object contains predicted `predictions` and ground truth `label_ids`
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 (ignore index) with pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True, normalize=True) # Use processor's normalization if desired
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True, normalize=True) # Use processor's normalization if desired

    # Optional: Apply additional normalization if needed (e.g., BasicTextNormalizer)
    # pred_str = [normalizer(s) for s in pred_str]
    # label_str = [normalizer(s) for s in label_str]

    # Calculate WER
    # Note: Filter out empty labels, which can cause issues with WER calculation
    # This might happen if an audio file is silent or fails preprocessing.
    filtered_pred_str = []
    filtered_label_str = []
    for pred_s, label_s in zip(pred_str, label_str):
        if len(label_s) > 0: # Only include pairs with non-empty labels
             filtered_pred_str.append(pred_s)
             filtered_label_str.append(label_s)

    if not filtered_label_str: # Handle case where all labels might be empty after filtering
        print("Warning: All labels were empty after filtering. Returning WER = 1.0")
        wer = 1.0
    else:
        wer = wer_metric.compute(predictions=filtered_pred_str, references=filtered_label_str)

    # Optional: Calculate CER
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    # Return metrics as a dictionary
    return {"wer": wer, "cer": cer}
    return {"wer": wer}

# --- Step 3: Modify Training Arguments ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-finetuned",
    per_device_train_batch_size=4, # Keep small for large models if memory is limited
    gradient_accumulation_steps=2,  # Effective batch size = 4 * 2 = 8
    learning_rate=1e-5,
    warmup_steps=500,
    num_train_epochs=3, # Or use max_steps for very large datasets
    # max_steps=4000, # Alternative to num_train_epochs

    # --- Evaluation Arguments ---
    eval_strategy="steps",      # Evaluate every `eval_steps`
    eval_steps=500,                   # Evaluate every 500 steps (align with save_steps if desired)

    # --- Logging Arguments ---
    logging_steps=100,                # Log metrics every 100 steps
    report_to=["tensorboard"],      # Log to TensorBoard (or ["wandb"] or ["tensorboard", "wandb"])

    # --- Saving Arguments ---
    save_steps=500,                   # Save checkpoint every 500 steps
    save_total_limit=2,               # Keep only the last 2 checkpoints
    load_best_model_at_end=True,      # Load the best model (based on metric) when training finishes
    metric_for_best_model="wer",      # Metric to determine the "best" model
    greater_is_better=False,          # Lower WER is better

    # --- Other Arguments ---
    fp16=True,                        # Use mixed precision (requires compatible GPU and CUDA setup)
    # push_to_hub=False,              # Set to True if you want to push to Hugging Face Hub
    # predict_with_generate=True,     # Necessary for Seq2Seq models during evaluation
)

# --- Step 4: Define the Data Collator ---
# (Ensure you have this defined correctly for your data)
@dataclass
class DataCollatorSpeechSeq2Seq:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have to be of different lengths
        # Process audio features
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Process labels
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Handle cases where decoder_input_ids are needed (not standard for Whisper fine-tuning with Trainer)
        # If the model has decoder_input_ids generation enabled (e.g., during generation),
        # it might be handled internally by the trainer/model. For standard Whisper training,
        # the labels are shifted internally to create decoder_input_ids.
        if (
            labels.shape[-1] == batch["input_features"].shape[-1]
            and hasattr(self.processor.feature_extractor, "config")
            and hasattr(self.processor.feature_extractor.config, "max_source_positions")
            and batch["input_features"].shape[-1] > self.processor.feature_extractor.config.max_source_positions
        ):
             # This condition might be specific to certain older examples/models,
             # Whisper generally handles the input length difference well.
             print("Warning: Input feature length might exceed max source positions.")
             pass # Or handle truncation if necessary


        batch["labels"] = labels

        return batch

# Instantiate the data collator
data_collator = DataCollatorSpeechSeq2Seq(processor=processor)


# --- Step 5: Initialize the Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"], # Use your test or validation split here
    tokenizer=processor.feature_extractor, # Pass feature_extractor for processing inputs
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Pass the metrics function
)

# --- Step 6: Run Training ---
trainer.train()

# --- Step 7: (Optional) Evaluate after training ---
# If you used load_best_model_at_end=True, the trainer.model is the best one.
results = trainer.evaluate()
print("Final Evaluation Results:", results)

# --- Step 8: Save the final best model & processor ---
trainer.save_model("./whisper-small-finetuned/best_model")
processor.save_pretrained("./whisper-small-finetuned/best_model")

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
test_model = model.from_pretrained("./whisper-small-finetuned/checkpoint-1500")

trainer = Seq2SeqTrainer(
    model=test_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=DataCollatorSpeechSeq2Seq(processor),
)

evaluation_results = trainer.evaluate(eval_dataset=split_dataset["test"])

print("Evaluation results:", evaluation_results)

  trainer = Seq2SeqTrainer(


Evaluation results: {'eval_loss': 0.0076557970605790615, 'eval_model_preparation_time': 0.0059, 'eval_runtime': 193.4273, 'eval_samples_per_second': 6.204, 'eval_steps_per_second': 0.775}


In [None]:
test_model = model.from_pretrained("./whisper-small-finetuned/checkpoint-1000")

trainer = Seq2SeqTrainer(
    model=test_model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=DataCollatorSpeechSeq2Seq(processor),
)



evaluation_results = trainer.evaluate(eval_dataset=split_dataset["test"])

print("Evaluation results:", evaluation_results)

  trainer = Seq2SeqTrainer(


Evaluation results: {'eval_loss': 0.01593630760908127, 'eval_model_preparation_time': 0.006, 'eval_runtime': 193.2076, 'eval_samples_per_second': 6.211, 'eval_steps_per_second': 0.776}


In [None]:
from transformers import GenerationConfig

# Create a NEW GenerationConfig without forced_decoder_ids
new_config = GenerationConfig.from_dict(model.generation_config.to_dict())
new_config.forced_decoder_ids = None  # Explicitly remove
new_config._from_model_config = False  # Important for newer versions

# Apply the new config
model.generation_config = new_config

# Verify removal
print("Updated generation config:", model.generation_config)

# Save everything
trainer.save_model("./whisper-small-finetuned")
processor.save_pretrained("./whisper-small-finetuned")

# Verify the saved model
from transformers import WhisperForConditionalGeneration
loaded_model = WhisperForConditionalGeneration.from_pretrained("./whisper-small-finetuned")
loaded_model.generation_config = new_config
print("Loaded model config:", loaded_model.generation_config)

Updated generation config: GenerationConfig {
  "alignment_heads": [
    [
      2,
      2
    ],
    [
      3,
      0
    ],
    [
      3,
      2
    ],
    [
      3,
      3
    ],
    [
      3,
      4
    ],
    [
      3,
      5
    ]
  ],
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "decoder_start_token_id": 50258,
  "eos_token_id": 50257,
  "is_multilingual": true,
  "lang_to_id": {
    "<|af|>": 50327,
    "<|am|>": 50334,
    "<|ar|>": 50272,
    "<|as|>": 50350,
    "<|az|>": 50304,
    "<|ba|>": 50355,
    "<|be|>": 50330,
    "<|bg|>": 50292,
    "<|bn|>": 50302,
    "<|bo|>": 50347,
    "<|br|>": 50309,
    "<|bs|>": 50315,
    "<|ca|>": 50270,
    "<|cs|>": 50283,
    "<|cy|>": 50297,
    "<|da|>": 50285,
    "<|de|>": 50261,
    "<|el|>": 50281,
    "<|en|>": 50259,
    "<|es|>": 50262,
    "<|et|>": 50307,
    "<|eu|>": 50310,
    "<|fa|>": 50300,
    "<|fi|>": 50277,
    "<|fo|>": 50338,
    "<|fr|>": 50265,
    "<|gl|>": 5031

In [None]:
import shutil

shutil.make_archive('./whisper-small-finetuned-tokenizer', 'zip', './whisper-small-finetuned')

'/content/whisper-small-finetuned-tokenizer.zip'

In [None]:
from google.colab import files

files.download('./whisper-small-finetuned-1000.zip')
files.download('./whisper-small-finetuned-1500.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import shutil

# Define the directory you want to delete
directory = './whisper-small-finetuned/checkpoint-1500'

# Remove all files inside the directory first
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # If it's a file, remove it
    if os.path.isfile(file_path):
        os.remove(file_path)

    # If it's a directory, remove it recursively
    elif os.path.isdir(file_path):
        shutil.rmtree(file_path)

# Finally, remove the empty directory
os.rmdir(directory)


In [None]:
from huggingface_hub import HfApi

repo_name = "iTzMiNOS/whisper-small-finetuned"

api = HfApi()

api.upload_folder(
    folder_path="./whisper-small-finetuned/checkpoint-1500",
    repo_id=repo_name,
    repo_type="model",
)

optimizer.pt:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

scaler.pt:   0%|          | 0.00/988 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iTzMiNOS/whisper-small-finetuned/commit/63169d41b7c70d7b7183fb2ab5e0ace9a75bae54', commit_message='Upload folder using huggingface_hub', commit_description='', oid='63169d41b7c70d7b7183fb2ab5e0ace9a75bae54', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iTzMiNOS/whisper-small-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='iTzMiNOS/whisper-small-finetuned'), pr_revision=None, pr_num=None)

In [None]:
from IPython.display import display, Javascript
import torch
import librosa
from google.colab import files
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# 1. Record Audio in Colab
def record_audio():
    display(Javascript('''
        async function record() {
            var stream = await navigator.mediaDevices.getUserMedia({audio: true});
            var recorder = new MediaRecorder(stream);
            var audioChunks = [];
            recorder.ondataavailable = event => audioChunks.push(event.data);
            recorder.onstop = () => {
                var audioBlob = new Blob(audioChunks);
                var audioUrl = URL.createObjectURL(audioBlob);
                var audio = new Audio(audioUrl);
                audio.controls = true;
                document.body.appendChild(audio);

                // Save the audio file in a downloadable format
                var link = document.createElement("a");
                link.href = audioUrl;
                link.download = "recorded_audio.wav";  // Name the file here
                document.body.appendChild(link);
                link.click();  // Automatically click the link to trigger download
            };
            recorder.start();
            setTimeout(() => recorder.stop(), 15000);  // Record for 10 seconds
        }
        record();
    '''))

# 2. Transcribe Recorded Audio (Once File is Downloaded)
def transcribe_audio(audio_path):
    # Load the audio file
    audio, sampling_rate = librosa.load(audio_path, sr=16000)

    # Prepare the audio for the model
    inputs = processor(audio, sampling_rate=sampling_rate, return_tensors="pt")

    # Generate transcription
    with torch.no_grad():
        generated_ids = model.generate(inputs.input_features)

    # Decode the generated ids to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription

In [None]:
record_audio()

<IPython.core.display.Javascript object>

In [None]:
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# Save the tokenizer
processor.save_pretrained("./whisper-small-finetuned")

[]

"I'll take some spicy coleslaw, mozzarella sticks with marinara and ranch, onion rings with ranch and BBQ, grilled tofu with peanut sauce, soy ginger, and chili lime, orange fresh juice, a selection of rose, white, and red wine, and extra whipped cream on the side."

In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor, GenerationConfig
import librosa
import torch

new_config = GenerationConfig.from_dict(model.generation_config.to_dict())
new_config.forced_decoder_ids = None
new_config._from_model_config = False

processor = WhisperProcessor.from_pretrained("itzminos/whisper-small-finetuned")

mymodel = WhisperForConditionalGeneration.from_pretrained("itzminos/whisper-small-finetuned")
mymodel.generation_config = new_config

audio_path = "/content/recaudio.wav"
audio_array, sampling_rate = librosa.load(audio_path, sr=16000)

inputs = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features.to("cuda:0")

outputs = model.generate(inputs)
transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]

print("Transcription:", transcription)

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

  audio_array, sampling_rate = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcription: I'll take some spicy coleslaw, mozzarella sticks with marinara and ranch, onion rings with ranch and BBQ, grilled tofu with peanut sauce, soy ginger, and chili lime, orange fresh juice, a selection of rose, white, and red wine, and extra whipped cream on the side. ________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________

In [None]:
text = "I'll have a Garden Salad with vinaigrette and balsamic, mozzarella sticks with ranch and marinara, crispy fish and chips, a side of strawberries, extra dipping sauces with blue cheese and ranch, extra sauces with gravy, BBQ sauce, and pesto, and some extra toppings with nuts and berries."

In [None]:
text

"I'll have a Garden Salad with vinaigrette and balsamic, mozzarella sticks with ranch and marinara, crispy fish and chips, a side of strawberries, extra dipping sauces with blue cheese and ranch, extra sauces with gravy, BBQ sauce, and pesto, and some extra toppings with nuts and berries."

In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1


In [None]:
import Levenshtein

str1 = "kitten"
str2 = "sitting"

# Calculate Levenshtein distance
distance = Levenshtein.distance(text, transcribe)
print(f"Levenshtein distance: {distance}")  # Output: 3

# Calculate similarity ratio (0.0 to 1.0)
similarity_ratio = Levenshtein.ratio(text, transcribe)
print(f"Similarity ratio: {similarity_ratio:.2f}")  # Output: 0.62

Levenshtein distance: 6
Similarity ratio: 0.98


In [None]:
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
import torch

processor = WhisperProcessor.from_pretrained("itzminos/whisper-tiny-finetuned-basic")

new_config = GenerationConfig.from_dict(model.generation_config.to_dict())
new_config.forced_decoder_ids = None
new_config._from_model_config = False

processor = WhisperProcessor.from_pretrained("itzminos/whisper-tiny-finetuned-basic")

model = WhisperForConditionalGeneration.from_pretrained("itzminos/whisper-tiny-finetuned-basic")
model.generation_config = new_config

transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)

audio_path = "response_3001.wav"
result = transcriber(audio_path)
print("Transcription:", result["text"])

Device set to use cuda:0


Transcription:  Can I get an extra scoop of ice cream, a vegan burger with no cheese, vegan mayo, and mustard, an apple colt, a Greek salad with falafel, oil turkey with cheddar cheese, a fruit salad, a garden salad with vinaigrette ranch and balsamic, spring rolls with vegetarian and chicken, extra BBQ sauce and pasta, and a cheesecake?


In [None]:
text = "I'll take some spicy coleslaw, mozzarella sticks with marinara and ranch, onion rings with ranch and BBQ, grilled tofu with peanut sauce, soy ginger, and chili lime, orange fresh juice, a selection of rose, white, and red wine, and extra whipped cream on the side."
trans = "I'll take some spicy coleslaw, mozzarella sticks with marinara and ranch, onion rings with ranch and BBQ, grilled tofu with peanut sauce, soy ginger, and chili lime, orange fresh juice, a selection of rose, white, and red wine, and extra whipped cream on the side."
from jiwer import wer

error_rate = wer(text.lower(), trans.lower())
accuracy = (1 - error_rate) * 100
print(f"WER: {error_rate:.2f}, Accuracy: {accuracy:.2f}%")

WER: 0.00, Accuracy: 100.00%
