In [49]:
## RUN THIS BEFORE ANYTHING
# Create a virtual environment named 'synth_env'
!apt install python3.10-venv
!python -m venv synth_env

# Activate the virtual environment and install necessary libraries
!source synth_env/bin/activate && pip install transformers datasets audioread accelerate peft torchaudio scikit-learn evaluate jiwer

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3.10-venv is already the newest version (3.10.12-1~22.04.7).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Collecting transformers
  Using cached transformers-4.46.3-py3-none-any.whl (10.0 MB)
Collecting tokenizers<0.21,>=0.20
  Using cached tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
Successfully installed tokenizers-0.20.3 transformers-4.46.3


In [38]:
## Check necessary libraries
!source synth_env/bin/activate && pip list

Package                  Version
------------------------ -----------
accelerate               1.1.1
aiohappyeyeballs         2.4.4
aiohttp                  3.11.9
aiosignal                1.3.1
async-timeout            5.0.1
attrs                    24.2.0
audioread                3.0.1
certifi                  2024.8.30
charset-normalizer       3.4.0
click                    8.1.7
datasets                 3.1.0
dill                     0.3.8
evaluate                 0.4.3
filelock                 3.16.1
frozenlist               1.5.0
fsspec                   2024.9.0
huggingface-hub          0.26.3
idna                     3.10
Jinja2                   3.1.4
jiwer                    3.0.5
joblib                   1.4.2
MarkupSafe               3.0.2
mpmath                   1.3.0
multidict                6.1.0
multiprocess             0.70.16
networkx                 3.4.2
numpy                    2.1.3
nvidia-cublas-cu12       12.4.5.8
nvidia-cuda-cupti-cu12   12.4.127
nvidia-cuda-n

In [3]:
# Mount Google Drive (DO THIS BEFORE NEXT STEP)
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
## Create CSV file for FINE TUNING

import os
import csv

# Define the directory containing the dataset in Google Drive
dataset_dir = "/content/drive/MyDrive/whisper_synth_files/data/dataset1"

# Create a list to store the data rows
data_rows = []

# Loop through all files in the dataset directory
for filename in os.listdir(dataset_dir):
    if filename.endswith(".wav"):
        # Extract the base name (e.g., 'sound1')
        base_name = os.path.splitext(filename)[0]
        text_filename = base_name + ".txt"
        text_filepath = os.path.join(dataset_dir, text_filename)
        if os.path.exists(text_filepath):
            audio_filepath = os.path.join(dataset_dir, filename)
            with open(text_filepath, "r") as text_file:
                text_content = text_file.read().strip()

            # Add the audio path and text content to the data rows
            data_rows.append([audio_filepath, text_content])

# Define the CSV file path
csv_file_path = os.path.join(dataset_dir, "train.csv")

# Write the data rows to the CSV file
with open(csv_file_path, "w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["audio", "text"])
    csv_writer.writerows(data_rows)

print(f"CSV file 'train.csv' has been created at: {csv_file_path}")

CSV file 'train.csv' has been created at: /content/drive/MyDrive/whisper_synth_files/data/dataset1/train.csv


In [39]:
# Activate the virtual environment and view model architechture
!source synth_env/bin/activate && python -c "from transformers import WhisperForConditionalGeneration; model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small'); print(model)"

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias

In [25]:
## SCRIPT FOR ADDING SPECIAL TOKENS add_tokens.py

with open("add_tokens.py", "w") as f:
    f.write("""
from transformers import WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration

# Load the base tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Define special tokens
special_tokens = [
    # Tokens without leading spaces
    "Waveform:", "Voices:", "Oscillator Detune:", "Filter Type:", "Filter Cutoff:",
    "ADSR Envelope:", "Attack:", "Decay:", "Sustain:", "Release:",
    "LFO Modulation:", "Hz", "ms", "dB", "s", ",", "\\n", "-", ".", "None",
    # Tokens with leading spaces
    " Waveform:", " Voices:", " Oscillator Detune:", " Filter Type:", " Filter Cutoff:",
    " ADSR Envelope:", " Attack:", " Decay:", " Sustain:", " Release:",
    " LFO Modulation:", " Hz", " ms", " dB", " s", " None", " -"
]

# Add special tokens to the tokenizer
num_added_toks = tokenizer.add_tokens(special_tokens)
print(f"Added {num_added_toks} tokens")

# Resize the model's embeddings to accommodate new tokens
model.resize_token_embeddings(len(tokenizer))

# Create a processor with the updated tokenizer
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="en",
    task="transcribe",
    tokenizer=tokenizer
)

# Save the tokenizer and processor
tokenizer.save_pretrained("/content/drive/MyDrive/whisper_synth_files/whisper_tokenizer_with_special_tokens")
processor.save_pretrained("/content/drive/MyDrive/whisper_synth_files/whisper_processor_with_special_tokens")
""")

In [26]:
# Activate the environment and run the script
!source synth_env/bin/activate && python add_tokens.py

Added 31 tokens
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [40]:
# Tokenizer test
with open("tokenizer_test.py", "w") as f:
    f.write("""
from transformers import WhisperTokenizer

# Load the tokenizer with added special tokens
tokenizer = WhisperTokenizer.from_pretrained("/content/drive/MyDrive/whisper_synth_files/whisper_tokenizer_with_special_tokens")

# Test text
test_text = "Sustain: -12.3 dB"

# Tokenize the test text
tokens = tokenizer.tokenize(test_text)
print("Tokens:", tokens)

# Optionally, print token IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)
""")

!source synth_env/bin/activate && python tokenizer_test.py

Tokens: ['Sustain:', '-', '12', '.', '3', ' dB']
Token IDs: [51873, 12, 4762, 13, 18, 51892]


In [68]:
## SCRIPT FOR FINE TUNING (USING LORA) finetune_whisper.py

# Create a script to perform the entire fine-tuning
with open("finetune_whisper.py", "w") as f:
    f.write("""
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import get_peft_model, LoraConfig
import torchaudio
from sklearn.model_selection import train_test_split
import pandas as pd
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Define paths
data_path = '/content/drive/MyDrive/whisper_synth_files/data/dataset1/'

# Load dataset
data_files = {'train': data_path + 'train.csv'}
dataset = load_dataset('csv', data_files=data_files)

# Split the dataset into training and evaluation sets
train_data, eval_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)

# Convert train and eval data back to Dataset objects
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data))

# Load the updated tokenizer
tokenizer = WhisperTokenizer.from_pretrained("/content/drive/MyDrive/whisper_synth_files/whisper_tokenizer_with_special_tokens")

# Load the processor with the updated tokenizer
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="en",
    task="transcribe",
    tokenizer=tokenizer
)

# Load the model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# Resize the model's embeddings to accommodate new tokens
model.resize_token_embeddings(len(tokenizer))

def preprocess_function(examples):
    audio_path = examples['audio']
    audio_array, sampling_rate = torchaudio.load(audio_path)

    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        audio_array = resampler(audio_array)

    audio_array = audio_array.squeeze().numpy()

    # Extract input features
    input_features = processor.feature_extractor(
        audio_array, sampling_rate=16000
    ).input_features[0]

    # Tokenize target text
    text = examples['text']
    labels = processor.tokenizer(
        text
    ).input_ids

    # Return a dictionary with the correct keys
    return {
        "input_features": input_features,
        "labels": labels
    }

# Preprocess datasets
train_dataset = train_dataset.map(preprocess_function, remove_columns=['audio', 'text'])
eval_dataset = eval_dataset.map(preprocess_function, remove_columns=['audio', 'text'])

# Apply LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, peft_config)

# Define custom data collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate input_features and labels
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        labels = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input_features using the feature extractor
        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt"
        )

        # Pad labels using the tokenizer
        labels_batch = self.processor.tokenizer.pad(
            labels,
            padding=True,
            return_tensors="pt"
        )

        # Replace padding token id's of the labels by -100
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # Remove the decoder_start_token_id
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Initialize the data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/whisper_synth_files/whisper_finetuned",
    per_device_train_batch_size=4,
    learning_rate=3e-5,
    num_train_epochs=10,
    logging_dir="/content/drive/MyDrive/whisper_synth_files/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
    eval_strategy="epoch",
    eval_steps=500,
    predict_with_generate=True,
    fp16=True,
)

# Define custom metric
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Replace -100 with pad_token_id
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = processor.tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Calculate exact match accuracy
    exact_matches = [int(p.strip() == l.strip()) for p, l in zip(pred_str, labels_str)]
    accuracy = sum(exact_matches) / len(exact_matches)

    return {"accuracy": accuracy}

# Define Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=processor,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Start training
trainer.train()
""")

In [69]:
# Activate the environment and run the script
!source synth_env/bin/activate && python finetune_whisper.py

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Map: 100% 160/160 [00:04<00:00, 37.49 examples/s]
Map: 100% 40/40 [00:01<00:00, 37.50 examples/s]
  0% 0/400 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
{'loss': 10.2973, 'grad_norm': 11.144461631774902, 'learning_rate': 2.9325e-05, 'epoch': 0.25}
{'loss': 9.275, 'grad_norm': 7.985929012298584, 'learning_rate': 2.8575e-05, 'epoch': 0.5}
{'loss': 8.6888, 'grad_norm': 9.840339660644531, 'learning_rate': 2.805e-05, 'epoch': 0.75}
{'loss': 8.2177, 'grad_norm': 4.99782133102417, 'learning_rate': 2.7300000000000003e-05, 'epoch': 1.0}
 10

In [75]:
 # Create a script to combine with base model
with open("combine_models.py", "w") as f:
    f.write("""
import os
import torch
from transformers import WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
from peft import PeftModel

# Load the tokenizer used during fine-tuning
tokenizer = WhisperTokenizer.from_pretrained("/content/drive/MyDrive/whisper_synth_files/whisper_tokenizer_with_special_tokens")

# Load the base model and resize embeddings
base_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
base_model.resize_token_embeddings(len(tokenizer))

# Load the PeftModel with the adapters
adapter_checkpoint = "/content/drive/MyDrive/whisper_synth_files/whisper_finetuned/checkpoint-400"

model = PeftModel.from_pretrained(
    base_model,
    adapter_checkpoint
)

# Merge adapter weights with the base model
model = model.merge_and_unload()

# Save the complete model for future use
save_path = "/content/drive/MyDrive/whisper_synth_files/whisper_synth"
model.save_pretrained(save_path)

# Also save the processor with the correct tokenizer
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    tokenizer=tokenizer
)
processor.save_pretrained(save_path)
""")

In [76]:
!source synth_env/bin/activate && python combine_models.py

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [79]:
## SCRIPT FOR EVALUATION (eval.py)

# Create a script to perform evaluation
with open("eval.py", "w") as f:
    f.write("""
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
import evaluate
import torchaudio
import pandas as pd
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Define paths
data_path = '/content/drive/MyDrive/whisper_synth_files/data/dataset1/'

# Load the dataset
data_files = {'train': data_path + 'train.csv'}
dataset = load_dataset('csv', data_files=data_files)

# Split the dataset into training and evaluation sets
_, eval_data = train_test_split(dataset['train'].to_pandas(), test_size=0.2)

# Convert eval data back to Dataset object
eval_dataset = Dataset.from_pandas(pd.DataFrame(eval_data))

# Load the tokenizer and processor from the combined model directory
save_path = "/content/drive/MyDrive/whisper_synth_files/whisper_synth"

# Load the tokenizer
tokenizer = WhisperTokenizer.from_pretrained(save_path)

# Load processor and model for evaluation
processor = WhisperProcessor.from_pretrained(
    save_path,
    language="en",
    task="transcribe",
    tokenizer=tokenizer
)

# Load the combined model
model = WhisperForConditionalGeneration.from_pretrained(save_path)

# Force the model to generate in English
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="en", task="transcribe"
)

def preprocess_function(examples):
    audio_path = examples['audio']
    audio_array, sampling_rate = torchaudio.load(audio_path)

    # Resample if needed
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
        audio_array = resampler(audio_array)

    audio_array = audio_array.squeeze().numpy()

    # Extract input features (log-Mel spectrogram)
    input_features = processor.feature_extractor(
        audio_array, sampling_rate=16000
    ).input_features[0]

    # Tokenize target text to create decoder input IDs
    text = examples['text']
    labels = processor.tokenizer(
        text
    ).input_ids

    # Return a dictionary with the correct keys
    return {
        "input_features": input_features,
        "labels": labels
    }

# Preprocess eval dataset
eval_dataset = eval_dataset.map(preprocess_function, remove_columns=['audio', 'text'])

# Helper function to parse parameters from structured text
def parse_parameters(text):
    params = {}
    lines = text.strip().split('\\n')
    for line in lines:
        if ':' in line:
            key, value = line.split(':', 1)
            params[key.strip()] = value.strip()
    return params

# Define custom compute_metrics function
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Replace -100 with pad_token_id
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and references without special tokens
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    for p, l in zip(pred_str, label_str):
        print("Prediction:", p)
        print("Reference:", l)
        print("---")

    total = len(pred_str)
    exact_matches = sum([1 for p, l in zip(pred_str, label_str) if p.strip() == l.strip()])
    exact_match_accuracy = exact_matches / total

    # Parameter-level accuracy
    parameter_accuracy = {}
    for param in ["Waveform", "Voices", "Oscillator Detune", "Filter Type", "Filter Cutoff", "ADSR Envelope", "LFO Modulation"]:
        correct = 0
        for p, l in zip(pred_str, label_str):
            pred_params = parse_parameters(p)
            label_params = parse_parameters(l)
            if pred_params.get(param) == label_params.get(param):
                correct += 1
        parameter_accuracy[param] = correct / total

    # Combine metrics
    metrics = {"exact_match_accuracy": exact_match_accuracy}
    metrics.update(parameter_accuracy)
    return metrics

# Define the data collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate input_features and labels
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        labels = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input_features using the feature extractor
        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt"
        )

        # Pad labels using the tokenizer
        labels_batch = self.processor.tokenizer.pad(
            labels,
            padding=True,
            return_tensors="pt"
        )

        # Replace padding token id's of the labels by -100
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # Remove the decoder_start_token_id
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Initialize the data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

# Define Trainer for evaluation
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/whisper_synth_files/eval_logs",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    eval_strategy="no",  # Set to "no" as we're only evaluating here
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

# Perform evaluation
eval_results = trainer.evaluate()

# Print evaluation results
print(eval_results)
""")

In [80]:
# Activate the environment and run the script
!source synth_env/bin/activate && python eval.py

Map: 100% 40/40 [00:01<00:00, 37.21 examples/s]
  trainer = Seq2SeqTrainer(
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.to

In [81]:
## SCRIPT FOR INFERENCE (inference.py)

# Create a script to perform evaluation
with open("inference.py", "w") as f:
    f.write("""
import torch
import torchaudio
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer
)

# Path to your combined model directory
model_dir = "/content/drive/MyDrive/whisper_synth_files/whisper_synth"

# Load the tokenizer
tokenizer = WhisperTokenizer.from_pretrained(model_dir)

# Load the processor with the tokenizer
processor = WhisperProcessor.from_pretrained(
    model_dir,
    language="en",
    task="transcribe",
    tokenizer=tokenizer
)

# Load the combined model
model = WhisperForConditionalGeneration.from_pretrained(model_dir)

# Force the model to generate in English
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="en", task="transcribe"
)

# Set model to evaluation mode and move to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load and process the audio file
audio_path = "/content/drive/MyDrive/whisper_synth_files/data/inference/inference1.wav"  # Replace with the path to your .wav file
audio_array, sampling_rate = torchaudio.load(audio_path)

# Resample to 16 kHz if necessary
if sampling_rate != 16000:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sampling_rate, new_freq=16000
    )
    audio_array = resampler(audio_array)

# If the audio has multiple channels, convert it to mono
if audio_array.shape[0] > 1:
    audio_array = torch.mean(audio_array, dim=0, keepdim=True)

audio_array = audio_array.squeeze().numpy()

# Process audio with the feature extractor to get input features
input_features = processor.feature_extractor(
    audio_array, sampling_rate=16000, return_tensors="pt"
).input_features.to(device)

# Generate synth_patch
with torch.no_grad():
    generated_ids = model.generate(input_features)

# Decode the generated IDs to get the synth_patch
synth_patch = processor.tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True
)[0]
print("Synth Patch:", synth_patch)
""")

In [82]:
# Activate the environment and run the script
!source synth_env/bin/activate && python inference.py

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Synth Patch:  Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndhs Ndh