In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# os.environ['HF_HOME'] = ''
import librosa
from tqdm import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict
import torchaudio
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration,WhisperTokenizer
from safetensors.torch import load_file
import torch

## Datasets

In [None]:
# Datasets
def load_audio_data(file_path, sr=16000):
    """Load an audio file with librosa at the requested sampling rate.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Sampling rate forwarded to ``librosa.load``. Defaults to 16000.

    Returns:
        The ``(audio_array, sampling_rate)`` pair returned by ``librosa.load``.
    """
    audio_array, sampling_rate = librosa.load(file_path, sr=sr)
    return audio_array, sampling_rate


def create_dataset(csv_file, max_samples=None):
    """Build a ``Dataset`` of audio/transcript records from a CSV manifest.

    The CSV is read with pandas; each row must expose an ``audio_path`` and a
    ``label`` field. The audio behind every row is decoded with
    ``load_audio_data``.

    Args:
        csv_file: Path of the CSV manifest.
        max_samples: When not ``None``, only the first ``max_samples`` rows of
            the CSV are processed.

    Returns:
        The result of ``Dataset.from_pandas`` over one record per row, each
        shaped ``{'audio': {'path', 'array', 'sampling_rate'}, 'sentence'}``.
    """
    manifest = pd.read_csv(csv_file)
    if max_samples is not None:
        manifest = manifest.head(max_samples)

    def _build_record(entry):
        # Read both fields before decoding so a missing column fails
        # before any audio is loaded for that row.
        path, transcript = entry['audio_path'], entry['label']
        waveform, rate = load_audio_data(path)
        return {
            'audio': {'path': path, 'array': waveform, 'sampling_rate': rate},
            'sentence': transcript,
        }

    rows = tqdm(manifest.iterrows(), total=manifest.shape[0], desc="prcessing data")
    records = [_build_record(entry) for _, entry in rows]

    return Dataset.from_pandas(pd.DataFrame(records))



# Path of the CSV manifest read by create_dataset (it needs `audio_path` and
# `label` columns). Replace this placeholder before running the cell.
csv_file = '/csv file for dataset/'

# Announce the load before it starts: every audio file in the manifest is
# decoded here, so this step can take a while.
print('loading dataset....')
dataset = create_dataset(csv_file, max_samples=1000)

print(dataset)

#### Preview

In [None]:
import numpy as np
import IPython.display as ipd

# Pick one example to inspect: print its transcript and play its audio.
index = 3
sample = dataset[index]  # fetch the row once instead of once per field
audio_sample = sample['audio']
text = sample['sentence']

print(text)
audio = np.array(audio_sample['array'])
# Play back at the rate stored with the sample rather than a hard-coded value,
# so the player stays correct if the loading rate is ever changed.
ipd.display(ipd.Audio(audio, rate=audio_sample['sampling_rate'], normalize=False))


In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3-turbo", language="en", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3-turbo")

level = "word"  # "word" or "phn"

# Extra tokens added to the vocabulary for the chosen annotation level.
if level == "word":
    new_tokens = ["[REP]", "[PAU]", "[INS]"]
elif level == "phn":
    new_tokens = ["[REP]",  "[PRO]", "[PAU]"]
else:
    # Reject unknown levels instead of silently picking one of the token sets.
    raise ValueError(f'level must be "word" or "phn", got {level!r}')

# Extend the tokenizer first, then resize the model's embeddings to the new
# vocabulary size so the checkpoint below can be loaded into it.
tokenizer.add_tokens(list(new_tokens))
model.resize_token_embeddings(len(tokenizer))

print("loading model weights....")
state_dict = load_file('...../model.safetensors')  # set path to your safetensors file
# strict=False tolerates key mismatches; report them so a wrong or partial
# checkpoint does not go unnoticed.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(f"missing keys: {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"unexpected keys: {load_result.unexpected_keys}")
# Module.to returns the module itself; assigning the result keeps the notebook
# from echoing the full model repr as the cell output.
model = model.to(device)

In [None]:
# Convert the previewed sample into model inputs and move them to the device
# the model lives on.
processor_output = processor(
    audio_sample['array'],
    sampling_rate=audio_sample["sampling_rate"],
    return_tensors="pt",
)
input_features = processor_output.input_features.to(device)

# Generate token ids for the sample, requesting English output.
predicted_ids = model.generate(input_features, language='en')
print(predicted_ids)

# Decode the first (only) generated sequence with the extended tokenizer,
# dropping special tokens.
transcription = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
print(transcription)