In [1]:
import os
import sys
from tqdm import tqdm
import torchaudio
import pandas as pd
from transformers import WhisperProcessor, WhisperForConditionalGeneration
sys.path.append(os.path.join('..', 'utils'))
from audio_preprocess import folder_walker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration: which Whisper checkpoint to run and where its outputs go.
# model_name is reused below as the output folder name and as the CSV column name.
model_name = 'whisper-small'

load_model = f'openai/{model_name}'                  # id passed to from_pretrained
save_path_model = f'models/{model_name}'             # relative piece swapped into audio paths
full_save_path_model = f'../data/{save_path_model}'  # folder that receives the transcripts

# Create the output folder; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs(full_save_path_model, exist_ok=True)

# Audio files to transcribe (folder_walker comes from ../utils/audio_preprocess;
# presumably it returns a list of file paths -- confirm against that module).
paths = folder_walker('../data/audio/resampled')

In [3]:
# Pull the processor and the generation model from the same pretrained checkpoint.
# The processor is used further down to build input features and to decode token ids.
processor, model = (
    loader.from_pretrained(load_model)
    for loader in (WhisperProcessor, WhisperForConditionalGeneration)
)

# Drop the forced decoder ids from the config -- presumably so generation is not
# pinned to a preset language/task prompt; confirm against the transformers docs.
model.config.forced_decoder_ids = None

preprocessor_config.json: 100%|██████████| 185k/185k [00:00<00:00, 92.2MB/s]
tokenizer_config.json: 100%|██████████| 805/805 [00:00<00:00, 5.48MB/s]
vocab.json: 100%|██████████| 836k/836k [00:00<00:00, 2.25MB/s]
tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 12.2MB/s]
merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 1.81MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 73.4MB/s]
added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 393kB/s]
special_tokens_map.json: 100%|██████████| 2.08k/2.08k [00:00<00:00, 9.19MB/s]
config.json: 100%|██████████| 1.97k/1.97k [00:00<00:00, 4.15MB/s]
model.safetensors: 100%|██████████| 967M/967M [00:30<00:00, 32.1MB/s] 
generation_config.json: 100%|██████████| 3.87k/3.87k [00:00<00:00, 7.10MB/s]


In [4]:
# Transcribe every audio file in `paths` and write one .txt transcript per file.
# Files that already have a transcript are skipped, so the cell can be resumed.
# NOTE(review): the Whisper feature extractor pads/truncates to a fixed window
# (30 s by default, as far as I know), so longer clips are presumably cut off --
# confirm the clip lengths or switch to chunked transcription.
for path in tqdm(paths):
    # mirror the audio path under the model folder and swap the extension for
    # .txt; splitext only touches the real extension, whatever it is
    mirrored_path = path.replace('audio/resampled', save_path_model)
    save_path = os.path.splitext(mirrored_path)[0] + '.txt'

    # if the transcript already exists, skip
    if os.path.exists(save_path):
        continue

    # the target folder may be missing (nested inputs, or it was cleaned up
    # by a previous run of the collection cell)
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    # load in audio; torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(path)

    # average the channels: identical to the old squeeze() for mono files, but
    # keeps stereo files from being treated as a batch of two separate clips
    waveform_np = waveform.mean(dim=0).numpy()

    # convert the raw samples to model input features
    input_features = processor(
        waveform_np, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features

    # generate token ids
    predicted_ids = model.generate(input_features)

    # decode token ids to text
    result = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # save the string to a file; explicit UTF-8 matches how the transcripts
    # are read back later and does not depend on the platform default encoding
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(result[0])

100%|██████████| 95/95 [07:58<00:00,  5.03s/it]


In [5]:
# Gather every transcript written for this model into a single CSV, then remove
# the per-file transcripts and their folder.
# list() lets the paths be iterated twice below even if folder_walker returns a
# one-shot iterable (its exact return type is not visible here).
paths = list(folder_walker(full_save_path_model))

# Initialize a list to store the data
data = []

# Loop through each file path
for path in paths:
    # Read the content of the file
    with open(path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Path relative to the model folder with the extension removed; relpath and
    # splitext avoid hard-coding the '/' separator and a 4-character extension
    variable_path = os.path.splitext(os.path.relpath(path, full_save_path_model))[0]

    # Append both the content and the file path to the data list
    data.append([content, variable_path])

# Create a DataFrame with the data
df = pd.DataFrame(data, columns=[model_name, 'file'])

if df.empty:
    # Nothing was collected (e.g. this cell already ran and cleaned up after
    # itself): keep any existing CSV instead of overwriting it with an empty table.
    print('No transcripts found in ' + full_save_path_model + '; CSV left untouched.')
else:
    # Save the DataFrame as a CSV file
    df.to_csv('../data/models/' + model_name + '.csv', index=False)

    # Only after the CSV is written, delete the individual transcript files
    for path in paths:
        os.remove(path)

    # Remove the now-empty folder tree bottom-up (sub-folders before the root).
    # os.rmdir still raises if anything other than the transcripts is left behind.
    for folder, _, _ in os.walk(full_save_path_model, topdown=False):
        os.rmdir(folder)