In [1]:
import torch

# 1. Get the name of the GPU
print(f"Device: {torch.cuda.get_device_name(0)}")

# 2. Memory currently occupied by Tensors
print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

# 3. Memory reserved by PyTorch caching allocator (Total memory taken from OS)
print(f"Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

# 4. To see a full summary report
print(torch.cuda.memory_summary(device=None, abbreviated=False))

Device: NVIDIA RTX A1000 Laptop GPU
Allocated: 0.00 MB
Reserved: 0.00 MB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |     

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float16

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=1,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "korean", "task": "transcribe"},
    return_timestamps=True,
)

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 587/587 [00:01<00:00, 472.65it/s, Materializing param=model.encoder.layers.31.self_attn_layer_norm.weight] 
`torch_dtype` is deprecated! Use `dtype` instead!
Passing `generation_config` together with generation-related arguments=({'return_timestamps'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> will take precedence. Please check the docstring of <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> to see related `.g

 mr quilter is the apostle of the middle classes and we are glad to welcome his gospel nor is mr quilters manner less interesting than his matter he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind He has grave doubts whether Sir Frederick Layton's work is really Greek after all and can discover in it but little of rocky Ithaca Linnell's pictures are a sort of up guards and Adam paintings and Mason's exquisite idylls are as national as a jingo poem mr. Burkett Foster's landscapes smile at one much in the same way that mr. karker 아름다운 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은 젠장은

In [8]:
import librosa
import re
def inference(path: str):
    audio, sr = librosa.load(path, sr=16000)
    result = pipe(audio)
    text_clean = re.sub(r'\.', '', result['text'])
    return text_clean

In [9]:
print(inference("data/scope_phoneme_data/A long/A long 1.wav"))

 에이


In [10]:
import os
import pandas as pd

data_path = "data/scope_phoneme_data"
data_records = []

for folder_name in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder_name)
    
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".wav"):
                file_path = os.path.join(folder_path, file_name)
                phonemes = inference(file_path)
                
                data_records.append({
                    "Folder/Label": folder_name,
                    "File Name": file_name,
                    "Predicted Phonemes": phonemes
                })

df = pd.DataFrame(data_records)
print(df.head())

df.to_csv("phoneme_whisper_results.csv", index=False)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used 

  Folder/Label File Name Predicted Phonemes
0            B   B 1.wav              B B B
1            B   B 4.wav                  P
2            B   B 2.wav                  b
3            B   B 3.wav                  P
4            Q   Q 1.wav                  그
