In [1]:
import torch
print(torch.cuda.is_available())

# The device queries below need a CUDA device (get_device_name raises without
# one), so the report only runs when a GPU is actually present.
if torch.cuda.is_available():
    # 1. Get the name of the GPU
    print(f"Device: {torch.cuda.get_device_name(0)}")

    # 2. Memory currently occupied by Tensors
    print(f"Allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

    # 3. Memory reserved by PyTorch caching allocator (Total memory taken from OS)
    print(f"Reserved: {torch.cuda.memory_reserved(0) / 1024**2:.2f} MB")

    # 4. To see a full summary report
    print(torch.cuda.memory_summary(device=None, abbreviated=False))
else:
    print("No CUDA device detected; skipping the GPU memory report.")

True
Device: NVIDIA RTX A1000 Laptop GPU
Allocated: 0.00 MB
Reserved: 0.00 MB
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |

# OpenAI Whisper-large-v3-turbo

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Run on the first CUDA device when one is present, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is used only on GPU; the CPU fallback loads full float32
# weights (the original selected float16 in both branches).
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Checkpoint selection: the large turbo checkpoint is left commented out and
# the tiny checkpoint is the one that is actually loaded.
# model_id = "openai/whisper-large-v3-turbo"
model_id = "openai/whisper-tiny"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Speech-recognition pipeline built from the model and processor above.
# Decoding is pinned to Korean transcription and timestamps are requested.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=1,
    dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "korean", "task": "transcribe"},
    return_timestamps=True,
)

# Demo run on the first validation clip of the long-form LibriSpeech set.
# NOTE(review): this dataset is presumably English while decoding is forced
# to Korean - confirm that is intended for this demo.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])

In [None]:
import librosa
import re
def inference_turbo(path: str):
    """Transcribe one audio file with the notebook-level `pipe`.

    The file at `path` is loaded with librosa at a 16 kHz sampling rate and
    passed to `pipe`; every '.' character is then removed from the returned
    text.

    Args:
        path: Location of the audio file to transcribe.

    Returns:
        The pipeline's "text" output with all periods stripped.
    """
    waveform, _ = librosa.load(path, sr=16000)
    prediction = pipe(waveform)
    return re.sub(r'\.', '', prediction['text'])

In [None]:
# Quick check: transcribe a single recording with inference_turbo and print the cleaned text.
print(inference_turbo("data/scope_phoneme_data/A long/A long 1.wav"))

In [None]:
import os
import pandas as pd

# Root directory; each sub-folder name is recorded as the label of its files.
data_path = "data/scope_phoneme_data"
data_records = []

for folder_name in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder_name)
    
    # Only descend into directories; loose files at the root are ignored.
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".wav"):
                file_path = os.path.join(folder_path, file_name)
                # Use the helper defined for this section; the original call
                # to `inference` referenced a name that is never defined.
                phonemes = inference_turbo(file_path)
                
                data_records.append({
                    "Folder/Label": folder_name,
                    "File Name": file_name,
                    "Predicted Phonemes": phonemes
                })

# One row per transcribed file; preview it, then persist the full table.
df = pd.DataFrame(data_records)
print(df.head())

df.to_csv("phoneme_whisper_results.csv", index=False)

# OpenAI Whisper-tiny

In [4]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Load the whisper-tiny processor and model. This rebinds the notebook-level
# `processor` and `model` names, which inference_tiny reads when called.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# Clear any forced decoder prompt ids stored in the model config.
model.config.forced_decoder_ids = None

# Fetch the small dummy LibriSpeech split and take its first audio sample.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
sample = ds[0]["audio"]
print(type(sample["array"]), sample["array"].shape, sample["sampling_rate"])

# Convert the raw waveform into the model's input features (PyTorch tensors).
input_features = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
).input_features

# Generate token ids, then decode them to text without special tokens.
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(
    predicted_ids,
    skip_special_tokens=True,
)


Loading weights: 100%|██████████| 167/167 [00:00<00:00, 414.10it/s, Materializing param=model.encoder.layers.3.self_attn_layer_norm.weight]  


<class 'numpy.ndarray'> (93680,) 16000


In [8]:
import librosa
def inference_tiny(path: str):
    """Transcribe one audio file with the notebook-level `processor` and `model`.

    The file at `path` is loaded with librosa at a 16 kHz sampling rate,
    converted to input features by `processor`, run through
    `model.generate`, and decoded with special tokens skipped.

    Args:
        path: Location of the audio file to transcribe.

    Returns:
        The first decoded transcription from the batch.
    """
    waveform, rate = librosa.load(path, sr=16000)
    encoded = processor(waveform, sampling_rate=rate, return_tensors="pt")
    token_ids = model.generate(encoded.input_features)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

In [10]:
import os
import pandas as pd

# Root directory; each sub-folder name is recorded as the label of its files.
data_path = "data/scope_phoneme_data"
data_records = []

for folder_name in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder_name)

    # Loose files at the root are not label folders; skip them.
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        # Only .wav files are transcribed.
        if not file_name.endswith(".wav"):
            continue

        file_path = os.path.join(folder_path, file_name)
        phonemes = inference_tiny(file_path)

        data_records.append(
            {
                "Folder/Label": folder_name,
                "File Name": file_name,
                "Predicted Phonemes": phonemes,
            }
        )

# One row per transcribed file; preview it, then persist the full table.
df = pd.DataFrame(data_records)
print(df.head())

df.to_csv("phoneme_whisper_tiny_results.csv", index=False)

  Folder/Label File Name Predicted Phonemes
0            B   B 1.wav                 끝!
1            B   B 4.wav               Bye!
2            B   B 2.wav                 안녕
3            B   B 3.wav               Bye!
4            Q   Q 1.wav               그...
