In [1]:
# Install the NeMo toolkit with its ASR extras (IPython shell escape that runs pip).
!pip install nemo-toolkit[asr]



In [2]:
# ------------------------------
# 1. Funções auxiliares (ASR)
# ------------------------------
import librosa
import numpy as np
import time
import nemo.collections.asr as nemo_asr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

def carregar_audio(caminho, sr=16000):
    """Load an audio file as a mono float32 waveform.

    Args:
        caminho: Path of the audio file, forwarded to ``librosa.load``.
        sr: Target sampling rate in Hz (default 16000). The value is forwarded
            unchanged to ``librosa.load``; per the librosa documentation,
            ``None`` keeps the file's native rate.

    Returns:
        Tuple ``(audio, taxa)``: the samples as a float32 NumPy array and the
        sampling rate reported by ``librosa.load``.
    """
    # Return the rate librosa reports instead of echoing the argument, so a
    # call with sr=None yields the real sampling rate rather than None.
    audio, taxa = librosa.load(caminho, sr=sr, mono=True)
    return np.array(audio, dtype=np.float32), taxa

# ------------------------------
# Função para modelos NeMo
# ------------------------------
def avaliar_modelo_nemo(modelo_id, caminho_audio):
    """Transcribe one audio file with a pretrained NeMo ASR model.

    Args:
        modelo_id: Model identifier passed to ``ASRModel.from_pretrained``.
        caminho_audio: Path of the audio file to transcribe.

    Returns:
        Tuple ``(latencia_ms, transcricao_texto)``: the wall-clock duration of
        the ``transcribe`` call in milliseconds (model loading is not timed)
        and the transcription text.
    """
    print(f"üîΩ Carregando modelo NeMo {modelo_id} ...")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(modelo_id)

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    inicio = time.perf_counter()
    transcricao_obj = asr_model.transcribe([caminho_audio])[0]
    latencia_ms = (time.perf_counter() - inicio) * 1000

    # NOTE(review): some NeMo releases document transducer/hybrid models as
    # returning a (best_hypotheses, all_hypotheses) tuple, in which case [0]
    # is itself a list. Unwrap nested containers down to the first hypothesis
    # so the text is not the repr of a list.
    while isinstance(transcricao_obj, (list, tuple)) and len(transcricao_obj) > 0:
        transcricao_obj = transcricao_obj[0]

    # Hypothesis-like objects expose ``.text``; plain strings fall back to str().
    transcricao_texto = getattr(transcricao_obj, "text", str(transcricao_obj))

    return latencia_ms, transcricao_texto

# ------------------------------
# Função para modelos Hugging Face
# ------------------------------
def avaliar_modelo_hf(modelo_id, caminho_audio):
    """Transcribe one audio file with a Hugging Face Wav2Vec2 CTC model.

    Only checkpoints loadable by ``Wav2Vec2Processor`` / ``Wav2Vec2ForCTC`` are
    supported; other architectures (e.g. Whisper) need different classes.

    Args:
        modelo_id: Model identifier passed to both ``from_pretrained`` calls.
        caminho_audio: Path of the audio file; loaded at 16 kHz via
            ``carregar_audio``.

    Returns:
        Tuple ``(latencia_ms, transcricao_texto)``: elapsed milliseconds for the
        forward pass plus greedy decoding (model loading and feature extraction
        are not timed) and the decoded text.
    """
    print(f"üîΩ Carregando modelo HF {modelo_id} ...")
    processor = Wav2Vec2Processor.from_pretrained(modelo_id)
    model = Wav2Vec2ForCTC.from_pretrained(modelo_id)
    model.eval()

    audio, sr = carregar_audio(caminho_audio, sr=16000)
    input_values = processor(audio, sampling_rate=sr, return_tensors="pt").input_values

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    inicio = time.perf_counter()
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy decoding: most likely token per frame, then decoded by the processor.
    pred_ids = torch.argmax(logits, dim=-1)
    transcricao_texto = processor.batch_decode(pred_ids)[0]
    latencia_ms = (time.perf_counter() - inicio) * 1000

    return latencia_ms, transcricao_texto

# ------------------------------
# Função para calcular métricas
# ------------------------------
def calcular_metricas(modelo, latencia, saida, referencia=None, rank=None):
    """Build the result record for one evaluated model.

    The "approximate WER" is ``1 - matched / len(reference words)``, where
    ``matched`` counts reference words that also occur in the output, ignoring
    word order. It is a bag-of-words estimate, not an edit-distance WER.

    Args:
        modelo: Model name stored in the record.
        latencia: Latency in milliseconds; rounded to 2 decimals.
        saida: Transcription, either a string or an object with a ``text``
            attribute; anything else is converted with ``str``.
        referencia: Optional reference transcript. When falsy, the WER entry
            stays ``None``.
        rank: Optional rank; ``"-"`` is stored when it is ``None``.

    Returns:
        Dict with rank, model, latency, transcription and approximate WER.
    """
    # Function-scope import keeps this definition self-contained in the notebook.
    from collections import Counter

    # Make sure the transcription is a string.
    if not isinstance(saida, str):
        saida = saida.text if hasattr(saida, "text") else str(saida)

    resultado = {
        "Rank": rank if rank is not None else "-",
        "Modelo": modelo,
        "Lat√™ncia (ms)": round(latencia, 2),
        "Transcri√ß√£o": saida,
        "WER aproximado": None  # always present so consumers never hit a KeyError
    }

    if referencia:
        ref_tokens = referencia.lower().split()
        out_tokens = saida.lower().split()
        # Multiset intersection: each reference occurrence can be matched once.
        # A plain set intersection would undercount repeated words and report a
        # non-zero error for a perfect transcript of e.g. "a a b".
        acertos = sum((Counter(ref_tokens) & Counter(out_tokens)).values())
        wer_aprox = 1 - (acertos / len(ref_tokens)) if ref_tokens else 1.0
        resultado["WER aproximado"] = round(wer_aprox, 3)

    return resultado



      m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
    
      m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
    
      elif re.match('(flt)p?( \(default\))?$', token):
    
      elif re.match('(dbl)p?( \(default\))?$', token):
    


In [None]:

# ------------------------------
# Avaliação de modelos ASR específicos (PT-BR)
# ------------------------------

import os
import time
import pandas as pd
import librosa
import nemo.collections.asr as nemo_asr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import numpy as np

# ------------------------------
# Funções auxiliares
# ------------------------------

def carregar_audio(caminho, sr=16000):
    """Load an audio file as a mono float32 waveform.

    Args:
        caminho: Path of the audio file, forwarded to ``librosa.load``.
        sr: Target sampling rate in Hz (default 16000). The value is forwarded
            unchanged to ``librosa.load``; per the librosa documentation,
            ``None`` keeps the file's native rate.

    Returns:
        Tuple ``(audio, taxa)``: the samples as a float32 NumPy array and the
        sampling rate reported by ``librosa.load``.
    """
    # Return the rate librosa reports instead of echoing the argument, so a
    # call with sr=None yields the real sampling rate rather than None.
    audio, taxa = librosa.load(caminho, sr=sr, mono=True)
    return np.array(audio, dtype=np.float32), taxa

def avaliar_modelo_nemo(modelo_id, caminho_audio):
    """Transcribe one audio file with a pretrained NeMo ASR model.

    Args:
        modelo_id: Model identifier passed to ``ASRModel.from_pretrained``.
        caminho_audio: Path of the audio file to transcribe.

    Returns:
        Tuple ``(latencia_ms, transcricao_texto)``: the wall-clock duration of
        the ``transcribe`` call in milliseconds (model loading is not timed)
        and the transcription text.
    """
    print(f"\nüîπ Avaliando modelo NeMo: {modelo_id}")
    asr_model = nemo_asr.models.ASRModel.from_pretrained(modelo_id)

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    inicio = time.perf_counter()
    transcricao_obj = asr_model.transcribe([caminho_audio])[0]
    latencia_ms = (time.perf_counter() - inicio) * 1000

    # NOTE(review): some NeMo releases document transducer/hybrid models as
    # returning a (best_hypotheses, all_hypotheses) tuple, in which case [0]
    # is itself a list. Unwrap nested containers down to the first hypothesis
    # so the text is not the repr of a list.
    while isinstance(transcricao_obj, (list, tuple)) and len(transcricao_obj) > 0:
        transcricao_obj = transcricao_obj[0]

    # Hypothesis-like objects expose ``.text``; plain strings fall back to str().
    transcricao_texto = getattr(transcricao_obj, "text", str(transcricao_obj))
    return latencia_ms, transcricao_texto

def avaliar_modelo_hf(modelo_id, caminho_audio):
    """Transcribe one audio file with a Hugging Face Wav2Vec2 CTC model.

    Args:
        modelo_id: Model identifier passed to ``Wav2Vec2Processor`` and
            ``Wav2Vec2ForCTC`` ``from_pretrained``.
        caminho_audio: Path of the audio file, loaded through ``carregar_audio``.

    Returns:
        Tuple ``(latencia_ms, transcricao_texto)``: elapsed milliseconds for the
        forward pass and argmax (model loading, feature extraction and text
        decoding are not timed) and the decoded text.

    Raises:
        ValueError: If the loaded audio is not sampled at 16 kHz.
    """
    print(f"\nüîπ Avaliando modelo HF: {modelo_id}")
    processor = Wav2Vec2Processor.from_pretrained(modelo_id)
    model = Wav2Vec2ForCTC.from_pretrained(modelo_id)
    # Explicitly select evaluation mode for inference.
    model.eval()

    audio, sr = carregar_audio(caminho_audio)
    if sr != 16000:
        raise ValueError("O modelo HF requer √°udio em 16 kHz.")

    inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted, which would corrupt the measured interval.
    inicio = time.perf_counter()
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    # Greedy decoding: most likely token per frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    latencia_ms = (time.perf_counter() - inicio) * 1000

    transcricao_texto = processor.batch_decode(predicted_ids)[0]
    return latencia_ms, transcricao_texto

def calcular_metricas(modelo, latencia, saida, referencia=None):
    """Build the basic comparison record for one evaluated model.

    The "approximate WER" is ``1 - matched / len(reference words)``, where
    ``matched`` counts reference words that also occur in the output, ignoring
    word order. It is a bag-of-words estimate, not an edit-distance WER.

    Args:
        modelo: Model name stored in the record.
        latencia: Latency in milliseconds; rounded to 2 decimals.
        saida: Transcription; stored in the record exactly as received.
        referencia: Optional reference transcript. When falsy, no WER entry is
            added to the record.

    Returns:
        Dict with model, latency and transcription, plus the approximate WER
        when a reference is given.
    """
    # Function-scope import keeps this definition self-contained in the notebook.
    from collections import Counter

    resultado = {
        "Modelo": modelo,
        "Lat√™ncia (ms)": round(latencia, 2),
        "Transcri√ß√£o": saida
    }

    if referencia:
        # Score on the transcription text: prefer a string ``text`` attribute
        # over str(obj), which for a result object would be its repr.
        texto = saida
        if not isinstance(texto, str):
            texto = getattr(saida, "text", None)
            if not isinstance(texto, str):
                texto = str(saida)

        ref_tokens = referencia.lower().split()
        out_tokens = texto.lower().split()
        # Multiset intersection: each reference occurrence can be matched once.
        # A plain set intersection would undercount repeated words and report a
        # non-zero error for a perfect transcript of e.g. "a a b".
        acertos = sum((Counter(ref_tokens) & Counter(out_tokens)).values())
        wer_aprox = 1 - (acertos / len(ref_tokens)) if ref_tokens else 1.0
        resultado["WER aproximado"] = round(wer_aprox, 3)

    return resultado

def limpar_transcricao(x):
    """Reduce a transcription result to its text.

    For a non-empty list or tuple only the first element is considered. The
    chosen value's ``text`` attribute is returned when it has one; otherwise
    its ``str()`` form is returned. If anything raises along the way, the
    fallback is ``""`` for ``None`` and ``str(x)`` for everything else.
    """
    try:
        eh_sequencia = isinstance(x, (list, tuple)) and len(x) > 0
        alvo = x[0] if eh_sequencia else x
        return getattr(alvo, "text", str(alvo))
    except Exception:
        if x is None:
            return ""
        return str(x)

# ------------------------------
# Configuration
# ------------------------------

audio_teste = "/content/audio3.wav"          # Path to the audio file under test
# Reference transcript compared against each model's output for the approximate WER.
# NOTE(review): the transcriptions shown in this notebook's saved output do not
# contain this sentence — confirm the reference actually matches the audio.
referencia_texto = "este √© um exemplo de fala"

# Fail fast, before any model is downloaded, if the audio file is missing.
if not os.path.isfile(audio_teste):
    raise FileNotFoundError(f"Arquivo de √°udio n√£o encontrado: {audio_teste}")

# ------------------------------
# Models to evaluate
# ------------------------------
# Each entry is (display name, pretrained model id, backend), where the backend
# is "nemo" or "hf"; the entries are expanded into dicts with the keys
# "nome", "id" and "tipo".
modelos = [
    dict(nome=nome, id=modelo_id, tipo=tipo)
    for nome, modelo_id, tipo in (
        ("FastConformer-Hybrid", "nvidia/stt_pt_fastconformer_hybrid_large_pc", "nemo"),
        ("Citrinet-PT-Gamma-0.25", "neongeckocom/stt_pt_citrinet_512_gamma_0_25", "nemo"),
        ("QuartzNet-PT-Ottema", "ottema/stt_pt_quartznet15x5_ctc_small", "nemo"),
        ("QuartzNet-PT", "dominguesm/stt_pt_quartznet15x5_ctc_small", "nemo"),
        ("Parakeet-TDT-0.6b-v3 (multilingual)", "nvidia/parakeet-tdt-0.6b-v3", "nemo"),
        ("wav2vec2-PT-BR-Light", "danielpedrozo/wav2vec2-portuguese-wpp-checkpoint-480", "hf"),
    )
]

# ------------------------------
# Run every model and collect the results
# ------------------------------
resultados = []

for modelo in modelos:
    # A failure in one model is reported and must not stop the remaining ones.
    try:
        tipo = modelo["tipo"]
        if tipo == "nemo":
            avaliar = avaliar_modelo_nemo
        elif tipo == "hf":
            avaliar = avaliar_modelo_hf
        else:
            print(f"‚ö†Ô∏è Tipo de modelo desconhecido: {modelo['nome']}")
            continue

        latencia, transcricao = avaliar(modelo["id"], audio_teste)
        metricas = calcular_metricas(modelo["nome"], latencia, transcricao, referencia_texto)
        resultados.append(metricas)
    except Exception as e:
        print(f"‚ùå Erro ao avaliar modelo {modelo['nome']}: {e}")

# ------------------------------
# Build the results DataFrame
# ------------------------------
df_resultados = pd.DataFrame(resultados)

# Reduce every transcription to plain text (the column is missing when no
# model produced a result).
if "Transcri√ß√£o" in df_resultados.columns:
    df_resultados["Transcri√ß√£o"] = df_resultados["Transcri√ß√£o"].apply(limpar_transcricao)

# -------- SORTING --------
# Sort keys in priority order, limited to the columns that are present.
colunas_ordem = [
    coluna
    for coluna in ("WER aproximado", "Lat√™ncia (ms)")
    if coluna in df_resultados.columns
]
if len(colunas_ordem) > 0:
    df_resultados = df_resultados.sort_values(
        by=colunas_ordem,
        ascending=True,
        na_position="last",
        ignore_index=True,
    )

# Ranking column: 1-based position after sorting.
df_resultados.insert(loc=0, column="Rank", value=range(1, len(df_resultados) + 1))

# -------- CONSOLE OUTPUT --------
# Lift pandas' truncation limits so full transcriptions and all rows are printed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

print("\nüîπ Resultados ordenados (melhor WER e menor lat√™ncia):")

# Show only the columns that actually exist, in this display order.
colunas_display = [
    coluna
    for coluna in ("Rank", "Modelo", "WER aproximado", "Lat√™ncia (ms)", "Transcri√ß√£o")
    if coluna in df_resultados.columns
]
print(df_resultados.loc[:, colunas_display])

# -------- EXPORT --------
saida_csv = "resultados_asr.csv"
saida_txt = "resultados_asr.txt"

# Full table as CSV, without the index column.
df_resultados.to_csv(saida_csv, index=False, encoding="utf-8")

# Plain-text report: one block of fields per model, closed by a dashed rule.
with open(saida_txt, "w", encoding="utf-8") as f:
    for _, row in df_resultados.iterrows():
        f.writelines([
            f"Modelo: {row.get('Modelo', 'N/A')}\n",
            f"WER aproximado: {row.get('WER aproximado', 'N/A')}\n",
            f"Lat√™ncia (ms): {row.get('Lat√™ncia (ms)', 'N/A')}\n",
            f"Transcri√ß√£o: {row.get('Transcri√ß√£o', '')}\n",
            "-" * 60 + "\n",
        ])

print(f"\n‚úÖ Resultados salvos em: {saida_csv} e {saida_txt}")


üîπ Avaliando modelo NeMo: nvidia/stt_pt_fastconformer_hybrid_large_pc


stt_pt_fastconformer_hybrid_large_pc.nem(‚Ä¶):   0%|          | 0.00/453M [00:00<?, ?B/s]

[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for config/subconfig:
    {'manifest_filepath': '???', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': True, 'num_workers': 8, 'pin_memory': True, 'max_duration': 20, 'min_duration': 0.1, 'is_tarred': False, 'tarred_audio_filepaths': None, 'shuffle_n': 2048, 'bucketing_strategy': 'synced_randomized', 'bucketing_batch_size': None}
     Reason: Missing mandatory value: train_ds.manifest_filepath
        full_key: train_ds.manifest_filepath
        object_type=dict.
[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for config/subconfig:
    {'manifest_filepath': '???', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': False, 'use_start_end_token': False, 'num_workers': 8, 'pin_memory': True}
     Reason: Missing mandatory value: validation_ds.manifest_filepath
        full_key: validation_ds.manifest_filepath
        object_type=dict.
[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for con

[NeMo I 2025-10-09 10:14:33 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 128 tokens


[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for config/subconfig:
    {'manifest_filepath': '???', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': True, 'num_workers': 8, 'pin_memory': True, 'max_duration': 20, 'min_duration': 0.1, 'is_tarred': False, 'tarred_audio_filepaths': None, 'shuffle_n': 2048, 'bucketing_strategy': 'synced_randomized', 'bucketing_batch_size': None}
     Reason: Missing mandatory value: train_ds.manifest_filepath
        full_key: train_ds.manifest_filepath
        object_type=dict.
[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for config/subconfig:
    {'manifest_filepath': '???', 'sample_rate': 16000, 'batch_size': 32, 'shuffle': False, 'use_start_end_token': False, 'num_workers': 8, 'pin_memory': True}
     Reason: Missing mandatory value: validation_ds.manifest_filepath
        full_key: validation_ds.manifest_filepath
        object_type=dict.
[NeMo W 2025-10-09 10:14:33 nemo_logging:405] Skipped conversion for con

[NeMo I 2025-10-09 10:14:33 nemo_logging:393] PADDING: 0
[NeMo I 2025-10-09 10:14:36 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-10-09 10:14:36 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-10-09 10:14:36 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-10-09 10:14:36 nemo_logging:393] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-10-09 10:14:36 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-10-09 10:14:36 nemo_logging:393] Model EncDecHybridRNNTCTCBPEModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--stt_pt_fastconformer_hybrid_large_pc/snapshots/036da25fa32e3f11c11105448482845950840685/stt_pt_fastconformer_hybrid_large_pc.nemo.


Transcribing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:16<00:00, 16.87s/it]



üîπ Avaliando modelo NeMo: neongeckocom/stt_pt_citrinet_512_gamma_0_25


stt_pt_citrinet_512_gamma_0_25.nemo:   0%|          | 0.00/143M [00:00<?, ?B/s]

[NeMo I 2025-10-09 10:14:57 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 256 tokens


[NeMo W 2025-10-09 10:14:57 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: datasets/cv_neon/manifests/commonvoice_train_manifest_processed.json
    sample_rate: 16000
    batch_size: 32
    trim_silence: false
    max_duration: 9.0
    min_duration: 1.0
    shuffle: true
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-10-09 10:14:57 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: datasets/cv_neon/

[NeMo I 2025-10-09 10:14:57 nemo_logging:393] PADDING: 16
[NeMo I 2025-10-09 10:14:59 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /root/.cache/huggingface/hub/models--neongeckocom--stt_pt_citrinet_512_gamma_0_25/snapshots/ea95a18b0eaa1ccaf86faa209dc5c72a4325df51/stt_pt_citrinet_512_gamma_0_25.nemo.


Transcribing:   0%|          | 0/1 [00:00<?, ?it/s][NeMo W 2025-10-09 10:14:59 nemo_logging:405] CTC decoding strategy 'greedy' is slower than 'greedy_batch', which implements the same exact interface. Consider changing your strategy to 'greedy_batch' for a free performance improvement.
Transcribing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.79it/s]


üîπ Avaliando modelo NeMo: ottema/stt_pt_quartznet15x5_ctc_small





stt_pt_quartznet15x5_ctc_small.nemo:   0%|          | 0.00/76.4M [00:00<?, ?B/s]

[NeMo W 2025-10-09 10:15:03 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: manifests/pt/commonvoice_train_manifest_processed.json,manifests/pt/commonvoice_dev_manifest_processed.json
    sample_rate: 16000
    labels:
    - m
    - j
    - w
    - b
    - k
    - v
    - d
    - √≠
    - o
    - √°
    - r
    - f
    - c
    - z
    - √™
    - p
    - √©
    - t
    - u
    - e
    - x
    - 'n'
    - √ß
    - l
    - a
    - s
    - g
    - 'y'
    - i
    - q
    - ' '
    - √£
    - h
    - √≥
    - √µ
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    num_workers: 8
    pin_memory: true
    normalize_transcripts: false
    
[NeMo W 2025-10-09 10:15:03 nemo_logging:405] If you intend to do validation, please call 

[NeMo I 2025-10-09 10:15:03 nemo_logging:393] PADDING: 16
[NeMo I 2025-10-09 10:15:03 nemo_logging:393] Model EncDecCTCModel was successfully restored from /root/.cache/huggingface/hub/models--ottema--stt_pt_quartznet15x5_ctc_small/snapshots/33140911a1f624f7c31526efb909a600d759d83e/stt_pt_quartznet15x5_ctc_small.nemo.


Transcribing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.22it/s]


üîπ Avaliando modelo NeMo: dominguesm/stt_pt_quartznet15x5_ctc_small





stt_pt_quartznet15x5_ctc_small.nemo:   0%|          | 0.00/76.3M [00:00<?, ?B/s]

[NeMo W 2025-10-09 10:15:10 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: manifests/pt/commonvoice_train_manifest_processed.json,manifests/pt/commonvoice_dev_manifest_processed.json
    sample_rate: 16000
    labels:
    - d
    - g
    - p
    - c
    - u
    - e
    - x
    - j
    - k
    - f
    - l
    - o
    - m
    - s
    - r
    - w
    - b
    - a
    - v
    - 'y'
    - i
    - t
    - z
    - h
    - 'n'
    - q
    - ' '
    batch_size: 32
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: /asr_set_1.2/train/train_{0..1023}.tar
    num_workers: 8
    normalize_transcripts: false
    pin_memory: true
    
[NeMo W 2025-10-09 10:15:10 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or

[NeMo I 2025-10-09 10:15:10 nemo_logging:393] PADDING: 16
[NeMo I 2025-10-09 10:15:12 nemo_logging:393] Model EncDecCTCModel was successfully restored from /root/.cache/huggingface/hub/models--dominguesm--stt_pt_quartznet15x5_ctc_small/snapshots/14fecbfd291ade80a3624d5c2399a30be2d6fe49/stt_pt_quartznet15x5_ctc_small.nemo.


Transcribing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.37s/it]



üîπ Avaliando modelo NeMo: nvidia/parakeet-tdt-0.6b-v3


parakeet-tdt-0.6b-v3.nemo:   0%|          | 0.00/2.51G [00:00<?, ?B/s]

[NeMo I 2025-10-09 10:16:28 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 8192 tokens


[NeMo W 2025-10-09 10:16:35 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    use_lhotse: true
    skip_missing_manifest_entries: true
    input_cfg: null
    tarred_audio_filepaths: null
    manifest_filepath: null
    sample_rate: 16000
    shuffle: true
    num_workers: 2
    pin_memory: true
    max_duration: 10.0
    min_duration: 1.0
    text_field: answer
    batch_duration: null
    max_tps: null
    use_bucketing: true
    bucket_duration_bins: null
    bucket_batch_size: null
    num_buckets: 30
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    
[NeMo W 2025-10-09 10:16:35 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    V

[NeMo I 2025-10-09 10:16:35 nemo_logging:393] PADDING: 0
[NeMo I 2025-10-09 10:16:43 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}
[NeMo I 2025-10-09 10:16:43 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-10-09 10:16:43 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-10-09 10:16:43 nemo_logging:393] Using RNNT Loss : tdt
    Loss tdt_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0, 'durations': [0, 1, 2, 3, 4], 'sigma': 0.02, 'omega': 0.1}


[NeMo W 2025-10-09 10:16:43 nemo_logging:405] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: CUDA is not available


[NeMo I 2025-10-09 10:16:56 nemo_logging:393] Model EncDecRNNTBPEModel was successfully restored from /root/.cache/huggingface/hub/models--nvidia--parakeet-tdt-0.6b-v3/snapshots/be0d803fd1970eca8627f5467c208118f0f6c171/parakeet-tdt-0.6b-v3.nemo.


Transcribing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.47s/it]


üîπ Avaliando modelo HF: danielpedrozo/wav2vec2-portuguese-wpp-checkpoint-480





preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/544 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]


üîπ Resultados ordenados (melhor WER e menor lat√™ncia):
   Rank                               Modelo  WER aproximado  Lat√™ncia (ms)  \
0     1               Citrinet-PT-Gamma-0.25           0.333         592.23   
1     2                  QuartzNet-PT-Ottema           0.333         855.79   
2     3  Parakeet-TDT-0.6b-v3 (multilingual)           0.333        3514.44   
3     4                 FastConformer-Hybrid           0.333       16902.87   
4     5                 wav2vec2-PT-BR-Light           0.500        9936.41   
5     6                         QuartzNet-PT           0.667        1406.17   

                                                                         Transcri√ß√£o  
0     um dois tr√™s testando isto √© um exemplo para a transcri√ß√£o de √°udio para texto  
1     um dois tr√™s testando isto √© um exemplo para a transcri√ß√£o de audio para texto  
2  Um dois, tr√™s testando. Isto √© um exemplo para a transcri√ß√£o de √°udio para texto.  
3   Um dos, tr√™s test