In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch # type: ignore
import librosa # type: ignore

# Load processor & trained model
model_path = "D:/Speech_recognition/wav2vec2_finetuned"
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path)

# Move model to GPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=1024, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder

In [8]:
def transcribe_audio(audio_path):
    # Load audio file
    waveform, _ = librosa.load(audio_path, sr=16000)
    
    # Convert audio to tensor
    input_values = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True).input_values
    input_values = input_values.to(device)

    # Perform inference
    with torch.no_grad():
        logits = model(input_values).logits

    # Decode prediction
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

In [9]:
audio_path = "D:/Speech_recognition/harvard.wav"  #actual audio file
transcription = transcribe_audio(audio_path)
print("📝 Transcription:", transcription)

📝 Transcription: THE STALE SMELL OF OLD BEER LINGERSIT TAKES HEAT TO BRING OUT THE ODOURA COLD DIPRESTORES HEALTH AND ZESTA SALT PICKLE TASTES FINE WITH HAMTUCKLES ALL PASTORE ARE MY FAVOURITE A ZESTFUL FOOD IS THE HOT CROSS BUN
