In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Directory the fine-tuned processor and model are both restored from
model_path = "D:/Speech_recognition/wav2vec2_finetuned"

# Prefer the GPU when torch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the processor and the CTC model, placing the model on the chosen device
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)

print("✅ Model Loaded Successfully on:", device)

✅ Model Loaded Successfully on: cuda


In [2]:
def transcribe_audio(audio_path, sampling_rate=16000):
    """Transcribe one audio file with the notebook's Wav2Vec2 CTC model.

    Uses the notebook-level ``processor``, ``model`` and ``device`` objects,
    so the model-loading cell must have been run first.

    Args:
        audio_path: Path of the audio file to transcribe.
        sampling_rate: Rate in Hz that the audio is resampled to while loading
            and that is reported to the processor. Defaults to 16000, the value
            this notebook previously hard-coded.
            NOTE(review): this presumably has to match the rate the processor
            was configured with -- confirm before passing another value.

    Returns:
        The decoded transcription of the file as a string.

    Raises:
        ValueError: If no audio samples could be read from ``audio_path``.
    """
    # Load the clip, resampling to the requested rate in the same step
    waveform, _ = librosa.load(audio_path, sr=sampling_rate)

    # Fail early with a readable message instead of an opaque shape error
    # from inside the model when the clip contains no samples.
    if waveform.size == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    # Turn the raw samples into a batched tensor on the model's device
    features = processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    input_values = features.input_values.to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id at every position
    predicted_ids = torch.argmax(logits, dim=-1)

    # The batch holds a single clip, so return its only decoded entry
    return processor.batch_decode(predicted_ids)[0]


In [3]:
# Clip to transcribe -- replace with the path of an actual audio file
audio_path = "D:/Speech_recognition/Test.wav"
transcription = transcribe_audio(audio_path=audio_path)
print("📝 Transcription:", transcription)


📝 Transcription: THE VAMP OF THE SHOE HAD A GOLD BUCKLE


Using an NLP Technique: Refining ASR Transcriptions with an LLM


In [19]:
import os
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# Let python-dotenv populate the environment, then read the Groq API key from it
load_dotenv()
groq_api_key = os.environ.get("GROQ_API_KEY")

# Groq chat client used by the correction step; the temperature is kept low
# so that corrections favour accuracy
groq_llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="mistral-saba-24b",
    temperature=0.1,
)

def _parse_numbered_lines(content):
    """Split an LLM reply into one cleaned text per line, dropping list numbering."""
    import re  # function-scope import: the notebook's import cells do not bring in `re`

    # Matches a leading list marker such as "1. " or "2) " (or a bare "3.")
    marker = re.compile(r"^\d+[.)](?:\s+|$)")

    numbered, all_lines = [], []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line:
            continue  # blank separator lines carry no transcription
        match = marker.match(line)
        if match:
            text = line[match.end():].strip()
            numbered.append(text)
            all_lines.append(text)
        else:
            all_lines.append(line)

    # When the model followed the numbered format, keep only the numbered
    # lines so preambles such as "Here are the corrections:" are dropped.
    return numbered if numbered else all_lines


def batch_refine_transcriptions(asr_outputs, llm=None):
    """Correct several ASR transcriptions with a single LLM call.

    The transcriptions are sent as a numbered list (one per line) and the
    reply is parsed back into a list of corrected texts.

    Args:
        asr_outputs: Sequence of raw ASR transcription strings.
        llm: Optional chat-model object exposing ``invoke(prompt)`` whose
            result has a ``content`` attribute. Defaults to the notebook-level
            ``groq_llm``; pass a stub to run without calling the Groq API.

    Returns:
        A list of corrected texts with list numbering removed. Returns ``[]``
        for empty input, and a one-element list holding an error message when
        the reply has missing, non-text or blank content. The list length is
        whatever the model returned; it is not forced to match ``asr_outputs``.
    """
    if not asr_outputs:
        return []  # Nothing to correct, so skip the API call entirely

    if llm is None:
        llm = groq_llm

    # One transcription per line: collapse internal whitespace (including
    # newlines) so a multi-line input cannot break the numbered list.
    numbered_inputs = []
    for number, text in enumerate(asr_outputs, start=1):
        single_line = " ".join(str(text).split())
        numbered_inputs.append(f"{number}. {single_line}")
    formatted_transcriptions = "\n".join(numbered_inputs)

    # NOTE(review): the two examples in the prompt are the exact corrections
    # for this notebook's demo sentence, so the demo output does not show how
    # well the prompt generalizes to other inputs.
    prompt = f"""
    You are an advanced ASR correction assistant that fixes phonetic and contextual mistakes in transcriptions.
    - Correct phonetic errors (e.g., 'sake fully ter' → 'sexual intercourse')
    - Correct name recognition (e.g., 'Redgie' → 'Vraj')
    - Ensure proper grammar while preserving original meaning.

    Here are multiple ASR outputs. Correct each one and return ONLY the corrected texts, numbered accordingly:

    {formatted_transcriptions}
    """

    response = llm.invoke(prompt)

    # Guard against a missing response as well as missing/None/blank content;
    # calling .strip() on None content would otherwise raise AttributeError.
    content = getattr(response, "content", None)
    if not isinstance(content, str) or not content.strip():
        return ["❌ Error: No response from Groq API"]

    return _parse_numbered_lines(content)


In [20]:
# Raw ASR output(s) to send through the LLM correction step
asr_outputs = ["I HEAD SAKE FULLY TER COUSE WITH REDGIE'S MOTHER"]

corrected_texts = batch_refine_transcriptions(asr_outputs)

# Show every corrected transcription with a 1-based position
for position, corrected in enumerate(corrected_texts, start=1):
    print(f"✅ Corrected {position}: {corrected}")


✅ Corrected 1: I had sexual intercourse with Vraj's mother
