In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch # type: ignore
import librosa # type: ignore

# Choose the compute device up front: CUDA when available, otherwise CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore the processor and the trained CTC model from the same checkpoint
# directory, placing the model on the selected device as it is loaded.
model_path = "D:/Speech_recognition/Model_Training/wav2vec2_finetuned"
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)

print("✅ Model Loaded Successfully on:", device)

✅ Model Loaded Successfully on: cuda


In [2]:
def transcribe_audio(audio_path, sampling_rate=16000):
    """Transcribe one audio file with the notebook's loaded CTC model.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        audio_path: Location of the audio file to transcribe.
        sampling_rate: Rate in Hz that the audio is loaded at and that is
            reported to the processor. Defaults to 16000, the value this
            function previously hard-coded.

    Returns:
        The decoded transcription of the file as a string.

    Raises:
        ValueError: If ``audio_path`` is ``None``/empty (for example when the
            recorder saved nothing) or the file contains no samples.
    """
    # Fail early with a clear message instead of an obscure loader error.
    if not audio_path:
        raise ValueError("audio_path is empty: there is no audio file to transcribe.")

    # Load the audio at the requested rate
    waveform, _ = librosa.load(audio_path, sr=sampling_rate)
    if len(waveform) == 0:
        raise ValueError(f"Audio file contains no samples: {audio_path}")

    # Convert the samples to a model-ready tensor on the inference device
    input_values = processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
    ).input_values
    input_values = input_values.to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id at every position
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription


Toggle to Speak Feature


In [None]:
import sounddevice as sd # type: ignore
import numpy as np # type: ignore
import wave
import keyboard # type: ignore
import os

# Shared recorder state: the stream callback reads `recording` and appends to
# `audio_data`; the toggle routine flips the flag and resets the buffer.
samplerate = 16_000  # rate (Hz) passed to the input stream and the WAV header
audio_data = []      # captured chunks, in arrival order
recording = False    # chunks are only kept while this is True

def callback(indata, frames, time, status):
    """Stream callback: keep a copy of each incoming chunk while recording.

    Args:
        indata: Buffer holding the newly captured chunk. It is copied before
            being stored, so later changes to the buffer do not affect the
            saved data.
        frames: Unused.
        time: Unused.
        status: Stream status passed by the audio library. A truthy value is
            reported instead of being silently dropped, since it signals a
            problem with the captured audio.
    """
    # No `global` needed: the flag is only read and the list only mutated.
    if not recording:
        return
    if status:
        print(f"⚠️ Audio stream status: {status}")
    audio_data.append(indata.copy())

def record_audio_toggle(output_filename="recorded_audio/recorded.wav"):
    """Record from the microphone between two presses of 'R' and save a WAV.

    The first press starts capturing and the second press stops it. The
    captured chunks are written as mono 16-bit audio at ``samplerate`` Hz.

    Args:
        output_filename: Destination path of the WAV file. Its parent
            directory is created if it does not exist.

    Returns:
        ``output_filename`` once the file is written, or ``None`` if no audio
        was captured.
    """
    global recording, audio_data
    audio_data = []
    # Start from a known state even if an earlier run was interrupted mid-take.
    recording = False

    # Create the directory that will actually receive the file, not a fixed one.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print("Press 'R' to start/stop recording...")

    try:
        with sd.InputStream(samplerate=samplerate, channels=1, dtype=np.int16, callback=callback):
            while True:
                if keyboard.is_pressed("r"):
                    recording = not recording  # Toggle recording state
                    if recording:
                        print("Recording started...")
                    else:
                        print("Recording stopped.")
                    # Wait for the key to be released so one press counts once,
                    # including the final press that stops the take.
                    while keyboard.is_pressed("r"):
                        sd.sleep(10)
                    if not recording:
                        break
                else:
                    # Short pause so the polling loop does not spin a CPU core.
                    sd.sleep(10)
    finally:
        # Never leave the flag set if the loop exits through an exception.
        recording = False

    # Prevent saving an empty file
    if not audio_data:
        print("No audio recorded. File not saved.")
        return None

    # Save as WAV file
    audio_np = np.concatenate(audio_data, axis=0)
    with wave.open(output_filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit audio
        wf.setframerate(samplerate)
        wf.writeframes(audio_np.tobytes())

    print(f"Audio saved as {output_filename}")
    return output_filename


In [5]:
recorded_file = record_audio_toggle()

# The recorder returns None when nothing was captured; transcribing that would
# fail inside the audio loader, so fall back to an empty transcription.
if recorded_file is None:
    transcription = ""
    print("⚠️ No audio was recorded, so there is nothing to transcribe.")
else:
    transcription = transcribe_audio(recorded_file)
    print("📝 Transcription:", transcription)

Press 'R' to start/stop recording...
Recording started...
Recording stopped.
Audio saved as recorded_audio/recorded.wav
📝 Transcription: THIS PALE SMELL OF OLD BEER LINGERSIT TAKES HEAD TO BRING OUT THE ODOURA COLD DIPRESTORES HEALTH AND ZUSTA SALT PICKLE TASTES VINE WITH HAINCUCKLES ALL PASTORE ARE MY FAVOURITEA ZSTFUL FOOD IS THE HOT CROSS BUN


Upload File Feature


Using NLP Techniques


In [None]:
import os
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then read the Groq key from
# the environment so it is never written into the notebook.
load_dotenv()
groq_api_key = os.environ.get("GROQ_API_KEY")

# ✅ Groq chat client used for the transcription clean-up step
groq_llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="mixtral-8x7b-32768",  # must be a model Groq currently serves
    temperature=0.1,
)



def _strip_list_number(line):
    """Drop a leading "N. " list marker; leave unnumbered lines untouched."""
    marker, separator, remainder = line.partition(". ")
    if separator and marker.strip().isdigit():
        return remainder
    return line


def batch_refine_transcriptions(asr_outputs, history_file="transcriptions_history.txt", max_history_lines=50):
    """Ask the Groq LLM to correct a batch of raw ASR transcriptions.

    Earlier corrections stored in ``history_file`` are included in the prompt
    as context, and the new corrections are appended to that file.

    Args:
        asr_outputs: List of raw transcription strings. A single string is
            accepted too and treated as a one-item batch.
        history_file: Text file holding previously corrected transcriptions,
            one per line.
        max_history_lines: Number of most recent history lines to include in
            the prompt, so the prompt does not grow without bound as the
            history file grows. Pass ``None`` to include the whole file.

    Returns:
        A list of corrected strings with list numbering removed; an empty list
        if ``asr_outputs`` is empty; or a single-element list holding an error
        message if the LLM returned nothing usable.
    """
    if not asr_outputs:
        return []

    # A bare string would otherwise be enumerated character by character.
    if isinstance(asr_outputs, str):
        asr_outputs = [asr_outputs]

    # Load previous transcriptions as context if the file exists
    previous_transcriptions = []
    if os.path.exists(history_file):
        with open(history_file, "r", encoding="utf-8") as file:
            previous_transcriptions = file.readlines()
    if max_history_lines is not None:
        # A plain [-0:] slice would keep everything, so handle <= 0 explicitly.
        previous_transcriptions = (
            previous_transcriptions[-max_history_lines:] if max_history_lines > 0 else []
        )
    history_block = "".join(previous_transcriptions)

    # Number the inputs so the model can answer item by item
    formatted_transcriptions = "\n".join(
        f"{index}. {text}" for index, text in enumerate(asr_outputs, start=1)
    )

    # Create the prompt with history for better corrections
    prompt = f"""
    You are an advanced ASR correction assistant that fixes phonetic and contextual mistakes in transcriptions.
    - Correct phonetic errors 
    - Correct name recognition 
    - Ensure proper grammar while preserving original meaning.
    - Refer to previous corrected transcriptions when necessary.

    Previous transcriptions:
    {history_block}

    Here are multiple ASR outputs. Correct each one and return ONLY the corrected texts, numbered accordingly:
    {formatted_transcriptions}
    """

    # Get response from Groq API
    response = groq_llm.invoke(prompt)
    if not response or not hasattr(response, "content"):
        return ["❌ Error: No response from Groq API"]

    # Assumes response.content is a plain string (TODO confirm for this client).
    # Only a leading "N. " marker is removed: splitting on the first ". "
    # anywhere would delete the first sentence of an unnumbered reply.
    corrected_texts = [
        _strip_list_number(line.strip())
        for line in response.content.strip().splitlines()
        if line.strip()
    ]

    # Save new transcriptions
    with open(history_file, "a", encoding="utf-8") as file:
        for text in corrected_texts:
            file.write(text + "\n")

    return corrected_texts

In [None]:
# The refiner expects a list of transcriptions; passing the bare string would
# make it number every character as its own item. Skip the call entirely when
# there is nothing to refine.
asr_batch = [transcription] if transcription else []
corrected_texts = batch_refine_transcriptions(asr_batch)
for position, corrected in enumerate(corrected_texts, start=1):
    print(f"✅ Corrected {position}: {corrected}")


✅ Corrected 1: This pale smell of old beer lingers. It takes head to bring out the odor. Cold dip restores health and zest. A salt pickle tastes vinegar with huckleberries. All pastries are my favorite. The most fulfilling food is the hot cross bun.


In [2]:
import sounddevice as sd
import numpy as np
import wave

# Microphone sanity check: capture a few seconds of input and confirm that the
# stream delivered data.
samplerate = 16000
duration = 5  # seconds


def _capture_for(duration_s, rate):
    """Record from the default input for ``duration_s`` seconds.

    The stream callback is a local closure, so this check does not rebind the
    module-level ``callback`` that ``record_audio_toggle`` looks up when it
    runs. Rebinding it would make that recorder ignore its start/stop toggle.

    Returns:
        The list of captured chunks, in arrival order.
    """
    chunks = []

    def _collect(indata, frames, time, status):
        chunks.append(indata.copy())

    with sd.InputStream(samplerate=rate, channels=1, dtype=np.int16, callback=_collect):
        sd.sleep(int(duration_s * 1000))
    return chunks


audio_data = _capture_for(duration, samplerate)

if not audio_data:
    print("⚠️ No audio recorded!")
else:
    print("✅ Audio recorded successfully!")


✅ Audio recorded successfully!
