Using the AccentDB dataset because it includes Indian-accented speech.


In [None]:
import os
import librosa # type: ignore
import numpy as np # type: ignore
import librosa.display # type: ignore
import matplotlib.pyplot as plt # type: ignore

In [None]:
# Root directory that is searched recursively for .wav recordings.
# NOTE: this is an absolute, machine-specific Windows path; adjust it to the
# local location of the "indian" audio folder before running.
DATASET_PATH = r"D:\Speech_recognition\AudioFiles\indian"  

def load_audio_files(path):
    """Recursively collect the paths of all WAV files under ``path``.

    The extension check is case-insensitive, so files named ``*.WAV`` are
    matched as well as ``*.wav``. The result is sorted so that the order of
    the list (and any index derived from it, e.g. when numbering output
    files) is reproducible across runs and platforms.

    Args:
        path: Root directory to search.

    Returns:
        A sorted list of file paths, each built by joining the directory in
        which the file was found with its file name. The list is empty when
        ``path`` does not exist or contains no WAV files.
    """
    audio_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(path)
        for name in names
        # Compare on the lower-cased name so ".WAV" / ".Wav" are not skipped.
        if name.lower().endswith(".wav")
    ]
    # os.walk gives no ordering guarantee; sort for a deterministic result.
    return sorted(audio_files)

# Discover every WAV file below the dataset root and report the total.
audio_files = load_audio_files(DATASET_PATH)
num_audio_files = len(audio_files)
print(f"Found {num_audio_files} audio files.")

In [None]:
def extract_spectrogram(audio_path, target_sr=16000):
    """Compute a decibel-scaled mel spectrogram for a single audio file.

    The file is loaded through ``librosa.load`` at ``target_sr``, turned into
    a 128-band mel power spectrogram, and converted to dB with
    ``librosa.power_to_db`` using ``np.max`` as the reference.

    Args:
        audio_path: Path of the audio file to load.
        target_sr: Sampling rate handed to ``librosa.load``.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel spectrogram.
    """
    samples, rate = librosa.load(audio_path, sr=target_sr)
    mel_power = librosa.feature.melspectrogram(y=samples, sr=rate, n_mels=128)
    return librosa.power_to_db(mel_power, ref=np.max)

# Visual sanity check: render the spectrogram of the first discovered file.
sample_spectrogram = extract_spectrogram(audio_files[0])

fig, ax = plt.subplots(figsize=(10, 4))
mesh = librosa.display.specshow(
    sample_spectrogram,
    x_axis="time",
    y_axis="mel",
    sr=16000,  # same rate as extract_spectrogram's default target_sr
    ax=ax,
)
fig.colorbar(mesh, ax=ax, format="%+2.0f dB")
ax.set_title("Mel Spectrogram of Sample Audio")
plt.show()


In [None]:
# The spectrograms are computed from the recordings found under DATASET_PATH
# (the "indian" folder), so they are written to a matching "indian" output
# folder. Saving them under "american" would mislabel the features.
OUTPUT_DIR = "D:/Speech_recognition/Processed_Spectrograms/indian"

# Create the output folder (and any missing parents); no error if it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One .npy file per recording, numbered by its position in audio_files.
for i, file in enumerate(audio_files):
    spec = extract_spectrogram(file)
    np.save(os.path.join(OUTPUT_DIR, f"spec_{i}.npy"), spec)
    print(f"Saved spectrogram {i+1}/{len(audio_files)}")



In [None]:
import torch # type: ignore
import torchaudio # type: ignore
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

In [None]:
# Hugging Face checkpoint used for both the processor and the CTC model.
model_name = "facebook/wav2vec2-large-960h"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Load the processor and the pre-trained model, then move the model's
# weights to the selected device.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to(device)

In [None]:
import librosa # type: ignore

def transcribe_audio(audio_path):
    """Transcribe one audio file with the module-level Wav2Vec2 model.

    Relies on the globals ``processor``, ``model`` and ``device``.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The first entry of ``processor.batch_decode`` for the predicted ids.
    """
    # Load the audio at 16 kHz (required by Wav2Vec2).
    samples, _ = librosa.load(audio_path, sr=16000)

    # Let the processor build the model input as a PyTorch tensor and move it
    # to the same device as the model.
    encoded = processor(samples, return_tensors="pt", sampling_rate=16000)
    model_input = encoded.input_values.to(device)

    # Forward pass without gradient tracking; take the highest-scoring id
    # along the last dimension of the logits.
    with torch.no_grad():
        token_ids = model(model_input).logits.argmax(dim=-1)

    # Decode the predicted ids; a single file was passed, so use entry 0.
    return processor.batch_decode(token_ids)[0]

# Example usage: transcribe a single recording and print the recognized text.
# The path is machine-specific; point it at a WAV file that exists locally.
audio_file = "D:/Speech_recognition/AudioFiles/indian/speaker_01/indian_s01_001.wav"  # Change this path
transcription = transcribe_audio(audio_file)
print(f"Transcription: {transcription}")

In [None]:
import os
import json

# Destination JSON file and the dataset root to transcribe.
OUTPUT_TRANSCRIPTIONS = "D:/Speech_recognition/transcriptions.json"
DATASET_PATH = "D:/Speech_recognition/AudioFiles/indian"

# Gather every .wav file below the dataset root.
audio_files = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(DATASET_PATH)
    for filename in filenames
    if filename.endswith(".wav")
]
total_files = len(audio_files)

# Map each audio path to its transcription, logging progress per file.
transcriptions = {}
for position, wav_path in enumerate(audio_files, start=1):
    print(f"Processing {position}/{total_files}: {wav_path}")
    transcriptions[wav_path] = transcribe_audio(wav_path)

# Write all results to the JSON file in one go.
with open(OUTPUT_TRANSCRIPTIONS, "w") as out_file:
    json.dump(transcriptions, out_file, indent=4)

print(f"✅ Transcriptions saved to {OUTPUT_TRANSCRIPTIONS}")
