# This is the speaking part of GenIELTS, a phonemes error detection system

### First we should convert audio input to a text

In [1]:
import torch
import torchaudio
from transformers import AutoTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, BitsAndBytesConfig
import librosa
import numpy as np
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai/whisper-small.en"

# Load processor
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
model.to(device)



WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

In [3]:
def audio_to_text(audio_file_path, sample_rate=16000):
    """
    Convert audio file to text using Whisper model correctly
    
    Args:
        audio_file_path (str): Path to the audio file
        sample_rate (int): Target sample rate for the model
    
    Returns:
        str: Transcribed text
    """
    # Load and preprocess audio
    audio, sr = librosa.load(audio_file_path, sr=sample_rate)
    
    # Process audio with the model's processor
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    
    # Move inputs to device
    if torch.cuda.is_available():
        inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate transcription using the proper generation method
    with torch.no_grad():
        # Use model.generate() instead of direct forward pass
        generated_ids = model.generate(
            inputs["input_features"],
            max_length=448,
            do_sample=False,
            temperature=0.0,
            return_timestamps=False
        )
    
    # Decode the generated IDs to text
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    return transcription

def audio_to_text_chunked(audio_file_path, sample_rate=16000, chunk_length_s=30):
    """
    Convert audio file to text using Whisper model with chunking.
    
    Args:
        audio_file_path (str): Path to the audio file
        sample_rate (int): Target sample rate for the model
        chunk_length_s (int): Length of audio chunks in seconds
    
    Returns:
        str: Transcribed text
    """
    # Load audio
    audio, sr = librosa.load(audio_file_path, sr=sample_rate)
    
    chunk_size = chunk_length_s * sample_rate
    num_chunks = (len(audio) + chunk_size - 1) // chunk_size
    
    full_transcription = ""
    
    for i in tqdm(range(num_chunks), desc=f"Chunking {audio_file_path}"):
        start = i * chunk_size
        end = start + chunk_size
        chunk = audio[start:end]
        
        # Process audio with the model's processor
        inputs = processor(chunk, sampling_rate=sample_rate, return_tensors="pt")

        # Move inputs to device
        if torch.cuda.is_available():
            inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate transcription using the proper generation method
        with torch.no_grad():
            generated_ids = model.generate(
                inputs["input_features"],
                max_length=448,
                do_sample=False,
                temperature=0.0,
                return_timestamps=False
            )

        # Decode the generated IDs to text
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        full_transcription += transcription + " "
        
    return full_transcription.strip()

In [4]:
# Alternative approach using Transformers pipeline (more robust)
from transformers import pipeline

def audio_to_text_pipeline(audio_file_path, chunk_length_s=30):
    """
    Convert audio file to text using Transformers pipeline (most reliable method)
    
    Args:
        audio_file_path (str): Path to the audio file
        chunk_length_s (int): Length of audio chunks in seconds
    
    Returns:
        str: Transcribed text
    """
    try:
        # Create a speech recognition pipeline
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if torch.cuda.is_available() else -1,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            chunk_length_s=chunk_length_s,
            return_timestamps=True  # Enable timestamps for better alignment
        )
        
        # Process the audio file
        result = pipe(audio_file_path)
        
        # Extract text from result
        if isinstance(result, dict):
            return result.get("text", "")
        elif isinstance(result, list):
            return " ".join([chunk.get("text", "") for chunk in result])
        else:
            return str(result)
            
    except Exception as e:
        print(f"Pipeline method failed: {e}")
        print("Falling back to manual method...")
        return audio_to_text_chunked(audio_file_path, chunk_length_s=chunk_length_s)

def audio_to_text_simple(audio_file_path):
    """
    Simple audio to text conversion using librosa and basic processing
    
    Args:
        audio_file_path (str): Path to the audio file
    
    Returns:
        str: Transcribed text
    """
    try:
        # Try pipeline method first (most reliable)
        return audio_to_text_pipeline(audio_file_path)
    except Exception as e:
        print(f"Pipeline failed, trying chunked method: {e}")
        try:
            # Fallback to chunked method
            return audio_to_text_chunked(audio_file_path)
        except Exception as e2:
            print(f"Chunked method failed: {e2}")
            # Final fallback
            return audio_to_text(audio_file_path)

In [5]:
# Test audio transcription functionality
print("Testing Audio Transcription Methods")
print("=" * 50)

# Check if audio file exists
import os
audio_files = ["Recording.wav", "test.wav", "artest.wav"]
available_files = [f for f in audio_files if os.path.exists(f)]

if available_files:
    test_file = available_files[0]
    print(f"Testing with: {test_file}")
    
    try:
        # Test the simple method
        print("\n1. Testing audio_to_text_simple()...")
        transcript = audio_to_text_simple(test_file)
        print(f"✓ Success! Transcript: '{transcript[:100]}...'")
        
    except Exception as e:
        print(f"❌ Error in audio_to_text_simple: {e}")
        
        try:
            # Test pipeline method directly
            print("\n2. Testing pipeline method...")
            transcript = audio_to_text_pipeline(test_file)
            print(f"✓ Pipeline success! Transcript: '{transcript[:100]}...'")
            
        except Exception as e2:
            print(f"❌ Pipeline error: {e2}")
            
            try:
                # Test basic method
                print("\n3. Testing basic method...")
                transcript = audio_to_text(test_file)
                print(f"✓ Basic method success! Transcript: '{transcript[:100]}...'")
                
            except Exception as e3:
                print(f"❌ All methods failed: {e3}")
                print("Please check your model and audio file setup.")
                
else:
    print("❌ No audio files found for testing.")
    print("Available files in directory:", os.listdir("."))

print("=" * 50)

Testing Audio Transcription Methods
Testing with: Recording.wav

1. Testing audio_to_text_simple()...


Device set to use cuda:0


Pipeline method failed: ffmpeg was not found but is required to load audio files from filename
Falling back to manual method...


Chunking Recording.wav:   0%|          | 0/1 [00:00<?, ?it/s]Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Chunking Recording.wav: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]

✓ Success! Transcript: 'How are you? What are you doing? I would like...'





In [6]:
import csv
import re

def parse_britfone(file_path):
    """
    Parses the Britfone csv file into a dictionary.
    """
    pronunciations = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader) # skip header
        for row in reader:
            word = row[0].lower()
            phonemes = row[1]
            
            # Handle multiple pronunciations (e.g., word(1))
            word = re.sub(r'\(\d+\)', '', word)
            
            if word in pronunciations:
                if phonemes not in pronunciations[word]:
                    pronunciations[word].append(phonemes)
            else:
                pronunciations[word] = [phonemes]
    return pronunciations

# Parse the downloaded Britfone file
britfone_lexicon = parse_britfone("britfone.main.3.0.1.csv")


In [7]:

from g2p_en import G2p

# Initialize the G2P converter
g2p = G2p()

def alternative_g2p(word):
    """
    Alternative G2P using g2p-en library (doesn't require espeak).
    """
    try:
        phonemes = g2p(word)
        # Convert to a format similar to britfone
        return ' '.join(phonemes)
    except Exception as e:
        print(f"G2P failed for '{word}': {e}")
        return word.lower()

def get_phonemes_enhanced(word, lexicon):
    """
    Enhanced phoneme lookup with multiple fallback options.
    """
    word = word.lower()
    if word in lexicon:
        return lexicon[word][0]  # Return the first pronunciation from Britfone
    else:
        # Try alternative G2P first
        try:
            return alternative_g2p(word)
        except Exception as e:
            print(f"Alternative G2P failed for '{word}': {e}")
            # Final fallback - simple phonetic approximation
            return word.lower()

In [8]:
def transcript_to_phonemes(transcript, lexicon):
    """
    Convert transcript text to phonemes and print the results.
    
    Args:
        transcript (str): The transcribed text from audio
        lexicon (dict): The phoneme dictionary (britfone_lexicon)
    """
    # Clean and split the transcript into words
    words = re.sub(r'[^\w\s]', '', transcript.lower()).split()
    
    print(f"Original transcript: {transcript}")
    print(f"Number of words: {len(words)}")
    print("-" * 60)
    
    phoneme_results = []
    
    for i, word in enumerate(words, 1):
        phonemes = get_phonemes_enhanced(word, lexicon)
        phoneme_results.append((word, phonemes))
        print(f"{i:2d}. Word: '{word}' -> Phonemes: [{phonemes}]")
    
    print("-" * 60)
    print("Complete phoneme sequence:")
    print(" | ".join([phonemes for word, phonemes in phoneme_results]))
    
    return phoneme_results

# Example usage with audio transcription
def process_audio_to_phonemes(audio_file_path):
    """
    Complete pipeline: Audio -> Transcript -> Phonemes
    
    Args:
        audio_file_path (str): Path to the audio file
    
    Returns:
        list: List of (word, phonemes) tuples
    """
    print(f"Processing audio file: {audio_file_path}")
    print("=" * 60)
    
    # Step 1: Convert audio to text using the most reliable method
    try:
        transcript = audio_to_text_simple(audio_file_path)
    except Exception as e:
        print(f"All audio transcription methods failed: {e}")
        print("Please check your audio file and model setup.")
        return "", []
    
    # Step 2: Convert transcript to phonemes
    phoneme_results = transcript_to_phonemes(transcript, britfone_lexicon)
    
    return transcript, phoneme_results

In [9]:
# IPA to ARPABET and ARPABET to IPA mapping dictionaries
# Based on CMU Pronouncing Dictionary and standard phonetic mappings

IPA_TO_ARPABET = {
    # Vowels
    'i': 'IY',      # beat
    'ɪ': 'IH',      # bit
    'e': 'EY',      # bait
    'ɛ': 'EH',      # bet
    'æ': 'AE',      # bat
    'ɑ': 'AA',      # bot
    'ɔ': 'AO',      # bought
    'o': 'OW',      # boat
    'ʊ': 'UH',      # book
    'u': 'UW',      # boot
    'ʌ': 'AH',      # but
    'ə': 'AH',      # about (schwa -> AH)
    'ɚ': 'ER',      # butter
    'ɝ': 'ER',      # bird
    
    # Diphthongs
    'aɪ': 'AY',     # bite
    'aʊ': 'AW',     # bout
    'ɔɪ': 'OY',     # boy
    'eɪ': 'EY',     # bait
    'oʊ': 'OW',     # boat
    'ɪə': 'IH R',   # beer
    'ɛə': 'EH R',   # bear
    'ʊə': 'UH R',   # tour
    
    # Consonants
    'p': 'P',       # pat
    'b': 'B',       # bat
    't': 'T',       # tat
    'd': 'D',       # dad
    'k': 'K',       # cat
    'g': 'G',       # gap
    'f': 'F',       # fat
    'v': 'V',       # vat
    'θ': 'TH',      # think
    'ð': 'DH',      # that
    's': 'S',       # sat
    'z': 'Z',       # zap
    'ʃ': 'SH',      # ship
    'ʒ': 'ZH',      # measure
    'h': 'HH',      # hat
    'm': 'M',       # mat
    'n': 'N',       # nat
    'ŋ': 'NG',      # sing
    'l': 'L',       # lat
    'r': 'R',       # rat
    'w': 'W',       # way
    'j': 'Y',       # yet
    
    # Affricates
    'tʃ': 'CH',     # church
    'dʒ': 'JH',     # judge
    
    # Additional symbols
    'ʔ': '',        # glottal stop (often omitted in ARPABET)
    ' ': ' ',       # word boundary
    '.': '',        # syllable boundary (omitted in ARPABET)
}

# Create reverse mapping
ARPABET_TO_IPA = {v: k for k, v in IPA_TO_ARPABET.items() if v != ''}

# Handle special cases for reverse mapping
ARPABET_TO_IPA.update({
    'IY': 'i',
    'IH': 'ɪ', 
    'EY': 'eɪ',
    'EH': 'ɛ',
    'AE': 'æ',
    'AA': 'ɑ',
    'AO': 'ɔ',
    'OW': 'oʊ',
    'UH': 'ʊ',
    'UW': 'u',
    'AH': 'ʌ',  # Primary mapping for AH
    'ER': 'ɝ',
    'AY': 'aɪ',
    'AW': 'aʊ',
    'OY': 'ɔɪ',
    'HH': 'h',
    'CH': 'tʃ',
    'JH': 'dʒ',
    'TH': 'θ',
    'DH': 'ð',
    'SH': 'ʃ',
    'ZH': 'ʒ',
    'NG': 'ŋ',
    'Y': 'j'
})

def ipa_to_arpabet(ipa_string):
    """
    Convert IPA phoneme string to ARPABET format.
    
    Args:
        ipa_string (str): IPA phoneme string (space-separated or continuous)
    
    Returns:
        str: ARPABET phoneme string (space-separated)
    """
    if not ipa_string:
        return ""
    
    # Handle both space-separated and continuous IPA strings
    result = []
    i = 0
    
    while i < len(ipa_string):
        # Try to match longer sequences first (for diphthongs and affricates)
        matched = False
        
        # Check for 3-character sequences
        if i + 2 < len(ipa_string):
            three_char = ipa_string[i:i+3]
            if three_char in IPA_TO_ARPABET:
                arpabet = IPA_TO_ARPABET[three_char]
                if arpabet:
                    result.append(arpabet)
                i += 3
                matched = True
        
        # Check for 2-character sequences
        if not matched and i + 1 < len(ipa_string):
            two_char = ipa_string[i:i+2]
            if two_char in IPA_TO_ARPABET:
                arpabet = IPA_TO_ARPABET[two_char]
                if arpabet:
                    result.append(arpabet)
                i += 2
                matched = True
        
        # Check for single character
        if not matched:
            char = ipa_string[i]
            if char in IPA_TO_ARPABET:
                arpabet = IPA_TO_ARPABET[char]
                if arpabet:
                    result.append(arpabet)
            elif char == ' ':
                # Handle word boundaries
                if result and result[-1] != '|':
                    result.append('|')  # Word boundary marker
            i += 1
    
    return ' '.join(result)

def arpabet_to_ipa(arpabet_string):
    """
    Convert ARPABET phoneme string to IPA format.
    
    Args:
        arpabet_string (str): ARPABET phoneme string (space-separated)
    
    Returns:
        str: IPA phoneme string
    """
    if not arpabet_string:
        return ""
    
    # Split by spaces and convert each phoneme
    phonemes = arpabet_string.strip().split()
    ipa_result = []
    
    for phoneme in phonemes:
        # Remove stress markers (0, 1, 2) from vowels
        clean_phoneme = phoneme.rstrip('012')
        
        if clean_phoneme in ARPABET_TO_IPA:
            ipa_result.append(ARPABET_TO_IPA[clean_phoneme])
        elif clean_phoneme == '|':
            ipa_result.append(' ')  # Word boundary
        else:
            # If phoneme not found, keep as is (might be a variant)
            ipa_result.append(clean_phoneme.lower())
    
    return ''.join(ipa_result)

def convert_phonemes_for_alignment(phoneme_results, output_format='arpabet'):
    """
    Convert phoneme results to specified format for force alignment.
    
    Args:
        phoneme_results (list): List of (word, phonemes) tuples
        output_format (str): 'arpabet' or 'ipa'
    
    Returns:
        list: List of (word, converted_phonemes) tuples
    """
    converted_results = []
    
    for word, phonemes in phoneme_results:
        if output_format.lower() == 'arpabet':
            # Assume input is IPA, convert to ARPABET
            converted = ipa_to_arpabet(phonemes)
        elif output_format.lower() == 'ipa':
            # Assume input is ARPABET, convert to IPA
            converted = arpabet_to_ipa(phonemes)
        else:
            converted = phonemes  # No conversion
        
        converted_results.append((word, converted))
    
    return converted_results

def display_phoneme_comparison(word, ipa_phonemes, arpabet_phonemes):
    """
    Display a comparison of IPA and ARPABET phonemes for a word.
    
    Args:
        word (str): The word
        ipa_phonemes (str): IPA phoneme representation
        arpabet_phonemes (str): ARPABET phoneme representation
    """
    print(f"Word: '{word}'")
    print(f"  IPA:     [{ipa_phonemes}]")
    print(f"  ARPABET: [{arpabet_phonemes}]")
    print("-" * 40)


In [10]:
# Let's test the conversion functions
if 'transcript' in locals() and transcript:
    # Process the first few words for demonstration
    sample_phonemes = transcript_to_phonemes(transcript.split('.')[0], britfone_lexicon)
    
    # Convert to ARPABET
    arpabet_results = convert_phonemes_for_alignment(sample_phonemes, 'arpabet')
    
    # Display comparison
    for i, (word, ipa_phonemes) in enumerate(sample_phonemes):
        arpabet_phonemes = arpabet_results[i][1]
        display_phoneme_comparison(word, ipa_phonemes, arpabet_phonemes)
else:
    print("Transcript not available. Please run the audio processing cells first.")

print(sample_phonemes)

Original transcript: How are you? What are you doing? I would like
Number of words: 10
------------------------------------------------------------
 1. Word: 'how' -> Phonemes: [ h ˈaʊ]
 2. Word: 'are' -> Phonemes: [ ə]
 3. Word: 'you' -> Phonemes: [ j ə]
 4. Word: 'what' -> Phonemes: [ w ˈɒ t]
 5. Word: 'are' -> Phonemes: [ ə]
 6. Word: 'you' -> Phonemes: [ j ə]
 7. Word: 'doing' -> Phonemes: [ d ˈuː ɪ ŋ]
 8. Word: 'i' -> Phonemes: [ ˈaɪ]
 9. Word: 'would' -> Phonemes: [ w ˈʊ d]
10. Word: 'like' -> Phonemes: [ l ˈaɪ k]
------------------------------------------------------------
Complete phoneme sequence:
 h ˈaʊ |  ə |  j ə |  w ˈɒ t |  ə |  j ə |  d ˈuː ɪ ŋ |  ˈaɪ |  w ˈʊ d |  l ˈaɪ k
Word: 'how'
  IPA:     [ h ˈaʊ]
  ARPABET: [  HH   AW]
----------------------------------------
Word: 'are'
  IPA:     [ ə]
  ARPABET: [  AH]
----------------------------------------
Word: 'you'
  IPA:     [ j ə]
  ARPABET: [  Y   AH]
----------------------------------------
Word: 'what'
  IPA:     [ w 