Import all of the required libraries (they must already be installed in the environment).

In [None]:

import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import subprocess
import tempfile
import shutil
import json
import numpy as np
import librosa
import matplotlib.pyplot as plt
import librosa.display
import torch
import torch.nn as nn
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from pydub import AudioSegment
import soundfile as sf
from textgrid import TextGrid
from librosa.feature import mfcc as librosa_mfcc
import scipy.spatial
import scipy.signal
import wave
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio


The Montreal Forced Aligner (MFA) expects a single-channel (mono) .WAV audio file with a 16 kHz sample rate. MP3 files used during development and WebM recordings from the front end therefore need to be converted first.

In [None]:
# ------------------ Audio Preprocessing ------------------

def convert_mp3_to_wav(input_file: str, output_file: str) -> str:
    """Decode an MP3 file and write it out as a 16 kHz mono WAV.

    Returns ``output_file`` so the call can be used inline.
    """
    decoded = AudioSegment.from_mp3(input_file)
    resampled = decoded.set_frame_rate(16000)
    mono = resampled.set_channels(1)
    mono.export(output_file, format="wav")
    return output_file

def _run_ffmpeg(cmd: list, output_file: str, label: str) -> bool:
    """Run one FFmpeg command; return True only if it exited 0 and wrote a non-empty file."""
    try:
        result = subprocess.run(
            cmd,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
            text=True,
            errors="replace",  # FFmpeg diagnostics are not guaranteed to be valid UTF-8
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing ffmpeg executable.
        print(f"{label} command failed: {e}")
        return False

    # The exit code must be checked: a leftover or partial output file from an
    # earlier attempt would otherwise make a failed run look successful.
    if (
        result.returncode == 0
        and os.path.exists(output_file)
        and os.path.getsize(output_file) > 0
    ):
        print(f"{label} conversion successful")
        return True

    print(f"{label} conversion failed (exit code {result.returncode}): {result.stderr}")
    return False


def convert_webm_to_wav(input_file: str, output_file: str) -> str:
    """Convert WebM audio to a 16 kHz mono WAV, trying several strategies in turn.

    Order of attempts:
      1. pydub (``AudioSegment.from_file(..., format="webm")``)
      2. FFmpeg with automatic input-format detection
      3. FFmpeg with the input format forced to ``webm``
      4. Best-effort fallback: one second of 16-bit silence, so that callers
         always receive a structurally valid WAV file.

    Args:
        input_file: Path to the WebM file to convert.
        output_file: Path the WAV file is written to (overwritten if present).

    Returns:
        ``output_file``.

    Raises:
        FileNotFoundError: ``input_file`` does not exist.
        ValueError: ``input_file`` is empty.
        Exception: Whatever writing the silent fallback WAV raises, re-raised
            after being logged.
    """
    # Verify input file
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    if os.path.getsize(input_file) == 0:
        raise ValueError(f"Input file is empty: {input_file}")

    try:
        print(f"Attempting to convert {input_file} with pydub...")
        audio = AudioSegment.from_file(input_file, format="webm")
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(output_file, format="wav")
        print("Pydub conversion successful")
        return output_file
    except Exception as e:
        # Deliberately broad: any pydub failure just means "try the next strategy".
        print(f"Pydub conversion failed: {e}")

    print("Attempting direct FFmpeg conversion...")
    direct_cmd = [
        "ffmpeg", "-y",
        "-i", input_file,
        "-ar", "16000",
        "-ac", "1",
        "-vn",
        "-acodec", "pcm_s16le",
        output_file,
    ]
    if _run_ffmpeg(direct_cmd, output_file, "FFmpeg"):
        return output_file

    print("Attempting FFmpeg conversion with explicit format...")
    explicit_cmd = [
        "ffmpeg", "-y",
        "-f", "webm",
        "-i", input_file,
        "-ar", "16000",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        output_file,
    ]
    if _run_ffmpeg(explicit_cmd, output_file, "FFmpeg explicit format"):
        return output_file

    print("Creating minimal valid WAV file as fallback")
    try:
        with wave.open(output_file, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(16000)
            # 32000 bytes = 16000 16-bit frames = one second of silence
            wf.writeframes(b'\x00' * 32000)
    except Exception as e:
        print(f"Fatal error in webm to wav conversion: {e}")
        raise

    return output_file

Once installed, Montreal Forced Aligner (MFA) can easily be run from the terminal. When given a corpus (a folder containing the .WAV files and a .TXT file with the transcript), it outputs a folder with the aligned phonemes in .TextGrid format.

The command I've been running in the miniconda terminal until making the process callable in python is:
    
mfa align --clean ./input_english ./model/pretrained_models/dictionary/english_us_arpa.dict english_us_arpa ./output_english

./input_english --> folder containing the .WAV files and .TXT file with the transcript

./model/pretrained_models/dictionary/english_us_arpa.dict --> path to the dictionary file

english_us_arpa --> name of the model to use (in this case, no relative path was needed as MFA knows my environment variable folder)

./output_english --> folder where the aligned phonemes will be saved in .TextGrid format

In [None]:
# ------------------ Forced Alignment ------------------

def run_mfa_alignment(wav_path: str, transcript: str) -> str:
    """Run ``mfa align`` on one recording and return the resulting TextGrid path.

    A working directory ``./audio/<unix-seconds>/`` is created. The WAV and the
    transcript are copied into ``<unix-seconds>-corpus`` inside it, and MFA is
    told to write its output into the working directory itself.

    Args:
        wav_path: Path to the WAV file to align.
        transcript: Path to the transcript file. It is copied with
            ``shutil.copy``, so this must be a file path, not the transcript text.

    Returns:
        Path to ``<wav basename>.TextGrid`` inside the working directory.

    Raises:
        RuntimeError: ``mfa align`` exited with a non-zero status; the message
            includes MFA's captured stderr (or stdout) when available.
        FileNotFoundError: MFA finished but the expected TextGrid is missing.
    """
    import time

    # Whole-second Unix time names the per-run working directory.
    timestamp = str(int(time.time()))
    base_dir = os.path.join(".", "audio", timestamp)

    corpus_dir = os.path.join(base_dir, timestamp + "-corpus")
    os.makedirs(corpus_dir, exist_ok=True)

    shutil.copy(wav_path, os.path.join(corpus_dir, os.path.basename(wav_path)))
    shutil.copy(transcript, os.path.join(corpus_dir, os.path.basename(transcript)))

    aligned_output_dir = base_dir

    lexicon_path = "model/pretrained_models/dictionary/english_us_arpa.dict"

    mfa_align_command = [
        "mfa", "align", "--clean",
        corpus_dir,
        lexicon_path,
        "english_us_arpa",
        aligned_output_dir,
        "--debug",
    ]
    try:
        subprocess.run(
            mfa_align_command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='replace'
        )
    except subprocess.CalledProcessError as e:
        # str(e) only carries the command and exit status; the actual reason
        # is in the captured output, so surface it to the caller.
        details = (e.stderr or e.stdout or "").strip()
        message = f"Error running MFA alignment: {e}"
        if details:
            message = f"{message}\n{details}"
        raise RuntimeError(message) from e

    textgrid_name = os.path.splitext(os.path.basename(wav_path))[0] + ".TextGrid"
    textgrid_path = os.path.join(aligned_output_dir, textgrid_name)
    if not os.path.exists(textgrid_path):
        raise FileNotFoundError(f"Expected TextGrid file not found: {textgrid_path}")

    return textgrid_path

The Pronunciation Scoring Model analyzes phoneme segments from audio recordings and calculates quality scores using acoustic features such as energy, spectral characteristics, and duration. The segment boundaries (the start and end time of each phoneme) come from the Montreal Forced Aligner output.

In [None]:
# ------------------ Pronunciation Scoring Model ------------------

def score_phonemes_with_mfa(audio_path: str, phoneme_intervals: list) -> list:
    """Score each aligned phoneme from simple acoustic features of its audio segment.

    The audio is loaded at 16 kHz. For every interval the matching slice of the
    waveform is scored with a formula that depends on the phoneme class
    (vowel, stop, fricative, other), the raw score is passed through a sigmoid
    to spread values out, a small random jitter (at most 0.01 either way) is
    added, and the result is clamped to [0, 1] and mapped to a grade.

    Phoneme labels are classified on their bare ARPABET symbol: a trailing
    stress digit (e.g. "AH0", "ER1") is ignored for classification and for the
    tip lookup, but the label is returned unchanged.

    Args:
        audio_path: Path to the audio file to analyse.
        phoneme_intervals: Dicts with keys ``'phoneme'``, ``'start'`` and
            ``'end'`` (times in seconds).

    Returns:
        One dict per scored interval with keys ``'phoneme'``, ``'start'``,
        ``'end'``, ``'score'`` (float in [0, 1]), ``'grade'`` and ``'tip'``.
        Intervals that are empty or lie entirely outside the audio are
        skipped. Because of the random jitter, scores are not reproducible
        between calls.
    """
    waveform, sr = librosa.load(audio_path, sr=16000)

    vowels = frozenset((
        'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER',
        'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW',
    ))
    stops = frozenset(('P', 'B', 'T', 'D', 'K', 'G'))
    fricatives = frozenset(('F', 'V', 'TH', 'DH', 'S', 'Z', 'SH', 'ZH', 'HH'))

    # Sigmoid steepness and midpoint used to spread the raw scores out.
    alpha = 7.0
    beta = 0.5

    results = []
    for interval in phoneme_intervals:
        phoneme = interval['phoneme']
        # Strip the stress digit so "AH0" is recognised as the vowel "AH".
        base = phoneme.rstrip('0123456789')
        start, end = interval['start'], interval['end']

        # Clamp to the waveform so that an interval ending a few samples past
        # the audio (rounding) is still scored instead of silently dropped.
        start_idx = max(0, int(start * sr))
        end_idx = min(int(end * sr), len(waveform))
        if start_idx >= end_idx:
            continue

        segment = waveform[start_idx:end_idx]
        energy = np.mean(segment**2)

        # Placeholder: duration is not measured yet, so it contributes a
        # constant to every formula below.
        duration_score = 0.5

        if base in vowels:
            spectral_centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)[0].mean()
            # NOTE(review): unlike the other terms, the centroid term is not
            # capped at 1.0 — confirm whether that is intended.
            final_score = (
                0.4 * duration_score +
                0.3 * min(1.0, energy / 0.1) +
                0.3 * spectral_centroid / 4000
            )

        elif base in stops:
            # Spectral flux (mean squared frame-to-frame change) is only
            # needed for stops, so the STFT is computed here.
            spec = np.abs(librosa.stft(segment))
            if spec.shape[1] > 1:
                flux = np.mean(np.diff(spec, axis=1)**2)
            else:
                flux = 0

            # Energy of the first 30 ms (or the whole segment if shorter).
            burst_dur = min(int(0.03 * sr), len(segment))
            burst_energy = np.mean(segment[:burst_dur]**2)
            burst_score = min(1.0, burst_energy / 0.2)

            final_score = (
                0.4 * duration_score +
                0.4 * burst_score +
                0.2 * min(1.0, flux * 10)
            )

        elif base in fricatives:
            zero_crossing_rate = librosa.feature.zero_crossing_rate(segment)[0].mean()
            spectral_flatness = librosa.feature.spectral_flatness(y=segment)[0].mean()

            zcr_score = min(1.0, zero_crossing_rate / 0.2)

            final_score = (
                0.3 * duration_score +
                0.4 * zcr_score +
                0.3 * spectral_flatness
            )

        else:
            zero_crossing_rate = librosa.feature.zero_crossing_rate(segment)[0].mean()

            final_score = (
                0.4 * duration_score +
                0.3 * min(1.0, energy / 0.1) +
                0.3 * (1.0 - min(1.0, abs(0.1 - zero_crossing_rate) / 0.1))
            )

        # Apply sigmoid transformation to spread scores out more
        final_score = 1.0 / (1.0 + np.exp(-alpha * (final_score - beta)))

        # Add small random component to prevent identical scores (0.01 max variance)
        final_score += np.random.uniform(-0.01, 0.01)

        # Ensure score is within [0,1]
        final_score = min(max(final_score, 0.0), 1.0)

        if final_score < 0.4:
            grade = 'poor'
        elif final_score < 0.55:
            grade = 'borderline'
        elif final_score < 0.7:
            grade = 'good'
        elif final_score < 0.85:
            grade = 'very good'
        else:
            grade = 'excellent'

        # Tips are only attached to the three lowest grades. Try the exact
        # label first, then the stress-stripped symbol.
        if grade in ('poor', 'borderline', 'good'):
            ipa = ipa_map.get(phoneme, ipa_map.get(base, phoneme))
            tip = tips.get(ipa, '')
        else:
            tip = ''

        results.append({
            'phoneme': phoneme,
            'start': start,
            'end': end,
            'score': float(final_score),
            'grade': grade,
            'tip': tip
        })

    return results

IPA symbols and tips provide more contextual feedback to the user.
This lets the user know which sounds could be improved and how to do so.
The IPA is a phonetic transcription system that provides a standardized way to represent the sounds of spoken language. It uses a set of symbols to represent each sound, allowing for precise and consistent representation of pronunciation.

In [None]:
# ------------------ Feedback Generation ------------------

# ARPABET to IPA mapping.
# Keys are bare ARPABET symbols without stress digits, so a label such as
# "AH0" does not match a key unless the trailing digit is stripped first.
ipa_map = {
    # Stops
    "P":  "p",  "B":  "b",
    "T":  "t",  "D":  "d",
    "K":  "k",  "G":  "g",

    # Affricates
    "CH": "tʃ", "JH": "dʒ",

    # Fricatives
    "F":  "f",  "V":  "v",
    "TH": "θ", "DH": "ð",
    "S":  "s",  "Z":  "z",
    "SH": "ʃ", "ZH": "ʒ",
    "HH": "h",

    # Nasals
    "M":  "m",  "N":  "n",  "NG": "ŋ",

    # Liquids & glides
    "L":  "l",  "R":  "ɹ",
    "W":  "w",  "Y":  "j",

    # Vowels (monophthongs and diphthongs)
    "AA": "ɑ",  "AE": "æ",
    "AH": "ʌ",  "AO": "ɔ",
    "AW": "aʊ", "AY": "aɪ",
    "EH": "ɛ",  "ER": "ɝ",
    "EY": "eɪ", "IH": "ɪ",
    "IY": "i",  "OW": "oʊ",
    "OY": "ɔɪ", "UH": "ʊ",
    "UW": "u",  "AX": "ə",  # schwa

    # Additional symbols
    "AXR": "ɚ",  # r-colored schwa
}

# Learner-friendly pronunciation tips.
# Keys are IPA symbols (the values of ipa_map); each value is a single
# user-facing string: how to form the sound, example words, and a common error.
# NOTE: there is no entry for "ɚ" (ipa_map["AXR"]), so lookups for that
# symbol fall back to whatever default the caller supplies.
tips = {
    # Stops
    "p":  "Close both lips then release with a small burst. Examples: 'pen', 'cup'. Common error: Aspirating too strongly—keep it light.",
    "b":  "Close both lips and voice the release. Examples: 'bat', 'rub'. Common error: Voicing too softly—feel the vibration in your throat.",
    "t":  "Place tongue tip behind your upper teeth ridge, then release. Examples: 'top', 'cat'. Common error: Using too much aspiration—release gently.",
    "d":  "Place tongue tip behind the ridge and voice on release. Examples: 'dog', 'mad'. Common error: Dropping the tongue—keep contact until release.",
    "k":  "Raise the back of your tongue to the soft palate, then release. Examples: 'key', 'back'. Common error: Not releasing fully—feel the puff of air.",
    "g":  "Raise back of tongue, then voice on release. Examples: 'go', 'bag'. Common error: G-sound too soft—ensure vocal cords vibrate.",

    # Affricates
    "tʃ": "Start with /t/ then move into /ʃ/ in one smooth motion. Examples: 'chair', 'match'. Common error: Separating sounds—blend them.",
    "dʒ": "Start with /d/ then move into /ʒ/. Examples: 'judge', 'edge'. Common error: Leaving out the /ʒ/—feel the vibration in your throat.",

    # Fricatives
    "f":  "Touch bottom lip to upper teeth and blow. Examples: 'fan', 'life'. Common error: Voicing it—keep it voiceless.",
    "v":  "Touch bottom lip to upper teeth and voice. Examples: 'very', 'love'. Common error: Making it /w/—feel the vibration.",
    "θ": "Place tongue between teeth and blow. Examples: 'think', 'bath'. Common error: Saying /f/—feel the air between teeth.",
    "ð": "Place tongue between teeth and voice. Examples: 'this', 'breathe'. Common error: Saying /d/—look for tongue air.",
    "s":  "Place tongue close to ridge and blow. Examples: 'see', 'bus'. Common error: Rounding lips—keep them spread.",
    "z":  "Same as /s/ but voice. Examples: 'zoo', 'lazy'. Common error: Leaving out voice—feel the buzz.",
    "ʃ": "Round lips and raise tongue middle. Examples: 'ship', 'nation'. Common error: Saying /s/—protrude lips.",
    "ʒ": "Same as /ʃ/ but voice. Examples: 'measure', 'beige'. Common error: Devoicing—place fingers on throat.",
    "h":  "Open mouth slightly and exhale. Examples: 'hat', 'ahead'. Common error: Too forceful—keep it breathy.",

    # Nasals
    "m":  "Close lips and voice through nose. Examples: 'man', 'home'. Common error: Oral release—keep velum lowered.",
    "n":  "Tongue tip on ridge and voice through nose. Examples: 'no', 'ten'. Common error: Making it /d/—feel nasal buzz.",
    "ŋ": "Back tongue on soft palate and voice. Examples: 'sing', 'ring'. Common error: Adding /g/—hold tongue position.",

    # Liquids & glides
    "l":  "Tongue tip on ridge and voice. Examples: 'light', 'feel'. Common error: Velarized /l/ everywhere—use light /l/ initially.",
    "ɹ": "Curl tongue tip back without touching roof. Examples: 'red', 'sorry'. Common error: Rolling—keep it smooth.",
    "w":  "Round lips and voice. Examples: 'water', 'away'. Common error: Not rounding—pucker your lips.",
    "j":  "Raise tongue close to palate and glide. Examples: 'yes', 'beyond'. Common error: Too consonant—make it smooth.",

    # Vowels
    "i":  "Spread lips and raise tongue front-high. Examples: 'see', 'beat'. Common error: Relaxing tongue—keep it tense.",
    "ɪ": "Slightly lower and relax from /i/. Examples: 'sit', 'hid'. Common error: Stretching—keep it short.",
    "eɪ": "Start at /e/ then glide to /i/. Examples: 'say', 'they'. Common error: Not finishing glide—move to /i/.",
    "ɛ": "Lower tongue from /ɪ/. Examples: 'bed', 'head'. Common error: Closing too much—open jaw more.",
    "æ": "Open mouth wide, tongue low front. Examples: 'cat', 'hand'. Common error: Too narrow—drop jaw further.",
    "ɑ": "Open mouth wide, tongue low back. Examples: 'father', 'spa'. Common error: Raising tongue—keep it flat.",
    "ʌ": "Tongue mid, slightly back. Examples: 'cup', 'luck'. Common error: Confusing with /ə/—make it stronger.",
    "ɔ": "Round lips, tongue mid-back. Examples: 'thought', 'law'. Common error: Using /ɑ/—round lips more.",
    "oʊ": "Start /o/ then glide to /ʊ/. Examples: 'go', 'show'. Common error: Skipping glide—finish at /ʊ/.",
    "ʊ": "Relaxed /u/. Examples: 'book', 'could'. Common error: Stretching to /u/—keep it short.",
    "u":  "Round lips tightly, tongue high back. Examples: 'food', 'blue'. Common error: Not rounding—protrude lips.",
    "ə":  "Neutral schwa. Examples: 'about', 'sofa'. Common error: Emphasizing—make it very brief.",
    "ɝ": "R-colored schwa. Examples: 'her', 'bird'. Common error: Dropping /r/—curl tongue lightly.",

    # Diphthongs
    "aɪ": "Start /a/ then glide to /ɪ/. Examples: 'time', 'kite'. Common error: Too quick—complete the glide.",
    "aʊ": "Start /a/ then glide to /ʊ/. Examples: 'house', 'now'. Common error: Missing lip rounding—round at end.",
    "ɔɪ": "Start /ɔ/ then glide to /ɪ/. Examples: 'boy', 'toy'. Common error: Abrupt change—make it smooth."
}


This section contains functions for analyzing pronunciation at the word level:

extract_word_intervals: Extracts word timing information from TextGrid files, creating a list of word segments with start/end times.

map_phonemes_to_words: Associates each phoneme with its corresponding word by checking if the phoneme falls within the word's time interval.

compute_word_scores_and_feedback: Generates per-word feedback by averaging phoneme scores and identifying problematic sounds. For each word, it finds the lowest-scoring phonemes and provides targeted improvement tips using the IPA listed above. Later, they'll be able to find the IPA in the generated plot too.

The functions utilize the TextGrid files which might look like this:

```
File type = "ooTextFile"
Object class = "TextGrid"

xmin = 0 
xmax = 30.421312 
tiers? <exists> 
size = 2 
item []: 
    item [1]:
        class = "IntervalTier" 
        name = "words" 
        xmin = 0 
        xmax = 30.421312 
        intervals: size = 34 
        intervals [1]:
            xmin = 0.0 
            xmax = 3.09 
            text = "" 
        intervals [2]:
            xmin = 3.09 
            xmax = 3.32 
            text = "a" 
        intervals [3]:
            xmin = 3.32 
            xmax = 4.08 
            text = "girl" 
        intervals [4]:
            xmin = 4.08 
            xmax = 4.91 
            text = "" 
        intervals [5]:
            xmin = 4.91 
            xmax = 6.11 
            text = "planted" 
        intervals [6]:
            xmin = 6.11 
            xmax = 7.18 
            text = "" 
        intervals [7]:
            xmin = 7.18 
            xmax = 8.05 
            text = "a" 
        intervals [8]:
            xmin = 8.05 
            xmax = 8.43 
            text = "" 
        intervals [9]:
            xmin = 8.43 
            xmax = 9.83 
            text = "single" 
        intervals [10]:
            xmin = 9.83 
            xmax = 10.05 
            text = "seed" 
        intervals [11]:
            xmin = 10.05 
            xmax = 10.63 
            text = "in" 
        intervals [12]:
            xmin = 10.63 
            xmax = 10.99 
            text = "" 
        intervals [13]:
            xmin = 10.99 
            xmax = 11.65 
            text = "her" 
        intervals [14]:
            xmin = 11.65 
            xmax = 12.31 
            text = "barren" 
        intervals [15]:
            xmin = 12.31 
            xmax = 12.43 
            text = "" 
        intervals [16]:
            xmin = 12.43 
            xmax = 13.02 
            text = "garden" 
        intervals [17]:
            xmin = 13.02 
            xmax = 14.23 
            text = "" 
        intervals [18]:
            xmin = 14.23 
            xmax = 14.85 
            text = "by" 
        intervals [19]:
            xmin = 14.85 
            xmax = 16.34 
            text = "morning" 
        intervals [20]:
            xmin = 16.34 
            xmax = 16.37 
            text = "" 
        intervals [21]:
            xmin = 16.37 
            xmax = 17.55 
            text = "it" 
        intervals [22]:
            xmin = 17.55 
            xmax = 17.79 
            text = "" 
        intervals [23]:
            xmin = 17.79 
            xmax = 17.86 
            text = "had" 
        intervals [24]:
            xmin = 17.86 
            xmax = 18.06 
            text = "grown" 
        intervals [25]:
            xmin = 18.06 
            xmax = 18.27 
            text = "into" 
        intervals [26]:
            xmin = 18.27 
            xmax = 18.35 
            text = "a" 
        intervals [27]:
            xmin = 18.35 
            xmax = 18.93 
            text = "tree" 
        intervals [28]:
            xmin = 18.93 
            xmax = 20.76 
            text = "" 
        intervals [29]:
            xmin = 20.76 
            xmax = 21.18 
            text = "filled" 
        intervals [30]:
            xmin = 21.18 
            xmax = 21.4 
            text = "with" 
        intervals [31]:
            xmin = 21.4 
            xmax = 21.88 
            text = "golden" 
        intervals [32]:
            xmin = 21.88 
            xmax = 21.95 
            text = "" 
        intervals [33]:
            xmin = 21.95 
            xmax = 22.69 
            text = "fruit" 
        intervals [34]:
            xmin = 22.69 
            xmax = 30.421312 
            text = "" 
    item [2]:
        class = "IntervalTier" 
        name = "phones" 
        xmin = 0 
        xmax = 30.421312 
        intervals: size = 89 
        intervals [1]:
            xmin = 0.0 
            xmax = 3.09 
            text = "" 
        intervals [2]:
            xmin = 3.09 
            xmax = 3.32 
            text = "AH0" 
        intervals [3]:
            xmin = 3.32 
            xmax = 3.38 
            text = "G" 
        intervals [4]:
            xmin = 3.38 
            xmax = 3.95 
            text = "ER1" 
        intervals [5]:
            xmin = 3.95 
            xmax = 4.08 
            text = "L" 
        intervals [6]:
            xmin = 4.08 
            xmax = 4.91 
            text = "" 
        intervals [7]:
            xmin = 4.91 
            xmax = 4.94 
            text = "P" 
        intervals [8]:
            xmin = 4.94 
            xmax = 4.97 
            text = "L" 
        intervals [9]:
            xmin = 4.97 
            xmax = 5.71 
            text = "AE1" 
        intervals [10]:
            xmin = 5.71 
            xmax = 5.74 
            text = "N" 
        intervals [11]:
            xmin = 5.74 
            xmax = 6.04 
            text = "AH0" 
        intervals [12]:
            xmin = 6.04 
            xmax = 6.11 
            text = "D" 
        intervals [13]:
            xmin = 6.11 
            xmax = 7.18 
            text = "" 
        intervals [14]:
            xmin = 7.18 
            xmax = 8.05 
            text = "AH0" 
        intervals [15]:
            xmin = 8.05 
            xmax = 8.43 
            text = "" 
        intervals [16]:
            xmin = 8.43 
            xmax = 9.34 
            text = "S" 
        intervals [17]:
            xmin = 9.34 
            xmax = 9.59 
            text = "IH1" 
        intervals [18]:
            xmin = 9.59 
            xmax = 9.62 
            text = "NG" 
        intervals [19]:
            xmin = 9.62 
            xmax = 9.65 
            text = "G" 
        intervals [20]:
            xmin = 9.65 
            xmax = 9.77 
            text = "AH0" 
        intervals [21]:
            xmin = 9.77 
            xmax = 9.83 
            text = "L" 
        intervals [22]:
            xmin = 9.83 
            xmax = 9.99 
            text = "S" 
        intervals [23]:
            xmin = 9.99 
            xmax = 10.02 
            text = "IY1" 
        intervals [24]:
            xmin = 10.02 
            xmax = 10.05 
            text = "D" 
        intervals [25]:
            xmin = 10.05 
            xmax = 10.59 
            text = "IH1" 
        intervals [26]:
            xmin = 10.59 
            xmax = 10.63 
            text = "N" 
        intervals [27]:
            xmin = 10.63 
            xmax = 10.99 
            text = "" 
        intervals [28]:
            xmin = 10.99 
            xmax = 11.02 
            text = "HH" 
        intervals [29]:
            xmin = 11.02 
            xmax = 11.65 
            text = "ER1" 
        intervals [30]:
            xmin = 11.65 
            xmax = 11.75 
            text = "B" 
        intervals [31]:
            xmin = 11.75 
            xmax = 11.87 
            text = "EH1" 
        intervals [32]:
            xmin = 11.87 
            xmax = 11.99 
            text = "R" 
        intervals [33]:
            xmin = 11.99 
            xmax = 12.11 
            text = "AH0" 
        intervals [34]:
            xmin = 12.11 
            xmax = 12.31 
            text = "N" 
        intervals [35]:
            xmin = 12.31 
            xmax = 12.43 
            text = "" 
        intervals [36]:
            xmin = 12.43 
            xmax = 12.48 
            text = "G" 
        intervals [37]:
            xmin = 12.48 
            xmax = 12.56 
            text = "AA1" 
        intervals [38]:
            xmin = 12.56 
            xmax = 12.7 
            text = "R" 
        intervals [39]:
            xmin = 12.7 
            xmax = 12.76 
            text = "D" 
        intervals [40]:
            xmin = 12.76 
            xmax = 12.94 
            text = "AH0" 
        intervals [41]:
            xmin = 12.94 
            xmax = 13.02 
            text = "N" 
        intervals [42]:
            xmin = 13.02 
            xmax = 14.23 
            text = "" 
        intervals [43]:
            xmin = 14.23 
            xmax = 14.39 
            text = "B" 
        intervals [44]:
            xmin = 14.39 
            xmax = 14.85 
            text = "AY1" 
        intervals [45]:
            xmin = 14.85 
            xmax = 14.88 
            text = "M" 
        intervals [46]:
            xmin = 14.88 
            xmax = 15.51 
            text = "AO1" 
        intervals [47]:
            xmin = 15.51 
            xmax = 15.67 
            text = "R" 
        intervals [48]:
            xmin = 15.67 
            xmax = 15.76 
            text = "N" 
        intervals [49]:
            xmin = 15.76 
            xmax = 16.09 
            text = "IH0" 
        intervals [50]:
            xmin = 16.09 
            xmax = 16.34 
            text = "NG" 
        intervals [51]:
            xmin = 16.34 
            xmax = 16.37 
            text = "" 
        intervals [52]:
            xmin = 16.37 
            xmax = 17.52 
            text = "IH0" 
        intervals [53]:
            xmin = 17.52 
            xmax = 17.55 
            text = "T" 
        intervals [54]:
            xmin = 17.55 
            xmax = 17.79 
            text = "" 
        intervals [55]:
            xmin = 17.79 
            xmax = 17.82 
            text = "HH" 
        intervals [56]:
            xmin = 17.82 
            xmax = 17.83 
            text = "AE1" 
        intervals [57]:
            xmin = 17.83 
            xmax = 17.86 
            text = "D" 
        intervals [58]:
            xmin = 17.86 
            xmax = 17.89 
            text = "G" 
        intervals [59]:
            xmin = 17.89 
            xmax = 17.97 
            text = "R" 
        intervals [60]:
            xmin = 17.97 
            xmax = 18.01 
            text = "OW1" 
        intervals [61]:
            xmin = 18.01 
            xmax = 18.06 
            text = "N" 
        intervals [62]:
            xmin = 18.06 
            xmax = 18.11 
            text = "IH0" 
        intervals [63]:
            xmin = 18.11 
            xmax = 18.17 
            text = "N" 
        intervals [64]:
            xmin = 18.17 
            xmax = 18.21 
            text = "T" 
        intervals [65]:
            xmin = 18.21 
            xmax = 18.27 
            text = "AH0" 
        intervals [66]:
            xmin = 18.27 
            xmax = 18.35 
            text = "AH0" 
        intervals [67]:
            xmin = 18.35 
            xmax = 18.38 
            text = "T" 
        intervals [68]:
            xmin = 18.38 
            xmax = 18.41 
            text = "R" 
        intervals [69]:
            xmin = 18.41 
            xmax = 18.93 
            text = "IY1" 
        intervals [70]:
            xmin = 18.93 
            xmax = 20.76 
            text = "" 
        intervals [71]:
            xmin = 20.76 
            xmax = 20.79 
            text = "F" 
        intervals [72]:
            xmin = 20.79 
            xmax = 20.91 
            text = "IH1" 
        intervals [73]:
            xmin = 20.91 
            xmax = 20.95 
            text = "L" 
        intervals [74]:
            xmin = 20.95 
            xmax = 21.18 
            text = "D" 
        intervals [75]:
            xmin = 21.18 
            xmax = 21.21 
            text = "W" 
        intervals [76]:
            xmin = 21.21 
            xmax = 21.24 
            text = "IH0" 
        intervals [77]:
            xmin = 21.24 
            xmax = 21.4 
            text = "DH" 
        intervals [78]:
            xmin = 21.4 
            xmax = 21.43 
            text = "G" 
        intervals [79]:
            xmin = 21.43 
            xmax = 21.56 
            text = "OW1" 
        intervals [80]:
            xmin = 21.56 
            xmax = 21.79 
            text = "L" 
        intervals [81]:
            xmin = 21.79 
            xmax = 21.82 
            text = "D" 
        intervals [82]:
            xmin = 21.82 
            xmax = 21.85 
            text = "AH0" 
        intervals [83]:
            xmin = 21.85 
            xmax = 21.88 
            text = "N" 
        intervals [84]:
            xmin = 21.88 
            xmax = 21.95 
            text = "" 
        intervals [85]:
            xmin = 21.95 
            xmax = 22.37 
            text = "F" 
        intervals [86]:
            xmin = 22.37 
            xmax = 22.42 
            text = "R" 
        intervals [87]:
            xmin = 22.42 
            xmax = 22.65 
            text = "UW1" 
        intervals [88]:
            xmin = 22.65 
            xmax = 22.69 
            text = "T" 
        intervals [89]:
            xmin = 22.69 
            xmax = 30.421312 
            text = "" 
```


In [None]:

def extract_word_intervals(tg_path):
    """Read the word tier of a TextGrid file as a list of timed words.

    The first tier named 'words' or 'word' (case-insensitive) is used.
    Intervals whose label is blank after stripping are left out.

    Returns:
        A list of ``{'word', 'start', 'end'}`` dicts in tier order.

    Raises:
        ValueError: the TextGrid has no word tier.
    """
    grid = TextGrid()
    grid.read(tg_path)

    word_tier = None
    for tier in grid.tiers:
        if tier.name.lower() in ('words', 'word'):
            word_tier = tier
            break
    if word_tier is None:
        raise ValueError("No 'words' tier found in TextGrid.")

    words = []
    for interval in word_tier.intervals:
        label = interval.mark.strip()
        if not label:
            continue
        words.append({
            'word': label,
            'start': interval.minTime,
            'end': interval.maxTime,
        })
    return words


def map_phonemes_to_words(phoneme_results, word_intervals):
    """Group phoneme results under the word whose interval contains their midpoint.

    Parameters:
    - phoneme_results: List of dicts with at least 'start' and 'end'
    - word_intervals: List of dicts with 'word', 'start' and 'end'

    Returns:
    - Dict mapping word text to the list of phoneme dicts assigned to it.
      The dict is keyed by word text, so repeated occurrences of the same
      word share a single list. Phonemes whose midpoint lies in no word
      interval are left out.
    """
    grouped = {}
    for entry in word_intervals:
        grouped[entry['word']] = []

    for phon in phoneme_results:
        centre = (phon['start'] + phon['end']) / 2.0
        # First interval (in list order) that contains the midpoint wins
        owner = next(
            (entry for entry in word_intervals
             if entry['start'] <= centre <= entry['end']),
            None,
        )
        if owner is not None:
            grouped[owner['word']].append(phon)

    return grouped


def compute_word_scores_and_feedback(word_map, ipa_map, tips):
    """Build per-word average scores and feedback for the weakest phonemes.

    Parameters:
    - word_map: Dict mapping word text to a list of phoneme dicts, each with
      'phoneme' and 'score'
    - ipa_map: Dict translating a phoneme label to the symbol shown to the
      user; labels missing from it are shown unchanged
    - tips: Dict mapping that symbol to a tip string (empty when missing)

    Returns:
    - List of dicts with 'word', 'avg_score' and 'feedback' (up to two
      sentences, for the two lowest-scoring phonemes), sorted by ascending
      'avg_score'. Words with no phonemes are omitted.
    """
    def describe(word, phon):
        # One feedback sentence for a single phoneme of the given word
        symbol = ipa_map.get(phon['phoneme'], phon['phoneme'])
        advice = tips.get(symbol, '')
        return f"Your /{symbol}/ sound in '{word}' scored {phon['score']:.2f}. {advice}"

    entries = []
    for word, phons in word_map.items():
        if phons:
            ranked = sorted(phons, key=lambda item: item['score'])
            entries.append({
                'word': word,
                'avg_score': float(np.mean([item['score'] for item in phons])),
                'feedback': [describe(word, item) for item in ranked[:2]],
            })

    # Lowest-scoring words come first
    return sorted(entries, key=lambda entry: entry['avg_score'])

An easy-to-understand plot that's displayed to the user in the front end. It consists of color-graded pillars for each phoneme they've pronounced across the entire sentence. The phonemes are written on top of the pillars, so the user can review the expected vs. actual pronunciation. 

In [None]:
# ------------------- Plots -------------------

def create_phoneme_timeline(results, base_dir):
    """Draw a color-coded timeline of phoneme grades and save it as a PNG.

    Parameters:
    - results: List of phoneme dicts with 'start', 'end', 'phoneme' and
      'grade'; every grade must be one of the five labels in the color table
    - base_dir: Path whose parent directory (os.path.dirname) receives the image

    Returns:
    - Path of the saved "phoneme_timeline.png"
    """
    grade_colors = {
        'poor': 'red',
        'borderline': 'orange',
        'good': 'yellow',
        'very good': 'lightgreen',
        'excellent': 'green'
    }

    plt.figure(figsize=(12, 4))

    # One horizontal bar per phoneme, with its label centered on the bar
    for segment in results:
        begin = segment['start']
        span = segment['end'] - begin
        plt.barh(0, span, left=begin,
                 height=0.5, color=grade_colors[segment['grade']], alpha=0.7)
        plt.text(begin + span/2, 0, segment['phoneme'],
                 ha='center', va='center', fontweight='bold')

    # Legend swatches, one per grade
    swatches = [plt.Rectangle((0,0),1,1, color=shade) for shade in grade_colors.values()]
    plt.legend(swatches, grade_colors.keys(), loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=4)

    plt.yticks([])
    plt.xlabel('Time (seconds)')
    plt.title('Pronunciation Quality Timeline')

    # Write the figure to disk and close it
    output_path = os.path.join(os.path.dirname(base_dir), "phoneme_timeline.png")
    plt.savefig(output_path)
    plt.close()

    return output_path

Set up the pretrained Facebook model (Wav2Vec2).

In [None]:
def initialize_pretrained_model():
    """Load the Wav2Vec2 processor and CTC model.

    Returns:
    - (processor, model) on success, or (None, None) if loading raised;
      the error is printed rather than propagated
    """
    print("Initializing pretrained pronunciation model...")

    checkpoint = "facebook/wav2vec2-large-960h-lv60-self"

    try:
        loaded = (
            Wav2Vec2Processor.from_pretrained(checkpoint),
            Wav2Vec2ForCTC.from_pretrained(checkpoint),
        )
    except Exception as err:
        print(f"Error loading pretrained model: {err}")
        return None, None

    print(f"Successfully loaded {checkpoint}")
    return loaded

# Module-level handles for the Wav2Vec2 processor and model; both stay None
# until init_models() manages to load them.
wav2vec_processor = wav2vec_model = None


def init_models():
    """Load the Wav2Vec2 processor and model into the module globals.

    Does nothing when both globals are already set. Otherwise both are
    replaced with whatever initialize_pretrained_model() returns.
    """
    global wav2vec_processor, wav2vec_model
    if wav2vec_processor is not None and wav2vec_model is not None:
        return
    wav2vec_processor, wav2vec_model = initialize_pretrained_model()

In [None]:
# Blend weights for the final score: rule-based score vs. Wav2Vec2 confidence.
_RULE_BASED_WEIGHT = 0.4
_MODEL_CONFIDENCE_WEIGHT = 0.6

# Sample rate the audio is resampled to before it is fed to Wav2Vec2.
_WAV2VEC_SAMPLE_RATE = 16000


def _grade_for_score(score):
    """Map a score in [0, 1] to one of the five grade labels (0.15-wide bands from 0.4)."""
    if score < 0.4:
        return 'poor'
    if score < 0.55:
        return 'borderline'
    if score < 0.7:
        return 'good'
    if score < 0.85:
        return 'very good'
    return 'excellent'


def enhanced_score_phonemes(audio_path, phoneme_intervals):
    """
    Enhanced scoring that combines rule-based scores with pretrained model confidence

    Parameters:
    - audio_path: Path to the audio file
    - phoneme_intervals: List of phoneme intervals from MFA

    Returns:
    - List of phoneme score dicts. Entries start as the dicts returned by
      score_phonemes_with_mfa (which must carry 'start', 'end' and 'score').
      When at least one Wav2Vec2 frame falls inside the phoneme, 'score' and
      'grade' are replaced by the blended values and a 'confidence' key is
      added; otherwise the rule-based entry is copied unchanged.
      If the model is unavailable or anything in the model path raises, the
      rule-based scores are returned as-is.
    """
    # Initialize models if needed
    init_models()

    # Rule-based scores are both the blend input and the fallback result
    base_scores = score_phonemes_with_mfa(audio_path, phoneme_intervals)

    # If model loading failed, return base scores
    if wav2vec_processor is None or wav2vec_model is None:
        print("Warning: Using only rule-based scoring as pretrained model failed to load")
        return base_scores

    try:
        # Load audio, resample to 16kHz and down-mix to mono if needed
        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != _WAV2VEC_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(sample_rate, _WAV2VEC_SAMPLE_RATE)
            waveform = resampler(waveform)
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Inference only: no gradients needed anywhere in this section
        with torch.no_grad():
            inputs = wav2vec_processor(
                waveform.squeeze().numpy(),
                sampling_rate=_WAV2VEC_SAMPLE_RATE,
                return_tensors="pt",
            )
            logits = wav2vec_model(**inputs).logits

            # Per-frame confidence = probability of the most likely token
            probs = torch.nn.functional.softmax(logits, dim=-1)
            confidences = torch.max(probs, dim=-1)[0]
            confidence_values = confidences.squeeze().numpy()

        # Frame rate used to convert phoneme times (seconds) to frame indices
        num_frames = len(confidence_values)
        frames_per_second = num_frames / (waveform.shape[1] / _WAV2VEC_SAMPLE_RATE)

        enhanced_results = []
        for phoneme in base_scores:
            frame_start = max(0, int(phoneme['start'] * frames_per_second))
            # frame_end is an exclusive slice bound, so clamp it to num_frames
            # (not num_frames - 1) or the final frame could never be scored.
            frame_end = min(num_frames, int(phoneme['end'] * frames_per_second))

            enhanced_result = phoneme.copy()
            if frame_start < frame_end:
                # Mean confidence over the frames covered by this phoneme
                phoneme_confidence = np.mean(confidence_values[frame_start:frame_end])

                # Weight: 40% rule-based, 60% model confidence
                enhanced_score = (
                    _RULE_BASED_WEIGHT * phoneme['score']
                    + _MODEL_CONFIDENCE_WEIGHT * phoneme_confidence
                )

                # Ensure score is in [0,1] range
                enhanced_score = min(max(enhanced_score, 0.0), 1.0)

                enhanced_result['score'] = float(enhanced_score)
                enhanced_result['grade'] = _grade_for_score(enhanced_score)
                enhanced_result['confidence'] = float(phoneme_confidence)
            # else: no frame falls inside the phoneme; keep the rule-based entry

            enhanced_results.append(enhanced_result)

        return enhanced_results

    except Exception as e:
        # Best-effort enhancement: any failure falls back to rule-based scores
        print(f"Error in enhanced scoring: {e}")
        return base_scores

Final function tying all the previous functions together. It takes the audio file, the TextGrid file, and the dictionary as inputs. It runs the Montreal Forced Aligner to align the audio with the transcript, extracts phoneme intervals, and computes scores and feedback for each phoneme.

In [None]:

def process_audio_enhanced(wav_path: str, transcript: str, base_dir=None):
    """Enhanced version of process_audio that uses the pretrained model for scoring

    Runs MFA alignment, scores every non-empty interval of the 'phones' tier
    with enhanced_score_phonemes, groups the scores by word and renders the
    phoneme timeline image.

    Parameters:
    - wav_path: Path to the WAV file to score
    - transcript: Passed unchanged to run_mfa_alignment and echoed back in
      the result. NOTE(review): score_route passes the path of a transcript
      file here rather than the text - confirm which one run_mfa_alignment
      expects.
    - base_dir: Accepted for interface compatibility; currently unused. The
      timeline image is written next to the TextGrid returned by
      run_mfa_alignment.

    Returns:
    - Dict with 'phoneme_feedback', 'phoneme_timeline' (image path),
      'word_feedback' and 'transcript'

    Raises:
    - ValueError: if the TextGrid has no 'phones' tier, or no 'words'/'word'
      tier (raised by extract_word_intervals)
    """
    # Select the non-interactive backend before any figure is drawn
    import matplotlib
    matplotlib.use('Agg')

    tg_path = run_mfa_alignment(wav_path, transcript)
    tg = TextGrid()
    tg.read(tg_path)

    phoneme_tier = next((t for t in tg.tiers if t.name.lower() == 'phones'), None)
    if phoneme_tier is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError("No 'phones' tier found in TextGrid.")

    # Extract phoneme intervals, skipping blank labels
    phoneme_intervals = [
        {'phoneme': interval.mark.strip(), 'start': interval.minTime, 'end': interval.maxTime}
        for interval in phoneme_tier.intervals if interval.mark.strip()
    ]

    results = enhanced_score_phonemes(wav_path, phoneme_intervals)

    word_interval = extract_word_intervals(tg_path)
    word_map = map_phonemes_to_words(results, word_interval)
    word_feedback = compute_word_scores_and_feedback(word_map, ipa_map, tips)

    # create_phoneme_timeline saves into the directory containing tg_path
    timeline_path = create_phoneme_timeline(results, tg_path)

    return {
        'phoneme_feedback': results,
        'phoneme_timeline': timeline_path,
        'word_feedback': word_feedback,
        'transcript': transcript,
    }

REST API USING FLASK TO SERVE THE FRONT END

In [None]:
# Flask application object; the routes below are registered on it.
app = Flask(__name__)

# Enable CORS on every route for requests from any origin.
CORS(app, resources={r"/*": {"origins": "*"}})

@app.route('/score', methods=['POST'])
def score_route():
    """Score an uploaded recording against a transcript.

    Expects a multipart form with:
    - 'audio': the recording (.mp3 and .webm are converted to WAV; anything
      else is passed on as-is)
    - 'transcript': the expected text
    - 'use_enhanced': optional, defaults to 'true'; enhanced scoring is the
      only mode this route implements

    Returns:
    - 200 with the dict produced by process_audio_enhanced
    - 400 when the audio or transcript is missing, or use_enhanced is not 'true'
    - 500 with the error message when processing raises
    """
    temp_dir = None
    try:
        # Check for uploaded audio file
        if 'audio' not in request.files:
            return jsonify({"error": "Audio file is required"}), 400

        # Get the uploaded audio file and transcript text
        audio_file = request.files['audio']
        use_enhanced = request.form.get('use_enhanced', 'true').lower() == 'true'

        # Collapse newlines and repeated whitespace into single spaces so the
        # transcript is one line; a whitespace-only transcript counts as missing.
        transcript = ' '.join((request.form.get('transcript') or '').split())
        if not transcript:
            return jsonify({"error": "Transcript text is required"}), 400

        if not use_enhanced:
            # No other scoring path exists in this route; fail explicitly
            # instead of crashing later on an unassigned result.
            return jsonify({"error": "Only enhanced scoring is supported (use_enhanced=true)"}), 400

        print(f"Received audio file: {audio_file.filename}")

        # The filename is client-controlled: keep only its final component so
        # it cannot point outside the temporary directory.
        safe_name = os.path.basename((audio_file.filename or '').replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            safe_name = 'audio'

        # Save the uploaded audio file to a temporary directory
        temp_dir = tempfile.mkdtemp()
        audio_path = os.path.join(temp_dir, safe_name)
        audio_file.save(audio_path)

        # Convert MP3/WEBM to WAV if necessary. The suffix is cut off by
        # length so the match is case-insensitive and only the trailing
        # extension is replaced.
        lowered_path = audio_path.lower()
        if lowered_path.endswith('.mp3'):
            print("Converting MP3 to WAV...")
            wav_path = convert_mp3_to_wav(audio_path, audio_path[:-len('.mp3')] + '.wav')
            print(f"Converted wav_path: {wav_path}")
        elif lowered_path.endswith('.webm'):
            print("Converting WEBM to WAV...")
            wav_path = convert_webm_to_wav(audio_path, audio_path[:-len('.webm')] + '.wav')
            print(f"Converted wav_path: {wav_path}")
        else:
            wav_path = audio_path

        # Save the transcript text to a temporary file
        transcript_path = os.path.join(temp_dir, "transcript.txt")
        with open(transcript_path, "w", encoding="utf-8") as f:
            f.write(transcript)

        print("Expected content of audio file:" + transcript)

        # Log before processing audio
        print("Processing audio...")

        res = process_audio_enhanced(wav_path, transcript_path, base_dir=None)

        # Log the result
        print("Processing complete.")

        # Return the result
        return jsonify(res), 200

    except Exception as e:
        # Top-level boundary: log the error and report it to the client
        print("Error occurred:", str(e))
        return jsonify({"error": str(e)}), 500

    finally:
        # Clean up temporary files on success and on failure alike
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)

@app.route('/get-timeline', methods=['GET'])
def get_timeline():
    """Serve the phoneme timeline image based on the provided path.

    Query parameters:
    - path: Path (relative or absolute) to a "phoneme_timeline.png" file

    Returns:
    - The PNG image on success
    - 400 when no path is given
    - 403 when the path does not name a phoneme timeline image
    - 404 when the file does not exist
    """
    # Get the path from the query parameter
    relative_path = request.args.get('path')

    if not relative_path:
        return jsonify({"error": "No path provided"}), 400

    # Construct the absolute path
    absolute_path = os.path.abspath(relative_path)

    # The path is client-controlled. Only files with the name that
    # create_phoneme_timeline writes may be served, so this endpoint cannot
    # be used to read arbitrary files from the server.
    if os.path.basename(absolute_path) != "phoneme_timeline.png":
        return jsonify({"error": "Only phoneme timeline images can be served"}), 403

    # Serve regular files only (not directories)
    if os.path.isfile(absolute_path):
        return send_file(absolute_path, mimetype='image/png')

    return jsonify({"error": f"File not found: {relative_path}"}), 404
    
"""
timeline URL request: localhost:5000/get-timeline?path=phoneme_timeline.png
"""

if __name__ == '__main__':
    # Load the Wav2Vec2 processor/model before serving; init_models() is
    # also called on each scoring request and is a no-op once both are set.
    init_models()
    # Listen on all network interfaces, port 5000.
    app.run(host='0.0.0.0', port=5000)