<a href="https://colab.research.google.com/github/DuongVinh2609/pronunciation-assessment/blob/main/REALTIME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Install required libraries
!pip install torch torchvision torchaudio transformers librosa numpy tqdm scipy matplotlib seaborn scikit-learn openai-whisper

Mounted at /content/drive
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.

In [None]:
import os
import torch
import torchaudio
import librosa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model, AutoTokenizer, AutoModel
from torch import nn
from tqdm import tqdm
import json
import logging
import whisper

In [None]:
# Configure logging: INFO level, "<LEVEL>: <message>" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Configuration class
class Config:
    """Paths and feature-extraction constants shared by the cells of this notebook."""
    DATA_DIR = "/content/drive/MyDrive/speechocean762"  # dataset root on the mounted Drive
    WAVE_DIR = os.path.join(DATA_DIR, "WAVE")
    PREPROCESS_DIR = os.path.join(DATA_DIR, "preprocessed")  # the model checkpoint is read from here
    RESOURCE_DIR = os.path.join(DATA_DIR, "resource")
    SAMPLE_RATE = 16000  # Hz; input audio is resampled to this rate in process_audio
    MFCC_DIM = 13  # n_mfcc for librosa and input channels of the MFCC conv net
    WAV2VEC_DIM = 768  # input width of the model's wav2vec projection layer
    SPEAKER_EMB_DIM = 16  # width of the speaker embedding table
    MAX_MFCC_LEN = 450  # MFCC frame count every utterance is padded/truncated to
    MAX_PHONES = 10  # number of phone slots in the phone feature tensor and mask
    MAX_WORDS = 10  # number of word slots in the word feature tensor and mask
    MFCC_HOP_LENGTH = 160  # hop_length passed to librosa.feature.mfcc
    MFCC_N_FFT = 400  # n_fft passed to librosa.feature.mfcc
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available
    MAX_SEQ_LEN = 55  # number of positions in the model's sinusoidal position table

logger.info(f"Using device: {Config.DEVICE}")

In [None]:
# Load the pretrained models; both encoders are moved to Config.DEVICE and put in eval mode.

# Acoustic side: wav2vec 2.0 processor and encoder.
WAV2VEC_PROCESSOR = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-100h")
WAV2VEC_MODEL = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-100h")
WAV2VEC_MODEL = WAV2VEC_MODEL.to(Config.DEVICE).eval()

# Text side: BERT tokenizer and encoder.
TEXT_TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
TEXT_ENCODER = AutoModel.from_pretrained("bert-base-uncased")
TEXT_ENCODER = TEXT_ENCODER.to(Config.DEVICE).eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-100h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
class TransformerEncoderModule(nn.Module):
    """LayerNorm followed by a stack of batch-first Transformer encoder layers.

    Args:
        dim: Model width; the feed-forward width is ``4 * dim``.
        nhead: Number of attention heads.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability inside each encoder layer.
    """

    def __init__(self, dim, nhead=8, num_layers=2, dropout=0.1):
        super().__init__()
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=dim,
                nhead=nhead,
                dim_feedforward=dim * 4,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, mask=None):
        """Encode ``x``, optionally ignoring positions where ``mask`` is False.

        Args:
            x: Input tensor; dimension 1 is treated as the sequence axis.
            mask: Optional boolean tensor, True for valid positions. It is
                right-padded with False or truncated so that its second
                dimension matches ``x.size(1)``.

        Returns:
            The encoded tensor, or ``x`` unchanged when a mask is given but
            contains no True entry.
        """
        # No mask: every position takes part in attention.
        if mask is None:
            return self.transformer(self.norm(x), src_key_padding_mask=None)

        # Nothing is marked valid, so skip the encoder altogether.
        if not mask.any():
            logger.warning("Empty mask, returning input tensor")
            return x

        seq_len = x.size(1)
        shortfall = seq_len - mask.size(1)
        if shortfall > 0:
            # Mask is shorter than the sequence: the extra positions count as padding.
            filler = torch.zeros((mask.size(0), shortfall), dtype=mask.dtype, device=mask.device)
            mask = torch.cat([mask, filler], dim=1)
        elif shortfall < 0:
            mask = mask[:, :seq_len]

        # PyTorch expects True at positions to ignore, hence the inversion.
        return self.transformer(self.norm(x), src_key_padding_mask=~mask)

class EnhancedPronunciationModel(nn.Module):
    """Multi-task pronunciation scorer producing utterance-level and word-level outputs.

    The model builds a token sequence of ``num_cls_tokens`` learned CLS tokens,
    one token per phone slot and one token per word slot, adds a fixed
    sinusoidal position encoding, and runs it through a Transformer encoder.
    Each CLS token feeds one utterance-level head (accuracy, fluency,
    completeness, prosody, overall); the word tokens feed the word head.

    Args:
        num_speakers: Number of rows in the speaker embedding table.
    """

    def __init__(self, num_speakers):
        super().__init__()
        self.hidden_dim = 256
        self.phone_dim = 64
        self.mfcc_dim = 64
        self.wav2vec_dim = 256
        self.prosodic_dim = 32
        self.extra_dim = 32
        self.text_dim = 256
        self.speaker_emb_dim = Config.SPEAKER_EMB_DIM
        self.num_cls_tokens = 5

        # MFCC sequence -> single utterance vector (conv stack + global average pool).
        self.mfcc_net = nn.Sequential(
            nn.Conv1d(Config.MFCC_DIM, 64, 5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 128, 3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(128, self.mfcc_dim),
            nn.ReLU()
        )
        self.wav2vec_net = nn.Sequential(
            nn.Linear(Config.WAV2VEC_DIM, self.wav2vec_dim),
            nn.LayerNorm(self.wav2vec_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.prosodic_net = nn.Sequential(
            nn.Linear(7, self.prosodic_dim),
            nn.LayerNorm(self.prosodic_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.extra_proj = nn.Sequential(
            nn.Linear(5, self.extra_dim),  # 5 inputs: pause_ratio, speaking_rate, energy, zcr, word_coverage
            nn.LayerNorm(self.extra_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.text_proj = nn.Sequential(
            nn.Linear(768, self.text_dim),
            nn.LayerNorm(self.text_dim),
            nn.GELU(),
            nn.Dropout(0.2)
        )
        self.spk_emb = nn.Embedding(num_speakers, self.speaker_emb_dim)
        self.phone_enc = nn.Linear(2, self.phone_dim)
        # Projects encoded phones to the Transformer width. This layer used to be
        # constructed inside forward(), which produced fresh random weights on every
        # call. Registering it here makes inference deterministic and lets the
        # weights be saved. Checkpoints written before this change contain no
        # `phone_proj.*` entries, so they must be loaded with strict=False.
        self.phone_proj = nn.Linear(self.phone_dim, self.hidden_dim)
        self.word_comb = nn.Linear(self.phone_dim + 2 + self.mfcc_dim + self.wav2vec_dim, self.hidden_dim)
        self.cls_tokens = nn.Parameter(torch.randn(1, self.num_cls_tokens, self.hidden_dim))
        nn.init.trunc_normal_(self.cls_tokens, std=0.02)
        # Fixed (non-trainable) position table, stored as a parameter so it keeps
        # the `pos_embed` key in the state dict.
        self.pos_embed = nn.Parameter(
            self._get_sinusoid_encoding(Config.MAX_SEQ_LEN, self.hidden_dim),
            requires_grad=False
        )
        self.transformer = TransformerEncoderModule(self.hidden_dim, nhead=8, num_layers=4, dropout=0.2)

        self.word_head = nn.Sequential(
            nn.Linear(self.hidden_dim, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )
        self.acc_head = nn.Linear(self.hidden_dim, 1)
        self.flu_head = nn.Linear(self.hidden_dim, 1)
        self.com_head = nn.Sequential(
            nn.Linear(self.hidden_dim, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
        self.pro_head = nn.Linear(self.hidden_dim, 1)
        self.ovr_head = nn.Linear(self.hidden_dim, 1)

    def _get_sinusoid_encoding(self, seq_len, d_model):
        """Build a ``(1, seq_len, d_model)`` sinusoidal position table.

        Even columns hold sines and odd columns hold cosines of the scaled
        position. ``d_model`` must be even, otherwise the cosine assignment
        has a column-count mismatch.
        """
        position = torch.arange(seq_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    def forward(self, x):
        """Score one utterance.

        Args:
            x: Feature dict. Keys read here: ``mfcc`` (batch, frames, MFCC_DIM),
                ``wav2vec``, ``phone_features``, ``phone_mask``, ``word_mask``,
                ``text_embed``, ``prosodic``, ``pause_ratio``, ``speaking_rate``,
                ``energy``, ``zcr``, ``word_coverage`` and ``speaker_id``. The
                dict is also handed to ``extract_word_level_features_from_scores``,
                which reads further keys from it.

        Returns:
            dict with
            ``'utterance'``: tensor whose last dimension holds
            [accuracy, fluency, completeness, prosody, overall]. Completeness is
            ``sigmoid * 1.2``; the other four are raw head outputs.
            ``'word_scores'``: raw word-head outputs, one per word slot.
        """
        logger.debug(f"Input mfcc shape: {x['mfcc'].shape}")
        mfcc = x['mfcc'].permute(0, 2, 1)  # Conv1d wants (batch, channels, frames)
        logger.debug(f"Permuted mfcc shape: {mfcc.shape}")
        mfcc_feat = self.mfcc_net(mfcc)
        logger.debug(f"mfcc_feat shape: {mfcc_feat.shape}")
        wav2vec = self.wav2vec_net(x['wav2vec'])
        phone = self.phone_enc(x['phone_features'])
        phone_mask = x['phone_mask']
        word_mask = x['word_mask']

        # NOTE(review): the text, prosodic, extra and speaker branches below are
        # evaluated but their outputs never reach the token sequence. They are kept
        # so the required input keys stay the same -- confirm whether they were
        # meant to be fused into the sequence.
        text = self.text_proj(x['text_embed'])
        prosodic = self.prosodic_net(x['prosodic'])
        extra_tensors = [
            torch.atleast_2d(x['pause_ratio']),
            torch.atleast_2d(x['speaking_rate']),
            torch.atleast_2d(x['energy']),
            torch.atleast_2d(x['zcr']),
            torch.atleast_2d(x['word_coverage'])
        ]
        logger.debug(f"Extra tensors shapes: {[t.shape for t in extra_tensors]}")
        extra = self.extra_proj(torch.cat(extra_tensors, dim=-1))
        spk = self.spk_emb(x['speaker_id'].squeeze(-1))

        word_feats = extract_word_level_features_from_scores(self, x, mfcc_feat, wav2vec, phone)

        batch_size = phone.size(0)
        cls_tokens = self.cls_tokens.expand(batch_size, -1, -1)
        phone_tokens = self.phone_proj(phone)
        word_tokens = word_feats

        # Sequence layout: [CLS x num_cls_tokens | phone slots | word slots].
        seq_tokens = torch.cat([cls_tokens, phone_tokens, word_tokens], dim=1)
        seq_mask = torch.cat([
            torch.ones(batch_size, self.num_cls_tokens, device=phone_mask.device, dtype=torch.bool),
            phone_mask, word_mask
        ], dim=1)

        seq_tokens = seq_tokens + self.pos_embed[:, :seq_tokens.size(1)].to(seq_tokens.device)
        seq_tokens = self.transformer(seq_tokens, seq_mask)

        cls_out = seq_tokens[:, :self.num_cls_tokens]
        word_out = seq_tokens[:, self.num_cls_tokens + Config.MAX_PHONES:]

        word_scores = self.word_head(word_out)
        acc = self.acc_head(cls_out[:, 0])
        flu = self.flu_head(cls_out[:, 1])
        # Run the completeness head once; the debug log reuses the same value
        # instead of evaluating the head a second time.
        com_raw = self.com_head(cls_out[:, 2])
        logger.debug(f"com score before scaling: {com_raw}")
        com = com_raw * 1.2  # Scale to boost Completeness
        pro = self.pro_head(cls_out[:, 3])
        ovr = self.ovr_head(cls_out[:, 4])
        utterance_out = torch.cat([acc, flu, com, pro, ovr], dim=-1)

        return {'utterance': utterance_out, 'word_scores': word_scores}

In [None]:
def extract_word_level_features_from_scores(model, x, mfcc_feat, wav2vec_feat, phone_encoded):
    """Build one hidden-size feature vector per word slot for a single utterance.

    For every word segment, the encoded phones inside the segment are averaged
    and extended with two scalars: a heuristic pseudo-accuracy derived from the
    variance of the segment's RMS energy, and a constant zero error flag. That
    vector is concatenated with the utterance-level MFCC and wav2vec features
    and projected by ``model.word_comb``.

    Args:
        model: Object exposing ``hidden_dim``, ``phone_dim`` and ``word_comb``.
        x: Feature dict; reads ``utt_id``, ``score_dict`` and ``audio``.
        mfcc_feat: Utterance-level MFCC features, batch first.
        wav2vec_feat: Utterance-level wav2vec features, batch first.
        phone_encoded: Encoded phone features, shaped (batch, phone slots, phone_dim).

    Returns:
        Tensor of shape ``(1, Config.MAX_WORDS, model.hidden_dim)``; slots past
        the last word are zero.

    Raises:
        AssertionError: If the batch size is not 1.
    """
    batch_size = mfcc_feat.size(0)
    assert batch_size == 1, "Real-time inference expects batch size of 1"

    # Create every new tensor on the device of the incoming features rather
    # than on a global, so the function cannot mix devices.
    device = phone_encoded.device
    num_phone_slots = phone_encoded.size(1)

    detail = x['score_dict'][x['utt_id']]['words']
    if not detail:
        return torch.zeros(1, Config.MAX_WORDS, model.hidden_dim, device=device)

    segments = build_segments_from_scores(detail, num_phone_slots)

    # Loop-invariant work, hoisted out of the per-word loop.
    audio = x['audio'][0].cpu().numpy()
    context = torch.cat([mfcc_feat[0], wav2vec_feat[0]]).float()

    reps = []
    for (start, end), _word in zip(segments, detail):
        if end > start:
            phone_vec = phone_encoded[0, start:end].mean(dim=0)
            # Map phone-slot indices onto audio samples proportionally.
            start_idx = int(start * len(audio) / Config.MAX_PHONES)
            end_idx = int(end * len(audio) / Config.MAX_PHONES)
            segment_audio = audio[start_idx:end_idx]
            if segment_audio.size > 0:
                segment_energy = librosa.feature.rms(y=segment_audio)[0]
                energy_var = np.var(segment_energy) if len(segment_energy) > 1 else 0.0
            else:
                # Audio shorter than the number of phone slots: nothing to measure.
                energy_var = 0.0
            phone_acc = float(0.98 - 0.3 * min(energy_var, 1.0))  # Softer penalty
            extras = torch.tensor([phone_acc, 0.0], device=device)
            word_vec = torch.cat([phone_vec, extras])
        else:
            word_vec = torch.zeros(model.phone_dim + 2, device=device)
        combined = torch.cat([word_vec, context]).float()
        reps.append(model.word_comb(combined))

    # Pad with zero vectors up to the fixed number of word slots, then truncate.
    while len(reps) < Config.MAX_WORDS:
        reps.append(torch.zeros(model.hidden_dim, device=device))
    return torch.stack(reps[:Config.MAX_WORDS]).unsqueeze(0)

def build_segments_from_scores(words_info, total_length):
    """Turn the shared frame boundaries into consecutive ``(start, end)`` segments.

    Every word entry carries the same ``frame_boundaries`` sequence, so it is
    read from the first entry only.

    Args:
        words_info: List of word dicts, each with a ``'frame_boundaries'``
            sequence (list or array) of ascending indices.
        total_length: Number of available slots. Boundaries are clamped to
            ``[0, total_length]``; pass ``None`` to skip clamping.

    Returns:
        List of ``(start, end)`` tuples of plain ``int``s, one per adjacent pair
        of boundaries. Empty when ``words_info`` is empty or there are fewer
        than two boundaries.
    """
    if not words_info:
        return []

    boundaries = [int(b) for b in words_info[0]['frame_boundaries']]
    if total_length is not None:
        limit = int(total_length)
        boundaries = [max(0, min(b, limit)) for b in boundaries]

    return list(zip(boundaries[:-1], boundaries[1:]))

def extract_phone_level_features(audio, num_phones):
    """Compute per-phone energy and duration features for a waveform.

    The RMS energy track of ``audio`` is cut into ``num_phones`` equal spans
    (capped at ``Config.MAX_PHONES``). Slot ``i`` receives the mean energy of
    span ``i`` and the uniform per-phone duration in seconds.

    Args:
        audio: Waveform samples, as accepted by ``librosa.feature.rms``.
        num_phones: Requested number of phones.

    Returns:
        dict with ``'phone_features'``, a float32 tensor of shape
        ``(Config.MAX_PHONES, 2)``, and ``'mask'``, a bool tensor of shape
        ``(Config.MAX_PHONES,)`` that is True for the filled slots.
    """
    features = torch.zeros((Config.MAX_PHONES, 2), dtype=torch.float32)
    valid = torch.zeros(Config.MAX_PHONES, dtype=torch.bool)

    num_phones = min(num_phones, Config.MAX_PHONES)
    if num_phones <= 0:
        # Nothing to fill: return all-zero features and an all-False mask.
        return {'phone_features': features, 'mask': valid}

    rms_track = librosa.feature.rms(y=audio)[0]
    n_frames = len(rms_track)
    seconds_per_phone = (len(audio) / Config.SAMPLE_RATE) / max(1, num_phones)

    for slot in range(num_phones):
        lo = int(slot * n_frames / num_phones)
        hi = int((slot + 1) * n_frames / num_phones)
        if hi > lo:
            mean_energy = np.mean(rms_track[lo:hi])
        else:
            mean_energy = 0.0
        features[slot, 0] = float(mean_energy)
        features[slot, 1] = float(seconds_per_phone)

    valid[:num_phones] = True
    return {'phone_features': features, 'mask': valid}

In [None]:
# Load the trained model
checkpoint_path = os.path.join(Config.PREPROCESS_DIR, "best_model_ever_create.pt")
num_speakers = 125  # Number of speakers in the training set
model = EnhancedPronunciationModel(num_speakers=num_speakers).to(Config.DEVICE)
try:
    checkpoint = torch.load(checkpoint_path, map_location=Config.DEVICE)
    state_dict = checkpoint['model_state_dict']

    # Keep only the checkpoint entries the current architecture knows about.
    # The model's key set is computed once instead of once per checkpoint key.
    model_keys = set(model.state_dict().keys())
    filtered_state_dict = {k: v for k, v in state_dict.items() if k in model_keys}
    skipped_keys = sorted(k for k in state_dict if k not in model_keys)

    # strict=False tolerates parameters that are absent from the checkpoint.
    load_result = model.load_state_dict(filtered_state_dict, strict=False)

    # Report mismatches instead of hiding them: any parameter listed as missing
    # keeps its random initialisation and will affect the predicted scores.
    if skipped_keys:
        logger.warning(f"Ignored {len(skipped_keys)} checkpoint key(s) not present in the model: {skipped_keys}")
    if load_result.missing_keys:
        logger.warning(
            f"{len(load_result.missing_keys)} model parameter(s) were not found in the checkpoint "
            f"and remain randomly initialised: {list(load_result.missing_keys)}"
        )

    model.eval()
    logger.info("Model loaded successfully!")
except FileNotFoundError:
    logger.error(f"Checkpoint not found at {checkpoint_path}")
    raise
# Audio processing and feature extraction

# Whisper models are large; cache them by name so repeated calls to
# process_audio do not reload the weights every time.
_WHISPER_MODELS = {}


def _get_whisper_model(name="medium"):
    """Return the Whisper model called ``name``, loading it on first use only."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


def _collect_word_timestamps(whisper_result):
    """Flatten ``(start, end, word)`` triples across every segment of a Whisper result."""
    return [
        (w['start'], w['end'], w['word'])
        for segment in whisper_result.get('segments', [])
        for w in segment.get('words', [])
    ]


def process_audio(audio_path, transcript=None, whisper_model_name="medium"):
    """Load an audio file and build the feature dict expected by the model.

    Args:
        audio_path: Path of the audio file to score.
        transcript: Optional reference text. Non-letter, non-space characters
            are removed and the text is lower-cased before it is split into
            words. Without a transcript no word-level entries are produced.
        whisper_model_name: Name of the Whisper model used for word timestamps.

    Returns:
        ``(features, words)``: ``features`` is the dict of tensors and metadata
        passed to the model, and ``words`` is the list of transcript words
        (empty when no transcript was given).

    Raises:
        ValueError: If the decoded audio contains no samples.
        Exception: Any other failure is logged and re-raised unchanged.
    """
    try:
        waveform, orig_sr = torchaudio.load(audio_path)
        if waveform.ndim == 1:
            waveform = waveform.unsqueeze(0)
        if orig_sr != Config.SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, orig_sr, Config.SAMPLE_RATE)
        audio = waveform[0]  # first channel only
        y = audio.numpy()
        if len(y) == 0:
            raise ValueError("Audio file is empty or corrupted.")
        duration = len(y) / Config.SAMPLE_RATE

        # MFCCs, padded/truncated to a fixed number of frames.
        mfcc = librosa.feature.mfcc(
            y=y, sr=Config.SAMPLE_RATE, n_mfcc=Config.MFCC_DIM,
            hop_length=Config.MFCC_HOP_LENGTH, n_fft=Config.MFCC_N_FFT
        )
        mfcc = torch.FloatTensor(mfcc.T)
        if mfcc.shape[0] > Config.MAX_MFCC_LEN:
            mfcc = mfcc[:Config.MAX_MFCC_LEN]
        elif mfcc.shape[0] < Config.MAX_MFCC_LEN:
            mfcc = torch.nn.functional.pad(mfcc, (0, 0, 0, Config.MAX_MFCC_LEN - mfcc.shape[0]))

        # Utterance-level wav2vec embedding. The mono 1-D signal is passed to the
        # processor: handing it the (channels, samples) tensor made a stereo file
        # look like a batch of two utterances.
        with torch.no_grad():
            inputs = WAV2VEC_PROCESSOR(y, return_tensors="pt", sampling_rate=Config.SAMPLE_RATE)
            inputs = {k: v.to(Config.DEVICE) for k, v in inputs.items()}
            wav2vec_output = WAV2VEC_MODEL(**inputs)
            wav2vec_feat = wav2vec_output.last_hidden_state.mean(dim=1).cpu().squeeze(0)

        # NOTE(review): `sr` is not passed, so librosa.yin assumes its default
        # sampling rate rather than Config.SAMPLE_RATE. Left unchanged because the
        # checkpoint's training features may have been computed the same way --
        # confirm against the training pipeline before changing.
        pitch = librosa.yin(y, fmin=75, fmax=500)
        pitch = pitch[~np.isnan(pitch)]

        # Clean transcript by removing punctuation
        if transcript:
            transcript = ''.join(c for c in transcript if c.isalpha() or c.isspace()).strip()
        word_sequence = transcript.lower().split() if transcript else ["sil"]

        # Computed once and reused for both the prosodic vector and the extra scalars.
        energy_val = float(np.mean(librosa.feature.rms(y=y)[0]))
        zcr_val = float(np.mean(librosa.feature.zero_crossing_rate(y)[0]))
        speaking_rate_val = float(len(word_sequence) / duration if duration > 0 else 0.0)

        prosodic_values = [
            float(np.mean(pitch)) if len(pitch) > 0 else 0.0,
            float(np.std(pitch)) if len(pitch) > 0 else 0.0,
            energy_val,
            zcr_val,
            float(duration),
            speaking_rate_val,
            float(len(word_sequence))
        ]
        prosodic = torch.FloatTensor(prosodic_values).unsqueeze(0)

        # Word timestamps from Whisper, gathered across all segments.
        whisper_model = _get_whisper_model(whisper_model_name)
        whisper_result = whisper_model.transcribe(audio_path, word_timestamps=True)
        word_timestamps = _collect_word_timestamps(whisper_result)
        if not word_timestamps:
            logger.warning("Whisper returned no word timestamps; using default frame boundaries")
        logger.debug(f"word_timestamps: {word_timestamps}")

        # Convert each word's end time into a phone-slot index in [0, MAX_PHONES].
        frame_boundaries = [0]
        for start, end, _ in word_timestamps[:len(word_sequence)]:
            frame_idx = int((end / duration) * Config.MAX_PHONES)
            frame_boundaries.append(min(frame_idx, Config.MAX_PHONES))
        # Whisper found fewer words than the transcript: close the remaining words
        # at the final slot.
        while len(frame_boundaries) < len(word_sequence) + 1:
            frame_boundaries.append(Config.MAX_PHONES)
        frame_boundaries = np.array(frame_boundaries[:len(word_sequence) + 1], dtype=np.int64)
        logger.debug(f"frame_boundaries: {frame_boundaries}")

        # Three phone slots per word, capped at MAX_PHONES.
        num_phones = min(len(word_sequence) * 3, Config.MAX_PHONES)
        phone_data = extract_phone_level_features(y, num_phones)

        # Mean-pooled BERT embedding of the transcript (zeros when there is none).
        text_embed = torch.zeros(768)
        if transcript:
            encoded = TEXT_TOKENIZER(transcript, return_tensors="pt", padding=True, truncation=True)
            encoded = {k: v.to(Config.DEVICE) for k, v in encoded.items()}
            with torch.no_grad():
                text_embed = TEXT_ENCODER(**encoded).last_hidden_state.mean(dim=1).cpu().squeeze()

        # Share of samples outside the non-silent intervals (30 dB threshold).
        intervals = librosa.effects.split(y, top_db=30)
        speech_len = sum([(e - s) for s, e in intervals]) if intervals.size else 0
        pause_ratio_val = float(1 - speech_len / len(y)) if len(y) > 0 else 0.0
        word_coverage_val = 1.0

        energy = torch.tensor([[energy_val]], dtype=torch.float32)
        zcr = torch.tensor([[zcr_val]], dtype=torch.float32)
        pause_ratio = torch.tensor([[pause_ratio_val]], dtype=torch.float32)
        speaking_rate = torch.tensor([[speaking_rate_val]], dtype=torch.float32)
        word_coverage = torch.tensor([[word_coverage_val]], dtype=torch.float32)

        words = word_sequence if transcript else []
        word_count = min(len(words), Config.MAX_WORDS) if transcript else 0
        features = {
            'utt_id': 'real_time_utt',
            'mfcc': mfcc.unsqueeze(0),
            'wav2vec': wav2vec_feat.unsqueeze(0),
            'prosodic': prosodic,
            'speaker_id': torch.LongTensor([0]).unsqueeze(0),
            'phone_features': phone_data['phone_features'].unsqueeze(0),
            'phone_mask': phone_data['mask'].unsqueeze(0),
            'audio': audio.unsqueeze(0),
            'word_count': torch.tensor([word_count], dtype=torch.long),
            'word_prosodic': torch.zeros(1, Config.MAX_WORDS, dtype=torch.float32),
            'text_embed': text_embed.unsqueeze(0),
            'pause_ratio': pause_ratio,
            'speaking_rate': speaking_rate,
            'energy': energy,
            'zcr': zcr,
            'word_coverage': word_coverage,
            'word_mask': torch.zeros(1, Config.MAX_WORDS, dtype=torch.bool),
            'score_dict': {
                'real_time_utt': {
                    'words': [{'text': word, 'phones': [], 'frame_boundaries': frame_boundaries} for word in words[:Config.MAX_WORDS]]
                }
            },
            'frame_boundaries': torch.tensor(np.array(frame_boundaries), dtype=torch.long)  # Fix tensor warning
        }
        if word_count > 0:
            features['word_mask'][0, :word_count] = True
        return features, words
    except Exception as e:
        logger.error(f"Error processing audio: {str(e)}")
        raise

In [None]:
def run_real_time_inference(features, word_threshold=0.7):
    """Run the model on one utterance and convert its outputs to scores in [0, 1].

    Args:
        features: Feature dict as returned by ``process_audio``. Tensor values
            are moved to ``Config.DEVICE``; other values pass through unchanged.
        word_threshold: A word is predicted correct when its probability is
            strictly greater than this value.

    Returns:
        ``(utterance_out, word_probs, word_preds)``:
        ``utterance_out`` holds five scores in [0, 1] ordered as accuracy,
        fluency, completeness, prosody, overall. ``word_probs`` holds one
        probability per word slot. ``word_preds`` holds the matching 0/1
        decisions.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    def _sigmoid(z):
        return 1 / (1 + np.exp(-z))

    try:
        features = {k: (v.to(Config.DEVICE) if isinstance(v, torch.Tensor) else v) for k, v in features.items()}
        with torch.no_grad():
            outputs = model(features)
        raw = outputs['utterance'].cpu().numpy()[0]
        utterance_out = np.concatenate([
            _sigmoid(raw[:2]),  # acc, flu
            # com is already sigmoided, then scaled by 1.2 inside the model, so it
            # can exceed 1.0; clip it so it stays on the same 0-1 scale as the rest.
            np.clip(raw[2:3], 0.0, 1.0),
            _sigmoid(raw[3:])  # pro, ovr
        ])
        word_scores = outputs['word_scores'].cpu().numpy()[0].squeeze(-1)
        word_probs = _sigmoid(word_scores)
        word_preds = (word_probs > word_threshold).astype(int)
        return utterance_out, word_probs, word_preds
    except Exception as e:
        logger.error(f"Error during inference: {str(e)}")
        raise

In [None]:
def display_results(utterance_scores, word_probs, word_preds, words):
    """Print the utterance-level scores and, when words are given, per-word verdicts.

    Args:
        utterance_scores: Indexable of at least five values ordered as accuracy,
            fluency, completeness, prosody, overall; each is multiplied by 10
            for display.
        word_probs: Per-word probabilities, aligned with ``words``.
        word_preds: Per-word decisions; the value 1 is shown as "Correct".
        words: Transcript words; when empty, a notice is printed instead of the
            per-word section.
    """
    print("\n=== Pronunciation Assessment Results ===")
    print("Utterance-Level Scores (0-10 scale):")
    aspects = ('Accuracy', 'Fluency', 'Completeness', 'Prosody', 'Overall')
    for idx in range(len(aspects)):
        aspect = aspects[idx]
        score = utterance_scores[idx] * 10
        print(f"{aspect}: {score:.2f}/10")

    # Without a transcript there is nothing to report per word.
    if not words:
        print("\nNo word-level feedback available (transcript not provided).")
        return

    print("\nWord-Level Correctness:")
    rows = zip(words[:Config.MAX_WORDS], word_probs, word_preds)
    for j, row in enumerate(rows):
        word, prob, pred = row
        if pred == 1:
            pred_class = "Correct"
        else:
            pred_class = "Incorrect"
        print(f"  Word {j+1}: '{word}' - Pred = {pred_class} (Prob = {prob:.3f})")

def main():
    """Interactive Colab entry point: upload audio, read a transcript, print the scores.

    Any exception raised along the way is logged and printed rather than
    propagated.
    """
    from google.colab import files
    print("Upload your audio file (WAV, 16kHz, mono):")
    try:
        uploaded = files.upload()
        if not uploaded:
            raise ValueError("No file uploaded.")
        # Only the first uploaded file is scored.
        audio_path = next(iter(uploaded.keys()))

        transcript = input("Enter the transcript (required for word-level feedback, press Enter to skip): ").strip()
        # Clean transcript: keep letters and whitespace only.
        if transcript:
            kept_chars = [c for c in transcript if c.isalpha() or c.isspace()]
            transcript = ''.join(kept_chars).strip()

        features, words = process_audio(audio_path, transcript)
        scores = run_real_time_inference(features)
        utterance_scores, word_probs, word_preds = scores
        display_results(utterance_scores, word_probs, word_preds, words)

    except Exception as e:
        logger.error(f"Error in main: {str(e)}")
        print(f"Error: {str(e)}")


if __name__ == "__main__":
    main()

Upload your audio file (WAV, 16kHz, mono):


Saving THE woman look up to him.wav to THE woman look up to him.wav
Enter the transcript (required for word-level feedback, press Enter to skip): THE WOMAN LOOKED UP TO HIM.


100%|█████████████████████████████████████| 1.42G/1.42G [00:26<00:00, 58.7MiB/s]



=== Pronunciation Assessment Results ===
Utterance-Level Scores (0-10 scale):
Accuracy: 6.23/10
Fluency: 6.19/10
Completeness: 8.77/10
Prosody: 6.09/10
Overall: 5.84/10

Word-Level Correctness:
  Word 1: 'the' - Pred = Correct (Prob = 0.705)
  Word 2: 'woman' - Pred = Incorrect (Prob = 0.699)
  Word 3: 'looked' - Pred = Incorrect (Prob = 0.697)
  Word 4: 'up' - Pred = Incorrect (Prob = 0.698)
  Word 5: 'to' - Pred = Incorrect (Prob = 0.700)
  Word 6: 'him' - Pred = Incorrect (Prob = 0.593)


  output = torch._nested_tensor_from_mask(
