In [1]:
import torch

# Environment check: show the installed PyTorch build, whether a CUDA device
# can be used, and the value of torch.version.cuda, before any model is loaded.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.8.0+cu129
CUDA available: True
CUDA version: 12.9


In [2]:
# Show the kernel's current working directory before it is changed below.
%pwd

'd:\\Projects\\Quran-App-Backend\\Research'

In [3]:
import os

# Step up one level so that the relative "Data/..." paths used by the later
# cells resolve from the working directory.
# Guarded on the reference data folder: once it is reachable from the current
# directory the move is skipped, so re-running this cell no longer climbs one
# directory higher on every execution.
if not os.path.isdir(os.path.join("Data", "Arabic_Alphabets")):
    os.chdir("../")

In [4]:
# Show the working directory again to confirm where the kernel now runs.
%pwd

'd:\\Projects\\Quran-App-Backend'

In [None]:
import os
import torch
import librosa
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# -----------------------------
# Model Setup
# -----------------------------
# Checkpoint identifier handed to both from_pretrained() calls below.
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loaded once at module level: extract_embedding() reads `feature_extractor`
# and `model` as globals, and the model is moved to `device` here.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(device)

# -----------------------------
# Extract Embedding Function
# -----------------------------
def extract_embedding(file_path):
    """Return one mean-pooled Wav2Vec2 embedding for an audio file.

    The file is loaded with librosa at 16 kHz, passed through the module-level
    ``feature_extractor`` and ``model``, and the model's ``last_hidden_state``
    is averaged over dim 1.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        NumPy array (moved to the CPU) holding the averaged hidden state.
    """
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    encoded = feature_extractor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(encoded.input_values.to(device)).last_hidden_state

    return hidden_states.mean(dim=1).cpu().numpy()

# -----------------------------
# Compare Two Audio Files
# -----------------------------
def similarity_wav2vec(file1, file2):
    """Score how alike two audio files are, as a percentage.

    Both files are embedded with extract_embedding() and the two embeddings
    are compared with cosine similarity; the result is scaled by 100 and
    rounded to two decimals.

    NOTE(review): in the run captured below this cell every letter scored
    between 99.99 and 100.0, so on that data the score barely separates one
    letter from another.

    Args:
        file1: Path of the first audio file.
        file2: Path of the second audio file.

    Returns:
        Cosine similarity of the two embeddings times 100, rounded to 2 places.
    """
    embeddings = [extract_embedding(path) for path in (file1, file2)]
    cosine = cosine_similarity(embeddings[0], embeddings[1])[0][0]
    return round(cosine * 100, 2)

# -----------------------------
# Match User Input Against All Reference Folders
# -----------------------------
def match_alphabet(user_audio, reference_root):
    """Rank every alphabet folder by its best similarity to the user's audio.

    Each sub-directory of ``reference_root`` is treated as one letter. The
    user's recording is compared against every ``.wav`` file in that folder
    (extension matched case-insensitively) and the highest score is kept for
    the letter. A folder without usable reference files keeps a score of 0.0.

    The embedding of ``user_audio`` is extracted once, on first use, and then
    reused for every reference file instead of being recomputed per
    comparison.

    Args:
        user_audio: Path of the recording to identify.
        reference_root: Directory whose sub-directories each hold the
            reference recordings of one letter.

    Returns:
        ``(best_match, results)``. ``results`` is a list of
        ``(folder_name, best_score)`` tuples sorted by score, highest first.
        ``best_match`` is ``results[0]``, or ``("None", 0.0)`` when
        ``reference_root`` contains no sub-directories.
    """
    results = []
    # Filled in by the first successful extraction, then shared by all folders.
    user_emb = None

    # Each subfolder = one alphabet
    for folder_name in os.listdir(reference_root):
        folder_path = os.path.join(reference_root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        best_score_for_letter = 0.0

        for file_name in os.listdir(folder_path):
            if not file_name.lower().endswith(".wav"):
                continue
            ref_path = os.path.join(folder_path, file_name)
            try:
                ref_emb = extract_embedding(ref_path)
                if user_emb is None:
                    user_emb = extract_embedding(user_audio)
                score = round(cosine_similarity(ref_emb, user_emb)[0][0] * 100, 2)
                best_score_for_letter = max(best_score_for_letter, score)
            except Exception as e:
                # Best effort: one unreadable file must not abort the ranking.
                print(f"⚠️ Skipping {ref_path}: {e}")

        results.append((folder_name, best_score_for_letter))

    # Highest similarity first.
    results.sort(key=lambda x: x[1], reverse=True)

    best_match = results[0] if results else ("None", 0.0)
    return best_match, results

# -----------------------------
# Example Usage
# -----------------------------
# Demo run: rank all letters for one user recording and print the ranking.
if __name__ == "__main__":
    reference_root = "Data/Arabic_Alphabets"   # folder with subfolders for each alphabet
    user_input = "Data/user_input1.wav"         # user spoken file

    # best_match is the top (letter, score) pair; all_results is the full
    # list of (letter, score) pairs sorted by score, highest first.
    best_match, all_results = match_alphabet(user_input, reference_root)

    print(f"\n✅ Best Match: {best_match[0]} with {best_match[1]}% similarity\n")

    print("📊 Full Ranking:")
    for letter, score in all_results:
        # Letter name is padded to 10 characters so the scores line up.
        print(f"{letter:10s} → {score}%")


Fetching 1 files: 100%|██████████| 1/1 [00:00<00:00, 1335.77it/s]



✅ Best Match: جيم with 100.0% similarity

📊 Full Ranking:
جيم        → 100.0%
سين        → 100.0%
ميم        → 100.0%
الف        → 99.99%
باء        → 99.99%
تاء        → 99.99%
ثاء        → 99.99%
حاء        → 99.99%
خاء        → 99.99%
دال        → 99.99%
ذال        → 99.99%
راء        → 99.99%
زاي        → 99.99%
صاد        → 99.99%
ضاد        → 99.99%
طاء        → 99.99%
ظاء        → 99.99%
فاء        → 99.99%
قاف        → 99.99%
كاف        → 99.99%
لام        → 99.99%
نون        → 99.99%
هاء        → 99.99%
واو        → 99.99%
ياء        → 99.99%


In [None]:
import os
import torch
import librosa
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# ============================================================
# CONFIGURATION
# ============================================================

MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"
REFERENCE_ROOT = "Data/Arabic_Alphabets"   # Path to all alphabet folders
TARGET_LETTER = "باء"                      # <-- change this to the letter you want to evaluate
MODEL_FILE = f"{TARGET_LETTER}_model.pkl"  # Pickle file holding the averaged embedding for TARGET_LETTER
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ============================================================
# MODEL LOADING
# ============================================================

# Load the feature extractor and the encoder for MODEL_NAME and move the
# encoder to DEVICE. Both are module-level globals read by extract_embedding().
print("🔹 Loading Wav2Vec2 model...")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
print("✅ Model loaded successfully.\n")

# ============================================================
# FEATURE EXTRACTION
# ============================================================

def extract_embedding(file_path):
    """Embed one audio file as the mean of its Wav2Vec2 hidden states.

    The audio is loaded with librosa at 16 kHz, encoded by the module-level
    ``feature_extractor``, run through ``model`` on ``DEVICE`` without
    gradients, and ``last_hidden_state`` is averaged over dim 1.

    Args:
        file_path: Path of the audio file to embed.

    Returns:
        NumPy array (moved to the CPU) with the averaged hidden state.
    """
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    encoded = feature_extractor(
        waveform, sampling_rate=sample_rate, return_tensors="pt", padding=True
    )
    input_values = encoded.input_values.to(DEVICE)
    with torch.no_grad():
        hidden_states = model(input_values).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.cpu().numpy()

# ============================================================
# BUILD LETTER MODEL
# ============================================================

def build_letter_model(letter_folder, save_as):
    """Average the embeddings of every .wav file in a folder and pickle the result.

    Args:
        letter_folder: Directory holding the reference recordings of one letter.
        save_as: Path of the pickle file to write the averaged embedding to.

    Returns:
        The averaged embedding, computed with ``keepdims=True`` so it keeps a
        leading axis of length 1.

    Raises:
        ValueError: If the folder contains no ``.wav`` files.
    """
    print(f"🎙️ Building model for '{TARGET_LETTER}' from {letter_folder} ...")
    embeddings = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the log
    # and the order in which embeddings are stacked the same on every run.
    for file in sorted(os.listdir(letter_folder)):
        # Case-insensitive match so recordings saved as "*.WAV" are not
        # silently left out of the average.
        if not file.lower().endswith(".wav"):
            continue
        emb = extract_embedding(os.path.join(letter_folder, file))
        embeddings.append(emb)
        print(f"  ✅ Processed: {file}")
    if not embeddings:
        raise ValueError("No .wav files found in the given folder.")
    # Stack the per-file embeddings row-wise and average across files.
    model_emb = np.mean(np.vstack(embeddings), axis=0, keepdims=True)
    with open(save_as, "wb") as f:
        pickle.dump(model_emb, f)
    print(f"✅ Model saved as {save_as}\n")
    return model_emb

# ============================================================
# COMPARE USER AUDIO
# ============================================================

def check_pronunciation(user_audio, model_file):
    """Score a user recording against a pickled reference embedding.

    Args:
        user_audio: Path of the recording to evaluate.
        model_file: Path of a pickle file containing the reference embedding.

    Returns:
        Cosine similarity between the reference and the user embedding,
        multiplied by 100 and rounded to two decimals. The value is also
        printed, labelled with the module-level ``TARGET_LETTER``.
    """
    # NOTE(review): pickle.load can execute code stored in the file; only load
    # model files that were produced locally by this notebook.
    with open(model_file, "rb") as handle:
        reference = pickle.load(handle)
    candidate = extract_embedding(user_audio)
    similarity = round(cosine_similarity(reference, candidate)[0][0] * 100, 2)
    print(f"🎧 Similarity to '{TARGET_LETTER}': {similarity}%")
    return similarity

# ============================================================
# MAIN SCRIPT
# ============================================================

# Demo run: build (or reuse) the reference model for TARGET_LETTER, score one
# user recording against it and print a verdict.
if __name__ == "__main__":
    letter_folder = os.path.join(REFERENCE_ROOT, TARGET_LETTER)

    # Step 1: build the model only when MODEL_FILE does not exist yet. An
    # existing file is reused as-is, so delete it to force a rebuild after the
    # reference recordings change.
    if not os.path.exists(MODEL_FILE):
        build_letter_model(letter_folder, MODEL_FILE)

    # Step 2: compare the user's pronunciation with the saved model.
    user_audio = "Data/user_input.wav"  # or change to user_input1.wav etc.
    similarity = check_pronunciation(user_audio, MODEL_FILE)

    # Step 3: map the percentage to a verdict: >= 90 correct, 70 up to (but
    # not including) 90 close, anything lower incorrect.
    if similarity >= 90:
        print("✅ Excellent pronunciation! (Correct)")
    elif 70 <= similarity < 90:
        print("⚠️ Close! Try improving pronunciation.")
    else:
        print("❌ Incorrect pronunciation.")


🔹 Loading Wav2Vec2 model...


Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


✅ Model loaded successfully.

🎙️ Building model for 'باء' from Data/Arabic_Alphabets\باء ...
  ✅ Processed: 0_1_Ba_T1 (2).wav
  ✅ Processed: 0_1_Ba_T1 (3).wav
  ✅ Processed: 0_1_Ba_T1 (4).wav
  ✅ Processed: 0_1_Ba_T1 (5).wav
  ✅ Processed: 0_1_Ba_T1 (6).wav
  ✅ Processed: 0_1_Ba_T1 (7).wav
  ✅ Processed: 0_1_Ba_T1 (8).wav
  ✅ Processed: 0_1_Ba_T1.wav
  ✅ Processed: 0_1_Ba_T10 (2).wav
  ✅ Processed: 0_1_Ba_T10 (3).wav
  ✅ Processed: 0_1_Ba_T10 (4).wav
  ✅ Processed: 0_1_Ba_T10 (5).wav
  ✅ Processed: 0_1_Ba_T10 (6).wav
  ✅ Processed: 0_1_Ba_T10 (7).wav
  ✅ Processed: 0_1_Ba_T10 (8).wav
  ✅ Processed: 0_1_Ba_T10.wav
  ✅ Processed: 0_1_Ba_T100 (2).wav
  ✅ Processed: 0_1_Ba_T100 (3).wav
  ✅ Processed: 0_1_Ba_T100 (4).wav
  ✅ Processed: 0_1_Ba_T100 (5).wav
  ✅ Processed: 0_1_Ba_T100 (6).wav
  ✅ Processed: 0_1_Ba_T100 (7).wav
  ✅ Processed: 0_1_Ba_T100 (8).wav
  ✅ Processed: 0_1_Ba_T100.wav
  ✅ Processed: 0_1_Ba_T11 (2).wav
  ✅ Processed: 0_1_Ba_T11 (3).wav
  ✅ Processed: 0_1_Ba_T11 (4).wav
 

In [8]:
import os
import pickle
import math
import numpy as np
import torch
import librosa
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from sklearn.metrics.pairwise import cosine_similarity

# ---------------- CONFIG ----------------
MODEL_NAME = "facebook/wav2vec2-large-xlsr-53"  # or smaller variant if you want speed
REFERENCE_ROOT = "Data/Arabic_Alphabets"  # root folder; one sub-folder per letter
TARGET_LETTER = "الف"  # sub-folder of REFERENCE_ROOT whose recordings are the reference
MODEL_CACHE = f"{TARGET_LETTER}_seq_cache.pkl"  # store frame-level embeddings per ref file
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Controls how distance maps to similarity. You may tune this.
# similarity = 100 * exp(-BETA * dtw_distance)
# A larger BETA makes the similarity fall off faster as the distance grows.
BETA = 3.0

# If your sequences are long, you can downsample frame dimension in time (take every k-th frame)
# A value of 1 keeps every frame (downsampling is only applied when > 1).
FRAME_DOWNSAMPLE = 1  # set to 2 or 3 if you need speed vs accuracy

# ---------------- Model load ----------------
# Load the feature extractor and encoder for MODEL_NAME, move the encoder to
# DEVICE and switch it to evaluation mode. Both objects are module-level
# globals read by extract_frame_embeddings().
print("Loading model...")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
print("Model loaded.")

# ---------------- Utilities ----------------
def trim_silence_and_load(path, sr=16000, top_db=25):
    """Load an audio file and return it with silence trimmed off.

    Args:
        path: Path of the audio file.
        sr: Sample rate passed to ``librosa.load``.
        top_db: Threshold passed to ``librosa.effects.trim``.

    Returns:
        The trimmed signal, or the untrimmed signal when trimming leaves no
        samples at all.
    """
    audio, _ = librosa.load(path, sr=sr)
    trimmed, _ = librosa.effects.trim(audio, top_db=top_db)
    # An empty result would be unusable downstream, so keep the original.
    return trimmed if trimmed.size else audio

def extract_frame_embeddings(wav_path):
    """Return the L2-normalised frame-level embeddings of one audio file.

    The audio is loaded and silence-trimmed by trim_silence_and_load(), run
    through the module-level ``feature_extractor`` and ``model`` without
    gradients, and the first item of ``last_hidden_state`` is kept without any
    pooling. When ``FRAME_DOWNSAMPLE`` is greater than 1 only every
    ``FRAME_DOWNSAMPLE``-th frame is kept.

    Args:
        wav_path: Path of the audio file.

    Returns:
        NumPy array of shape (T, D): one unit-length row per frame. Rows whose
        norm is zero are left as zeros.
    """
    waveform = trim_silence_and_load(wav_path, sr=16000, top_db=25)
    encoded = feature_extractor(
        waveform, sampling_rate=16000, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        hidden = model(encoded.input_values.to(DEVICE)).last_hidden_state
    frames = hidden[0].cpu().numpy()

    # Optional thinning along the time axis.
    if FRAME_DOWNSAMPLE > 1:
        frames = frames[::FRAME_DOWNSAMPLE]

    # Scale every frame to unit length so a dot product equals the cosine;
    # zero-norm rows divide by 1 instead of 0.
    lengths = np.linalg.norm(frames, axis=1, keepdims=True)
    lengths[lengths == 0] = 1.0
    return frames / lengths

def cosine_cost_matrix(a, b):
    """Build the pairwise cost matrix ``cost[i, j] = 1 - a[i] . b[j]``.

    The dot product equals the cosine only when the rows of ``a`` and ``b``
    are already unit length; the function does not normalise them itself.
    Products are clipped to [-1, 1] to absorb rounding error, so every cost
    lies in [0, 2].

    Args:
        a: Array of shape (T_a, D).
        b: Array of shape (T_b, D).

    Returns:
        Array of shape (T_a, T_b).
    """
    return 1.0 - np.clip(np.dot(a, b.T), -1.0, 1.0)

def dtw_distance_from_cost(cost):
    """Run the DTW recurrence over a cost matrix and return the total cost.

    Each cell adds its own cost to the cheapest of its three predecessors
    (up, left, diagonal). The result is the accumulated cost of the cheapest
    path from the first cell to the last one; it is not divided by the path
    length.

    Args:
        cost: 2-D array of pairwise costs.

    Returns:
        Accumulated cost at the last cell. ``inf`` when exactly one dimension
        of ``cost`` is empty, ``0.0`` when both are.
    """
    rows, cols = cost.shape
    # Only two rows of the table are alive at any time. Index 0 is the border
    # column; previous[0] = 0 marks the single valid starting point.
    previous = np.full(cols + 1, np.inf, dtype=float)
    previous[0] = 0.0
    for i in range(rows):
        current = np.full(cols + 1, np.inf, dtype=float)
        for j in range(cols):
            best_prior = min(previous[j + 1],  # step from the row above
                             current[j],       # step from the column to the left
                             previous[j])      # diagonal step
            current[j + 1] = cost[i, j] + best_prior
        previous = current
    return previous[cols]

def dtw_distance(seq_a, seq_b):
    """DTW distance between two frame sequences, using the cosine cost.

    Args:
        seq_a: Array of shape (T_a, D); rows are expected to be unit length.
        seq_b: Array of shape (T_b, D); rows are expected to be unit length.

    Returns:
        Accumulated DTW cost, or ``inf`` when either sequence has no frames.
    """
    # Nothing can be aligned against an empty sequence.
    if 0 in (seq_a.shape[0], seq_b.shape[0]):
        return float("inf")
    return dtw_distance_from_cost(cosine_cost_matrix(seq_a, seq_b))

def distance_to_similarity(dist, beta=BETA):
    """Map a non-negative distance to a similarity percentage in [0, 100].

    similarity = 100 * exp(-beta * dist), clamped to the range [0, 100].
    A distance of 0 gives 100; an infinite distance gives 0.

    Args:
        dist: Distance to convert.
        beta: Sensitivity; larger values make the similarity drop faster.

    Returns:
        The similarity as a float percentage.
    """
    raw = 100.0 * math.exp(-beta * dist)
    # Argument order matters: with `raw` first, a NaN passes through unchanged
    # exactly as it does with explicit `<` / `>` comparisons.
    return max(min(raw, 100.0), 0.0)

# ---------------- Build cache (per-reference file frame sequences) ----------------
def build_sequence_cache(reference_root, target_letter, cache_file=MODEL_CACHE):
    """Extract frame embeddings for every reference recording and pickle them.

    Args:
        reference_root: Directory containing one sub-folder per letter.
        target_letter: Name of the sub-folder to process.
        cache_file: Path of the pickle file to write.

    Returns:
        Dict mapping each processed file name to its (T, D) frame array.

    Raises:
        FileNotFoundError: If the letter folder does not exist.
        ValueError: If no reference file could be processed. Nothing is
            written in that case: the caller only rebuilds when the cache
            file is missing, so an empty cache on disk would be reused on
            every later run.
    """
    folder = os.path.join(reference_root, target_letter)
    if not os.path.isdir(folder):
        raise FileNotFoundError(f"Folder not found: {folder}")
    cache = {}
    print(f"Building sequence cache for letter '{target_letter}' from: {folder}")
    # Sorted so the log and the cache's key order are the same on every run.
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(".wav"):
            continue
        path = os.path.join(folder, fname)
        try:
            seq = extract_frame_embeddings(path)  # (T, D)
            cache[fname] = seq
            print(f"  processed {fname}: frames={seq.shape[0]}")
        except Exception as e:
            # Best effort: one bad recording must not abort the whole build.
            print(f"  skipped {fname}: {e}")
    if not cache:
        raise ValueError(f"No usable .wav files found in: {folder}")
    with open(cache_file, "wb") as f:
        pickle.dump(cache, f)
    print(f"Saved cache to {cache_file}. Total refs: {len(cache)}")
    return cache

# ---------------- Compare a user file to target letter ----------------
def verify_single_word(user_wav, cache_file=MODEL_CACHE):
    """Score a user recording against the cached reference sequences.

    The user's frame embeddings are compared with every cached reference by
    DTW; the smallest distance is converted to a similarity percentage.

    Args:
        user_wav: Path of the recording to evaluate.
        cache_file: Pickle file written by build_sequence_cache().

    Returns:
        ``(similarity, best_dist)``. When the user audio yields no frames, or
        no reference could be compared, this is ``(0.0, inf)``. The result is
        always a 2-tuple so callers can unpack it unconditionally.

    Raises:
        FileNotFoundError: If ``cache_file`` does not exist.
    """
    if not os.path.exists(cache_file):
        raise FileNotFoundError("Cache not found. Run build_sequence_cache first.")
    # NOTE(review): pickle.load can execute code stored in the file; only load
    # cache files that were produced locally by build_sequence_cache().
    with open(cache_file, "rb") as f:
        cache = pickle.load(f)
    user_seq = extract_frame_embeddings(user_wav)
    if user_seq.shape[0] == 0:
        print("Warning: user audio produced zero frames after trimming.")
        # Same shape as the normal return value (previously a bare 0.0, which
        # broke `sim, dist = verify_single_word(...)` in the caller).
        return 0.0, float("inf")

    # compute DTW distance vs each reference, take minimum
    best_dist = float("inf")
    for ref_name, ref_seq in cache.items():
        try:
            d = dtw_distance(ref_seq, user_seq)
            if d < best_dist:
                best_dist = d
        except Exception as e:
            print(f"DTW failed for {ref_name}: {e}")

    similarity = distance_to_similarity(best_dist)
    return similarity, best_dist

# ---------------- CLI usage ----------------
# Demo run: build (or reuse) the frame-sequence cache for TARGET_LETTER, score
# one user recording against it and print a verdict.
if __name__ == "__main__":
    # Build cache if not exists. An existing cache file is reused as-is, so
    # delete it to force a rebuild after the reference recordings change.
    if not os.path.exists(MODEL_CACHE):
        build_sequence_cache(REFERENCE_ROOT, TARGET_LETTER, MODEL_CACHE)

    # Example check
    user_file = "Data/user_input2.wav"  # change as needed
    sim, dist = verify_single_word(user_file, MODEL_CACHE)
    print(f"\nDTW distance = {dist:.4f}")
    print(f"Similarity to '{TARGET_LETTER}' = {sim:.2f}%")
    # Verdict thresholds: >= 90 correct, >= 70 close, anything lower incorrect.
    if sim >= 90:
        print("Result: ✅ Correct pronunciation.")
    elif sim >= 70:
        print("Result: ⚠️ Partial / close.")
    else:
        print("Result: ❌ Incorrect.")


Loading model...


Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


Model loaded.

DTW distance = 0.2284
Similarity to 'الف' = 50.40%
Result: ❌ Incorrect.
