In [None]:
# phonemizer is imported further down (Punctuation, EspeakBackend, Separator)
# to turn the reference transcript into phonemes.
# NOTE(review): the version is not pinned, so results may drift between runs.
!pip install phonemizer

In [None]:
# Additional Python packages.
# - soundfile is imported as `sf` and sequence_align is imported by the
#   alignment cell later in this notebook.
# - NOTE(review): pyaudio, python-Levenshtein, requests, ipywidgets and
#   sounddevice are not imported by any cell visible in this notebook —
#   confirm they are still needed.
# - NOTE(review): the PortAudio development headers are only installed by an
#   apt-get cell at the very end of the notebook; if pyaudio has to be built
#   from source that may be too late — confirm the intended order.
!pip install pyaudio
!pip install python-Levenshtein
!pip install requests ipywidgets sounddevice soundfile
!pip install sequence_align

In [None]:
!sudo apt-get install espeak-ng

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa
import soundfile as sf
import numpy as np

# Choose a model checkpoint
model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft" # Example: multilingual phoneme model

# Load the processor and model
# NOTE: the names `processor` and `model` are rebound by later cells (first to
# the Whisper objects, then to the Qwen model), so the phoneme-recognition
# cells that follow must run before those cells.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

In [None]:
# Path of the recording to analyse. The same path is hard-coded again in the
# transcription cell further down; keep the two in sync.
audio_path = "/kaggle/input/audiotest/Recording.wav" # Replace with your audio file path

# Load audio file
# sr=None is passed so that `sample_rate` reports the rate librosa read from
# the file, which the check below compares against 16 kHz.
speech, sample_rate = librosa.load(audio_path, sr=None)


# Resample to 16kHz if needed
# (16000 is also the sampling_rate handed to the processor in the next cell)
if sample_rate != 16000:
    speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)

In [None]:
# Prepare input values for the model
# The `return_tensors="pt"` argument ensures PyTorch tensors are returned.
# `processor` must still be the Wav2Vec2 processor here; a later cell rebinds
# that name to the Whisper processor.
input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values

In [None]:
# Inference only: run the forward pass with autograd disabled and keep the
# raw logits for the argmax in the next cell.
with torch.no_grad():
    logits = model(input_values).logits

In [None]:
# Take argmax to get the most probable phoneme IDs
predicted_ids = torch.argmax(logits, dim=-1)

# Decode the predicted IDs to phoneme string
# (the result is treated as a collection of strings: the next cell joins it)
phoneme_transcription = processor.batch_decode(predicted_ids)

print(f"Phoneme Transcription: {phoneme_transcription}")

In [None]:
# Collapse the decoded batch into one string; `phoneme` is the recorded
# phoneme sequence that the alignment cell compares with the reference.
phoneme = " ".join(phoneme_transcription)
phoneme

In [None]:
import torch
import torchaudio
from transformers import AutoTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, BitsAndBytesConfig
import librosa
import numpy as np
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch

In [None]:
# Device and checkpoint used for speech-to-text.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "openai/whisper-small.en"

# Load processor
# NOTE: this rebinds `processor` (previously the Wav2Vec2 processor).
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


# NOTE: this rebinds `model` (previously the Wav2Vec2 model).
# NOTE(review): the transcription helper defined below builds its own pipeline
# from `model_id`; no visible cell reads this `processor`/`tokenizer`/`model`
# before they are rebound again by the Qwen cell — confirm they are needed.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
model.to(device)



In [None]:
# Alternative approach using Transformers pipeline (more robust)
from transformers import pipeline

def audio_to_text_pipeline(audio_file_path, chunk_length_s=30):
    """
    Convert an audio file to text with a Transformers ASR pipeline.

    The pipeline is built from the module-level ``model_id`` and runs on the
    first CUDA device (float16) when one is available, otherwise on CPU
    (float32).

    Args:
        audio_file_path (str): Path to the audio file
        chunk_length_s (int): Length of audio chunks in seconds

    Returns:
        str: Transcribed text

    Raises:
        Exception: The original pipeline error is re-raised when the pipeline
            fails and no ``audio_to_text_chunked`` fallback is defined.
    """
    try:
        # Create a speech recognition pipeline
        pipe = pipeline(
            "automatic-speech-recognition",
            model=model_id,
            device=0 if torch.cuda.is_available() else -1,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            chunk_length_s=chunk_length_s,
            return_timestamps=True  # Enable timestamps for better alignment
        )

        # Process the audio file
        result = pipe(audio_file_path)

        # Extract text from result
        if isinstance(result, dict):
            return result.get("text", "")
        elif isinstance(result, list):
            return " ".join([chunk.get("text", "") for chunk in result])
        else:
            return str(result)

    except Exception as e:
        print(f"Pipeline method failed: {e}")
        # The manual fallback is optional: it is not defined in every version
        # of this notebook. Look it up instead of calling it blindly, so that a
        # missing helper does not replace the real error with a NameError.
        fallback = globals().get("audio_to_text_chunked")
        if fallback is None:
            raise
        print("Falling back to manual method...")
        return fallback(audio_file_path, chunk_length_s=chunk_length_s)

def audio_to_text_simple(audio_file_path):
    """
    Transcribe an audio file, trying progressively simpler methods.

    The Transformers pipeline helper is tried first. If it fails, the optional
    helpers ``audio_to_text_chunked`` and then ``audio_to_text`` are tried, but
    only when they are actually defined in the notebook.

    Args:
        audio_file_path (str): Path to the audio file

    Returns:
        str: Transcribed text

    Raises:
        Exception: The pipeline error when no fallback helper is defined, or
            the error of the last fallback that was attempted.
    """
    try:
        # Try pipeline method first (most reliable)
        return audio_to_text_pipeline(audio_file_path)
    except Exception as pipeline_error:
        print(f"Pipeline failed, trying chunked method: {pipeline_error}")

        # Fallbacks are looked up rather than called blindly: when they are
        # missing, the pipeline error is the one worth reporting.
        chunked = globals().get("audio_to_text_chunked")
        basic = globals().get("audio_to_text")
        if chunked is None and basic is None:
            raise

        if chunked is not None:
            try:
                # Fallback to chunked method
                return chunked(audio_file_path)
            except Exception as chunked_error:
                print(f"Chunked method failed: {chunked_error}")
                if basic is None:
                    raise

        # Final fallback
        return basic(audio_file_path)

In [None]:
# Transcribe the same recording that was loaded for the phoneme model
# (`audio_path`), so the reference text and the recognised phonemes always
# describe the same audio instead of relying on two hard-coded paths.
transcript = audio_to_text_simple(audio_path)

In [None]:
# Show the transcript; it is used as the reference text for phonemization below.
print(transcript)

In [None]:
import phonemizer
from phonemizer.punctuation import Punctuation
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator

def generate_reference_phoneme(reference_text):
    """Phonemize a reference text word by word with the espeak 'en-us' backend.

    Args:
        reference_text (str): Text to convert (here: the ASR transcript).

    Returns:
        tuple: ``(lexicon, ref_words)``. ``ref_words`` is the list of
        lower-cased words left after punctuation handling; ``lexicon`` is a
        list of ``(word, phonemes)`` pairs in the same order.
    """
    # Run the listed punctuation marks through phonemizer's Punctuation.remove.
    cleaned = Punctuation(';:,.!"?()').remove(reference_text)

    # Tokenize on single spaces; empty tokens (from repeated spaces) are dropped.
    ref_words = []
    for token in cleaned.strip().split(' '):
        if token:
            ref_words.append(token.lower())

    # espeak backend for US English
    espeak = EspeakBackend('en-us')

    # phone='' : no separator string is inserted between the phones of a word
    phone_separator = Separator(phone='', word=None)

    # phonemize() takes a list and returns a list, so each word is wrapped in a
    # one-element list and the single result is unpacked again.
    lexicon = []
    for word in ref_words:
        phonemized = espeak.phonemize([word], separator=phone_separator, strip=True)
        lexicon.append((word, phonemized[0]))

    return lexicon, ref_words

In [None]:
# Phonemize the transcript and join the per-word phoneme strings with single
# spaces; those spaces are what the word-splitting cell later uses to locate
# word boundaries.
lexicon, ref_words = generate_reference_phoneme(transcript)
reference_phoneme =' '.join([phon for w, phon in lexicon])
reference_phoneme

In [None]:
from sequence_align.pairwise import hirschberg, needleman_wunsch
# seq_a: the reference phoneme string, passed as-is (its spaces between words
#        are kept and take part in the alignment).
# seq_b: the recognised phonemes as a list of single characters with all
#        spaces removed.
seq_a = reference_phoneme
seq_b = list(phoneme.replace(' ',''))

# recorded_phoneme['text']
# Global alignment of the two sequences; "_" is the symbol used for gaps.
# NOTE(review): assumes "_" never occurs in either phoneme sequence — confirm.
aligned_seq_a, aligned_seq_b = needleman_wunsch(
    seq_a,
    seq_b,
    match_score=1.0,
    mismatch_score=-1.0,
    indel_score=-1.0,
    gap="_",
)
# Join the aligned symbols back into strings for slicing and printing.
aligned_reference_seq = ''.join(aligned_seq_a)
aligned_recorded_seq = ''.join(aligned_seq_b)

print('Reference Text: ', transcript)
print('Reference Phoneme:',aligned_reference_seq)
print('Recorded Phoneme: ', aligned_recorded_seq)

In [None]:
import re

def find_word_start_positions(reference_sequence):
    """Return the index at which each whitespace-delimited word starts.

    The offsets are read directly from the string, so they stay correct when
    the sequence has leading whitespace or several whitespace characters in a
    row (for example when a word phonemized to an empty string). For input
    whose words are separated by exactly one space and that has no leading
    whitespace, the result is the same as counting ``len(word) + 1`` per word.

    Args:
        reference_sequence (str): Sequence whose words are separated by
            whitespace.

    Returns:
        list[int]: Start index of every word, in order; empty for an empty or
        whitespace-only sequence.
    """
    # `re` is imported at the top of this cell.
    return [match.start() for match in re.finditer(r'\S+', reference_sequence)]

def split_recorded_sequence(recorded_sequence, start_positions):
    """Cut a sequence into consecutive pieces at the given start indices.

    Piece ``k`` runs from ``start_positions[k]`` up to (not including)
    ``start_positions[k + 1]``; the last piece runs to the end of the sequence.

    Args:
        recorded_sequence (str): Sequence to cut.
        start_positions (list[int]): Start index of each piece.

    Returns:
        list[str]: One slice per start position (empty list if there are none).
    """
    # Each piece ends where the next one begins; the final piece ends at the
    # end of the sequence.
    end_positions = list(start_positions[1:]) + [len(recorded_sequence)]
    return [
        recorded_sequence[begin:finish]
        for begin, finish in zip(start_positions, end_positions)
    ]
    
# Shape of an aligned recorded sequence, for orientation:
# "aɪ_hoːp_ðeɪ_hɛv_maɪ_fiːv__rədbrænd_aɪl_biː_bæk_su_n__tʊ_pliːz_w_iːdfoː__miː_"

# Word boundaries come from the aligned reference; both aligned sequences are
# then cut at those same indices.
ref_start_positions = find_word_start_positions(''.join(aligned_reference_seq))

# Recorded side: cut, then drop one trailing space or gap marker per piece.
rec_split_words = [
    re.sub('( |\\_)$', '', piece)
    for piece in split_recorded_sequence(''.join(aligned_recorded_seq), ref_start_positions)
]

# Reference side: same cut, same trailing-character clean-up.
ref_split_words = [
    re.sub('(\\_| )$', '', piece)
    for piece in split_recorded_sequence(''.join(aligned_reference_seq), ref_start_positions)
]

# (word, reference phonemes, recorded phonemes) triples; zip stops at the
# shortest of the three lists.
word_comparision_list = [
    (word, ref_piece, rec_piece)
    for word, ref_piece, rec_piece in zip(ref_words, ref_split_words, rec_split_words)
]
word_comparision_list

In [None]:

def _highlight_first_mismatch(ref_w, rec_w):
    """Colour the recorded phonemes: green if equal to the reference, otherwise
    with the first differing character in red."""
    if ref_w == rec_w:
        return f"\033[92m{rec_w}\033[0m"  # Green color

    # First position where the two strings differ. If one string is a prefix of
    # the other, the difference starts right after the shared prefix.
    mismatch_index = next(
        (i for i, (c1, c2) in enumerate(zip(ref_w, rec_w)) if c1 != c2),
        min(len(ref_w), len(rec_w)),
    )
    if mismatch_index >= len(rec_w):
        # The recording stops early (or is empty): there is no character to
        # single out, so mark the whole piece red instead of indexing past it.
        return f"\033[91m{rec_w}\033[0m"
    return "{}\033[91m{}\033[0m{}".format(
        rec_w[:mismatch_index], rec_w[mismatch_index], rec_w[mismatch_index + 1:]
    )


# Column widths are measured on the visible text. Padding has to happen before
# the ANSI colour codes are added, otherwise the escape characters are counted
# as part of the width and the columns do not line up.
# default=0 keeps the cell from failing when there are no words to show.
max_length = max((len(w) for w, _, _ in word_comparision_list), default=0)
ref_width = max((len(ref_w) for _, ref_w, _ in word_comparision_list), default=0)

for w, ref_w, rec_w in word_comparision_list:
    word = f"\033[1m{w.ljust(max_length)}\033[0m"  # bold word column
    rec_string = _highlight_first_mismatch(ref_w, rec_w)

    print(word, ref_w.ljust(ref_width), rec_string)

In [None]:
%%capture
# The cell magic above must stay on the first line; it hides this cell's output.
import os
# NOTE(review): earlier cells already imported torch and called
# torch.cuda.is_available(); setting CUDA_VISIBLE_DEVICES this late may have no
# effect on the running kernel — confirm, or move it to the top of the notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# NOTE(review): the version is not pinned.
!pip install unsloth


In [None]:
from unsloth import FastLanguageModel
import torch


# Reference list of Unsloth checkpoints. Nothing in this cell reads it: the
# model actually loaded below is "Qwen/Qwen3-1.7B".
fourbit_models = [
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit", # Qwen3 checkpoints, names ending in bnb-4bit
    "unsloth/Qwen3-4B-unsloth-bnb-4bit",
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",
    "unsloth/Qwen3-32B-unsloth-bnb-4bit",

    # Other model families (not every name below carries a 4bit suffix)
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/Phi-4",
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit" # a TTS model
] # More models at https://huggingface.co/unsloth

# Load the LLM used for feedback generation.
# NOTE: this rebinds `model` and `tokenizer`, which earlier cells bound to the
# Whisper objects.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen3-1.7B",
    max_seq_length = 10000,   # Context length - can be longer, but uses more memory
    load_in_4bit = False,     # 4-bit loading is turned off here
    load_in_8bit = False,    # 8-bit loading is turned off here
    full_finetuning = False, # full fine-tuning is turned off here
    # token = "hf_...",      # use one if using gated models
)

In [None]:
# Band descriptors (bands 1-9) for three speaking criteria. The lists are
# interpolated verbatim into the LLM prompt by analyze_pronunciation_with_qwen,
# so words that had been fused together ("languageunless",
# "inappropriaciesand", "inappropriaciesoccur") are written out properly here.
irubic = {
    "Pronunciation": [
        "Band 1 : Can produce occasional individual words and phonemes that are recognisable, but no overall meaning is conveyed. Unintelligible.",
        "Band 2 : Uses few acceptable phonological features (possibly because sample is insufficient). Overall problems with delivery impair attempts at connected speech. Individual words and phonemes are mainly mispronounced and little meaning is conveyed. Often unintelligible.",
        "Band 3 : Displays some features of band 2, and some, but not all, of the positive features of band 4.",
        "Band 4 : Uses some acceptable phonological features, but the range is limited. Produces some acceptable chunking, but there are frequent lapses in overall rhythm. Attempts to use intonation and stress, but control is limited. Individual words or phonemes are frequently mispronounced, causing lack of clarity. Understanding requires some effort and there may be patches of speech that cannot be understood.",
        "Band 5 : Displays all the positive features of band 4, and some, but not all, of the positive features of band 6.",
        "Band 6 : Uses a range of phonological features, but control is variable. Chunking is generally appropriate, but rhythm may be affected by a lack of stress-timing and/or a rapid speech rate. Some effective use of intonation and stress, but this is not sustained. Individual words or phonemes may be mispronounced but this causes only occasional lack of clarity. Can generally be understood throughout without much effort.",
        "Band 7 : Displays all the positive features of band 6, and some, but not all, of the positive features of band 8.",
        "Band 8 : Uses a wide range of phonological features to convey precise and/or subtle meaning. Can sustain appropriate rhythm. Flexible use of stress and intonation across long utterances, despite occasional lapses. Can be easily understood throughout. Accent has minimal effect on intelligibility.",
        "Band 9 : Uses a full range of phonological features to convey precise and/or subtle meaning. Flexible use of features of connected speech is sustained throughout. Can be effortlessly understood throughout. Accent has no effect on intelligibility.",
    ],
    "Grammatical range and accuracy": [
        "Band 1 : No rateable language unless memorised.",
        "Band 2 : No evidence of basic sentence forms.",
        "Band 3 : Basic sentence forms are attempted but grammatical errors are numerous except in apparently memorised utterances.",
        "Band 4 : Can produce basic sentence forms and some short utterances are error-free. Subordinate clauses are rare and, overall, turns are short, structures are repetitive and errors are frequent.",
        "Band 5 : Basic sentence forms are fairly well controlled for accuracy. Complex structures are attempted but these are limited in range, nearly always contain errors and may lead to the need for reformulation.",
        "Band 6 : Produces a mix of short and complex sentence forms and a variety of structures with limited flexibility. Though errors frequently occur in complex structures, these rarely impede communication.",
        "Band 7 : A range of structures flexibly used. Error-free sentences are frequent. Both simple and complex sentences are used effectively despite some errors. A few basic errors persist.",
        "Band 8 : Wide range of structures, flexibly used. The majority of sentences are error free. Occasional inappropriacies and non-systematic errors occur. A few basic errors may persist.",
        "Band 9 : Structures are precise and accurate at all times, apart from ‘mistakes’ characteristic of native speaker speech.",
    ],
    "Lexical resource": [
        "Band 1 : No resource bar a few isolated words. No communication possible.",
        "Band 2 : Very limited resource. Utterances consist of isolated words or memorised utterances. Little communication possible without the support of mime or gesture.",
        "Band 3 : Resource limited to simple vocabulary used primarily to convey personal information. Vocabulary inadequate for unfamiliar topics.",
        "Band 4 : Resource sufficient for familiar topics but only basic meaning can be conveyed on unfamiliar topics. Frequent inappropriacies and errors in word choice. Rarely attempts paraphrase.",
        "Band 5 : Resource sufficient to discuss familiar and unfamiliar topics but there is limited flexibility. Attempts paraphrase but not always with success.",
        "Band 6 : Resource sufficient to discuss topics at length. Vocabulary use may be inappropriate but meaning is clear. Generally able to paraphrase successfully.",
        "Band 7 : Resource flexibly used to discuss a variety of topics. Some ability to use less common and idiomatic items and an awareness of style and collocation is evident though inappropriacies occur. Effective use of paraphrase as required.",
        "Band 8 : Wide resource, readily and flexibly used to discuss all topics and convey precise meaning. Skilful use of less common and idiomatic items despite occasional inaccuracies in word choice and collocation. Effective use of paraphrase as required.",
        "Band 9 : Total flexibility and precise use in all contexts. Sustained use of accurate and idiomatic language.",
    ],
}


In [None]:
# Automated Qwen 3 Pronunciation Error Analysis for Each Word
import re
from transformers import TextStreamer


def analyze_pronunciation_with_qwen( transcript, alignments,rubic, model, tokenizer, max_new_tokens=32):
  """
  Ask the LLM for IELTS-style speaking feedback and stream the answer to stdout.

  One prompt is built from the transcript, the per-word phoneme comparison and
  the band descriptors, wrapped in the tokenizer's chat template (thinking
  disabled) and passed to ``model.generate``. The generated text is printed
  token by token through a ``TextStreamer``; nothing is returned.

  Args:
      transcript (str): Transcribed text of the recording.
      alignments (list): ``(word, expected phonemes, observed phonemes)``
          triples; their string form is inserted into the prompt.
      rubic (dict): Band descriptors with the keys ``"Pronunciation"``,
          ``"Grammatical range and accuracy"`` and ``"Lexical resource"``.
      model: Generative model exposing ``generate`` and ``device``.
      tokenizer: Matching tokenizer exposing ``apply_chat_template``.
      max_new_tokens (int): Upper bound on the number of generated tokens.

  Returns:
      None
  """
  # The prompt wording is kept verbatim; only the phoneme list now comes from
  # the `alignments` argument instead of a notebook-level global.
  prompt = f"""
  You are a strict speaking evaluator for the IELTS exam. Given the following information:
  - Transcript: {transcript}
  - Phonemes : (word, expected phoneme, observed phoneme) : {alignments}
  give feedback on the pronounciation, Grammatical range and accuracy, and Lexical resource of the speaker, for each of the 3 cretiria output only a sentece of feedback, your output should be in the form of :
  Pronounciation : feedback (i.e you have a problem pronouncing X like in Y)
  Grammatical range and accuracy : feedback (do not mention anything about pronounciation here)
  Lexical resource of the speaker : feedback (do not mention anything about pronounciation here)
  Follow the IELTS speaking descriptors to give the speaker a very accurate feedback
  Pronounciation rubic is {rubic["Pronunciation"]}
  Grammatical range and accuracy is {rubic["Grammatical range and accuracy"]}
  Lexical resource is {rubic["Lexical resource"]}
         """
  messages = [{
      "role": "user",
      "content": prompt
  }]
  text = tokenizer.apply_chat_template(
      messages,
      tokenize=False,
      add_generation_prompt=True,
      enable_thinking=False,
  )
  print(f"\n---\nAnalyzing words:")
  # Put the inputs on whatever device the model lives on rather than assuming
  # "cuda", so the function also works for a CPU-loaded model.
  model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
  _ = model.generate(
      **model_inputs,
      max_new_tokens=max_new_tokens,
      temperature=0.7, top_p=0.8, top_k=20,
      streamer=TextStreamer(tokenizer, skip_prompt=True),
  )
# Example usage (replace with your actual variables):
# Runs the feedback prompt on the transcript and word comparison built above;
# the answer is streamed to the cell output.
# NOTE(review): max_new_tokens=10000 equals the max_seq_length the model was
# loaded with, so prompt tokens plus new tokens can exceed that length —
# confirm the intended budget.
analyze_pronunciation_with_qwen(
  transcript=transcript,
  alignments=word_comparision_list,
  rubic=irubic,
  model=model,
  tokenizer=tokenizer,
  max_new_tokens=10000 )

In [None]:
#torch.cuda.empty_cache()

In [None]:
#del model

In [None]:
!sudo apt-get install portaudio19-dev python3-dev

In [None]:
# NOTE(review): pyaudio is also pip-installed near the top of the notebook;
# this repeat runs after the PortAudio headers from the previous cell are
# available. No cell visible in this notebook imports pyaudio — confirm whether
# either install is still needed.
!pip install pyaudio
