In [41]:
import whisper
import pronouncing

# Function to transcribe audio using Whisper
def transcribe_audio(audio_file):
    model = whisper.load_model("medium")  # Model options: tiny, base, small, medium, large
    result = model.transcribe(audio_file)
    return result["text"]

# Function to convert text to phonemes using pronouncing
def text_to_phonemes(text):
    words = text.split()  # Split text into words
    phonemes = []
    
    for word in words:
        phones = pronouncing.phones_for_word(word)  # Get phonetic transcription for each word
        if phones:
            phonemes.append(phones[0])  # Append the first result if available
        else:
            phonemes.append(word)  # In case no phoneme is found, keep the original word

    return " ".join(phonemes)

# Path to your audio file
audio_file = "saq_1.mp3"  # Change this to your actual file

# Transcribe and convert to phonemes
transcription = transcribe_audio(audio_file)
phonemes = text_to_phonemes(transcription)

# Print results
print("\nTranscription:", transcription)
print("Phonetic Transcription:", phonemes)



Transcription:  A quick brown fox jumps over a lazy dog.
Phonetic Transcription: AH0 K W IH1 K B R AW1 N F AA1 K S JH AH1 M P S OW1 V ER0 AH0 L EY1 Z IY0 dog.


In [3]:
import whisper
import eng_to_ipa as ipa  
import warnings
warnings.filterwarnings('ignore')


def transcribe_audio(audio_file):
    model = whisper.load_model("medium")  
    result = model.transcribe(audio_file)
    return result["text"]


def text_to_ipa(text):
    words = text.split()  
    ipa_transcription = []
    
    
    for word in words:
        ipa_word = ipa.convert(word)  
        ipa_transcription.append(ipa_word)
    
    return " ".join(ipa_transcription)


audio_file = "saq_2.mp3"  


transcription = transcribe_audio(audio_file)
ipa_phonemes = text_to_ipa(transcription)


print("\nTranscription:", transcription)
print("IPA Phonetic Transcription:", ipa_phonemes)



Transcription:  A quick brown fox jumps over a lazy dog.
IPA Phonetic Transcription: ə kwɪk braʊn fɑks ʤəmps ˈoʊvər ə ˈleɪzi dɔg.


In [7]:
import whisper
import eng_to_ipa as ipa
import warnings
from difflib import SequenceMatcher
from Levenshtein import distance as levenshtein_distance

warnings.filterwarnings('ignore')


def transcribe_audio(audio_file):
    model = whisper.load_model("medium")  
    result = model.transcribe(audio_file)
    return result["text"]


def text_to_ipa(text):
    words = text.split()  
    ipa_transcription = [ipa.convert(word) for word in words]
    return " ".join(ipa_transcription)


def simplify_ipa(ipa_text):
    ipa_mapping = {
        "æ": "a", "ɑ": "ah", "ɒ": "aw", "ɔ": "or", "ə": "uh", "ɛ": "eh", "ɜ": "ur",
        "ɪ": "ih", "i": "ee", "ʊ": "oo", "u": "oo", "ʌ": "uh", 
        "ʒ": "zh", "ʃ": "sh", "θ": "th", "ð": "dh", "ŋ": "ng", "ɹ": "r", 
        "ʤ": "j", "ʧ": "ch", "ɡ": "g", "dʒ": "j", "tʃ": "ch",
        "aʊ": "ow", "oʊ": "oh", "eɪ": "ay", "aɪ": "eye",
        "ˈ": "", "ˌ": "", "ː": ""
    }
    
    for ipa_char, simple_char in ipa_mapping.items():
        ipa_text = ipa_text.replace(ipa_char, simple_char)

    return ipa_text


def compare_phonetics(phonetics1, phonetics2):
    ratio = SequenceMatcher(None, phonetics1, phonetics2).ratio()  
    lev_dist = levenshtein_distance(phonetics1, phonetics2)  
    
    print("\n🔍 **Phonetic Comparison Results** 🔍")
    print(f"🔹 **Similarity Ratio:** {ratio:.2%}")  
    print(f"🔹 **Levenshtein Distance:** {lev_dist} changes required")
    
    # Highlight differences
    print("\n🔹 **Differences Highlighted:**")
    for word1, word2 in zip(phonetics1.split(), phonetics2.split()):
        if word1 != word2:
            print(f"❌ {word1}  →  ✅ {word2}")


audio_file1 = "saq_1.mp3"
audio_file2 = "aaq_1.mp3"

# Transcribe and convert to IPA
transcription1 = transcribe_audio(audio_file1)
ipa_phonemes1 = text_to_ipa(transcription1)
simplified_phonetics1 = simplify_ipa(ipa_phonemes1)

transcription2 = transcribe_audio(audio_file2)
ipa_phonemes2 = text_to_ipa(transcription2)
simplified_phonetics2 = simplify_ipa(ipa_phonemes2)

# Print results
print("\n🎙 **Audio 1 Transcription:**", transcription1)
print("🔠 **IPA Phonetic Transcription:**", ipa_phonemes1)
print("📝 **Simplified Phonetic Transcription:**", simplified_phonetics1)

print("\n🎙 **Audio 2 Transcription:**", transcription2)
print("🔠 **IPA Phonetic Transcription:**", ipa_phonemes2)
print("📝 **Simplified Phonetic Transcription:**", simplified_phonetics2)


compare_phonetics(simplified_phonetics1, simplified_phonetics2)



🎙 **Audio 1 Transcription:**  A quick brown fox jumps over a lazy dog.
🔠 **IPA Phonetic Transcription:** ə kwɪk braʊn fɑks ʤəmps ˈoʊvər ə ˈleɪzi dɔg.
📝 **Simplified Phonetic Transcription:** ooh kweehk braoon fahks joohmps ooovoohr ooh leeehzee dorg.

🎙 **Audio 2 Transcription:**  The quick brown fox jumps over the lazy dog.
🔠 **IPA Phonetic Transcription:** ðə kwɪk braʊn fɑks ʤəmps ˈoʊvər ðə ˈleɪzi dɔg.
📝 **Simplified Phonetic Transcription:** dhooh kweehk braoon fahks joohmps ooovoohr dhooh leeehzee dorg.

🔍 **Phonetic Comparison Results** 🔍
🔹 **Similarity Ratio:** 96.72%
🔹 **Levenshtein Distance:** 4 changes required

🔹 **Differences Highlighted:**
❌ ooh  →  ✅ dhooh
❌ ooh  →  ✅ dhooh


In [None]:
# from speechbrain.inference.ASR import EncoderDecoderASR

# asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-en", savedir="temp")
# asr_model.transcribe_file("saq_1.mp3")

In [6]:
import whisper
import jieba
from pypinyin import pinyin, Style
import warnings
from difflib import SequenceMatcher
from Levenshtein import distance as levenshtein_distance

warnings.filterwarnings('ignore')

def transcribe_audio(audio_file):
    model = whisper.load_model("medium")
    result = model.transcribe(audio_file, language="zh")
    return result["text"]

def segment_mandarin(text):
    return " ".join(jieba.cut(text))

def text_to_pinyin(text):
    words = segment_mandarin(text)
    pinyin_transcription = pinyin(words, style=Style.TONE3, heteronym=False)
    return " ".join([syllable[0] for syllable in pinyin_transcription])

def simplify_pinyin(pinyin_text):
    pinyin_mapping = {
        "1": "1", "2": "2", "3": "3", "4": "4", "5": "",
        "ā": "a1", "á": "a2", "ǎ": "a3", "à": "a4",
        "ē": "e1", "é": "e2", "ě": "e3", "è": "e4",
        "ī": "i1", "í": "i2", "ǐ": "i3", "ì": "i4",
        "ō": "o1", "ó": "o2", "ǒ": "o3", "ò": "o4",
        "ū": "u1", "ú": "u2", "ǔ": "u3", "ù": "u4",
        "ǖ": "ü1", "ǘ": "ü2", "ǚ": "ü3", "ǜ": "ü4"
    }
    for pinyin_char, simple_char in pinyin_mapping.items():
        pinyin_text = pinyin_text.replace(pinyin_char, simple_char)
    return pinyin_text

def compare_phonetics(phonetics1, phonetics2):
    ratio = SequenceMatcher(None, phonetics1, phonetics2).ratio()
    lev_dist = levenshtein_distance(phonetics1, phonetics2)
    print("\n🔍 **Phonetic Comparison Results** 🔍")
    print(f"🔹 **Similarity Ratio:** {ratio:.2%}")
    print(f"🔹 **Levenshtein Distance:** {lev_dist} changes required")
    print("\n🔹 **Differences Highlighted:**")
    for word1, word2 in zip(phonetics1.split(), phonetics2.split()):
        if word1 != word2:
            print(f"❌ {word1}  →  ✅ {word2}")

audio_file1 = "aud_1.wav"
audio_file2 = "saq_ch.mp3"

transcription1 = transcribe_audio(audio_file1)
pinyin_phonetics1 = text_to_pinyin(transcription1)
simplified_phonetics1 = simplify_pinyin(pinyin_phonetics1)

transcription2 = transcribe_audio(audio_file2)
pinyin_phonetics2 = text_to_pinyin(transcription2)
simplified_phonetics2 = simplify_pinyin(pinyin_phonetics2)

print("\n🎙 **Audio 1 Transcription:**", transcription1)
print("🔠 **Pinyin Phonetic Transcription:**", pinyin_phonetics1)
print("📝 **Simplified Phonetic Transcription:**", simplified_phonetics1)

print("\n🎙 **Audio 2 Transcription:**", transcription2)
print("🔠 **Pinyin Phonetic Transcription:**", pinyin_phonetics2)
print("📝 **Simplified Phonetic Transcription:**", simplified_phonetics2)

compare_phonetics(simplified_phonetics1, simplified_phonetics2)



🎙 **Audio 1 Transcription:** 晚上好,今天過得怎麼樣?
🔠 **Pinyin Phonetic Transcription:** wan3 shang4   hao3  ,  jin1 tian1   guo4 de2   zen3 me yang4  ?
📝 **Simplified Phonetic Transcription:** wan3 shang4   hao3  ,  jin1 tian1   guo4 de2   zen3 me yang4  ?

🎙 **Audio 2 Transcription:** 晚上好,今天过得人美样。
🔠 **Pinyin Phonetic Transcription:** wan3 shang4   hao3  ,  jin1 tian1   guo4 de2   ren2 mei3 yang4  。
📝 **Simplified Phonetic Transcription:** wan3 shang4   hao3  ,  jin1 tian1   guo4 de2   ren2 mei3 yang4  。

🔍 **Phonetic Comparison Results** 🔍
🔹 **Similarity Ratio:** 93.75%
🔹 **Levenshtein Distance:** 5 changes required

🔹 **Differences Highlighted:**
❌ zen3  →  ✅ ren2
❌ me  →  ✅ mei3
❌ ?  →  ✅ 。
