In [None]:
%pip install simalign



In [None]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and inst

In [None]:
import json
import pandas as pd
from simalign import SentenceAligner
import spacy
from transformers import MarianMTModel, MarianTokenizer

# === Load SpaCy models ===
nlp_de = spacy.load("de_core_news_sm")  # German lemmatizer
nlp_en = spacy.load("en_core_web_sm")   # English lemmatizer

# === Load dictionary JSON (German → English mappings) ===
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/german_english.json"

with open(path, "r", encoding="utf-8") as f:
    vocab_dict = json.load(f)


# === MarianMT Models ===
de_en_model = "Helsinki-NLP/opus-mt-de-en"
en_de_model = "Helsinki-NLP/opus-mt-en-de"

tokenizer_de_en = MarianTokenizer.from_pretrained(de_en_model)
translator_de_en = MarianMTModel.from_pretrained(de_en_model)

tokenizer_en_de = MarianTokenizer.from_pretrained(en_de_model)
translator_en_de = MarianMTModel.from_pretrained(en_de_model)

# === SimAlign ===
aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

# === Utility functions ===
def normalize_word(word, lang="de"):
    """Lemmatize word for dictionary lookup"""
    nlp = nlp_de if lang == "de" else nlp_en
    doc = nlp(word)
    return doc[0].lemma_

def get_literal_and_root(word, lang="de"):
    """Return lemma + dictionary meaning"""
    lemma = normalize_word(word, lang)
    if lang == "de":
        literal = vocab_dict.get(lemma, "N/A")  # German → English
    else:
        # reverse lookup for English → German
        reverse_dict = {v: k for k, v in vocab_dict.items()}
        literal = reverse_dict.get(lemma, "N/A")
    return lemma, literal

def translate_sentence(sentence, src_lang="de"):
    """Translate using MarianMT"""
    if src_lang == "de":
        inputs = tokenizer_de_en(sentence, return_tensors="pt", padding=True, truncation=True)
        translated = translator_de_en.generate(**inputs)
        return tokenizer_de_en.decode(translated[0], skip_special_tokens=True)
    else:
        inputs = tokenizer_en_de(sentence, return_tensors="pt", padding=True, truncation=True)
        translated = translator_en_de.generate(**inputs)
        return tokenizer_en_de.decode(translated[0], skip_special_tokens=True)

def build_vocab_chart(src_sentence_str, src_lang="de"):
    """Build vocab chart for either German→English or English→German"""
    nlp = nlp_de if src_lang == "de" else nlp_en
    src_sentence = [token.text for token in nlp(src_sentence_str)]

    trg_sentence_str = translate_sentence(src_sentence_str, src_lang=src_lang)
    trg_sentence = trg_sentence_str.split()

    alignments = aligner.get_word_aligns(src_sentence, trg_sentence)
    aligned_pairs = alignments["itermax"]

    vocab_chart = []
    seen = []
    for src_idx, trg_idx in aligned_pairs:
        src_word = src_sentence[src_idx]
        if not src_word.isalpha() or src_word in seen:
            continue

        context_meaning = trg_sentence[trg_idx]
        root_word, literal_meaning = get_literal_and_root(src_word, lang=src_lang)

        vocab_chart.append({
            f"{'German' if src_lang=='de' else 'English'} Word": src_word,
            "Context Meaning": context_meaning,
            "Root Word": root_word,
            "Literal Meaning": literal_meaning
        })
        seen.append(src_word)

    return src_sentence_str, trg_sentence_str, pd.DataFrame(vocab_chart)


# === Example German → English ===
src_de = "Frau Meier geht heute in den Supermarkt."
german, english, df_de_en = build_vocab_chart(src_de, src_lang="de")

print("=== DE → EN ===")
print("German:  ", german)
print("English: ", english)
print(df_de_en.to_string(index=False))

# === Example English → German ===
#src_en = "Mrs. Meier is going to the supermarket today."
english, german, df_en_de = build_vocab_chart(english, src_lang="en")

print("\n=== EN → DE ===")
print("English: ", english)
print("German:  ", german)
print(df_en_de.to_string(index=False))


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/298M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

2025-10-23 14:27:45,625 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


=== DE → EN ===
German:   Frau Meier geht heute in den Supermarkt.
English:  Mrs. Meier is going to the supermarket today.
German Word Context Meaning  Root Word Literal Meaning
       Frau            Mrs.       Frau    woman (irr.)
      Meier           Meier      Meier             N/A
       geht              is      gehen       go (irr.)
      heute          today.      heute           today
         in              to         in              in
        den             the        der             the
 Supermarkt     supermarket Supermarkt     supermarket

=== EN → DE ===
English:  Mrs. Meier is going to the supermarket today.
German:   Mrs. Meier geht heute in den Supermarkt.
English Word Context Meaning   Root Word Literal Meaning
       Meier           Meier       meier             N/A
          is            geht          be             N/A
          to              in          to              zu
         the             den         the             die
 supermarket     Supermarkt.

In [13]:
import spacy
import pandas as pd

# Load German spaCy model
nlp = spacy.load("de_core_news_sm")

# Expanded mapping dictionary for abbreviations → full forms
morph_map = {
    "Nom": "Nominative (Subject)",
    "Acc": "Accusative (Direct Object)",
    "Dat": "Dative (Indirect Object)",
    "Gen": "Genitive (Possessive)",
    "Sing": "Singular",
    "Plur": "Plural",
    "Masc": "Masculine",
    "Fem": "Feminine",
    "Neut": "Neuter",
    "1": "1st Person",
    "2": "2nd Person",
    "3": "3rd Person",
    "Ind": "Indicative",
    "Subj": "Subjunctive",
    "Imp": "Imperative",
    "Pres": "Present",
    "Past": "Past",
    "Fut": "Future",
    "Prs": "Personal Pronoun",
    "Rel": "Relative Pronoun",
    "Dem": "Demonstrative Pronoun",
    "Int": "Interrogative Pronoun",
    "Art": "Article",
    "Neg": "Negative Pronoun",
    "Fin": "Finite Verb",
    "Inf": "Infinitive",
    "Part": "Participle",
    "Sup": "Supine"
}

def expand_morphology(morph_dict):
    """Expand morphological abbreviations into full forms"""
    expanded = []
    for k, v in morph_dict.items():
        expanded.append(f"{k}: {morph_map.get(v, v)}")
    return ", ".join(expanded)

def pos_morphology_chart(sentence):
    """Generate POS + Morphology chart for a German sentence"""
    doc = nlp(sentence)
    rows = []
    for token in doc:
        if token.is_alpha:  # skip punctuation
            morph = token.morph.to_dict()
            expanded_morph = expand_morphology(morph)
            rows.append({
                "Word": token.text,
                "Lemma": token.lemma_,
                "POS": token.pos_,
                "Morphology": expanded_morph if expanded_morph else "—"
            })
    return pd.DataFrame(rows)

# Example

df = pos_morphology_chart(src_de)

print(df.to_string(index=False))


      Word      Lemma   POS                                                                                              Morphology
      Frau       Frau  NOUN                                          Case: Nominative (Subject), Gender: Feminine, Number: Singular
     Meier      Meier PROPN                                          Case: Nominative (Subject), Gender: Feminine, Number: Singular
      geht      gehen  VERB           Mood: Indicative, Number: Singular, Person: 3rd Person, Tense: Present, VerbForm: Finite Verb
     heute      heute   ADV                                                                                                       —
        in         in   ADP                                                                                                       —
       den        der   DET Case: Accusative (Direct Object), Definite: Def, Gender: Masculine, Number: Singular, PronType: Article
Supermarkt Supermarkt  NOUN                                   Case: Accusati

In [14]:
pip install phonemizer epitran gTTS


Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting epitran
  Downloading epitran-1.34.0-py3-none-any.whl.metadata (36 kB)
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting panphon>=0.20 (from epitran)
  Downloading panphon-0.22.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting jamo (from epitran)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting un

In [19]:
!pip install git+https://github.com/openai/whisper.git
!apt install ffmpeg -y


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-29fce2w3
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-29fce2w3
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=e928c9bd0249463210d96380a0692317a83cc9bb97acaace769ef5b074a48ae3
  Stored in directory: /tmp/pip-ephem-wheel-cache-ur8ad89x/wheels/c3/03/25/5e0ba78bc27a3a089f137c9f1d92fdfce16d06996c071a016c
Successfully built openai-whisper
Installing collec

In [53]:
from google.colab import files
uploaded = files.upload()


Saving Universität.m4a to Universität (2).m4a


In [55]:
import whisper

model = whisper.load_model("tiny")  # you can use "base" or "small" for higher accuracy
file_name = "Universität.m4a"  # change this to your uploaded file name

target_audio = "Universität"

result = model.transcribe(file_name, language="de")  # "en" for English
spoken_text = result["text"].strip()

print("🗣️ Recognized speech:", spoken_text)




🗣️ Recognized speech: Universität


In [56]:
from phonemizer import phonemize
import epitran
from difflib import SequenceMatcher

epi_de = epitran.Epitran("deu-Latn")
epi_en = epitran.Epitran("eng-Latn")

def get_pronunciation(word, lang="de"):
    try:
        ipa = epi_de.transliterate(word) if lang == "de" else epi_en.transliterate(word)
        if not ipa.strip():
            ipa = phonemize(
                word,
                language="de" if lang == "de" else "en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True
            )
        return ipa
    except Exception as e:
        return f"Error: {e}"

def compare_pronunciation(spoken_word, target_word, lang="de"):
    ipa_spoken = get_pronunciation(spoken_word, lang)
    ipa_target = get_pronunciation(target_word, lang)

    print(f"\n🎯 Target: {target_word} → {ipa_target}")
    print(f"🗣 You said: {spoken_word} → {ipa_spoken}")

    score = SequenceMatcher(None, ipa_spoken, ipa_target).ratio()
    print(f"✅ Pronunciation similarity: {score*100:.1f}%")

compare_pronunciation(spoken_text,target_audio, lang="de")



🎯 Target: Universität → uːniːfrziːtɛt
🗣 You said: Universität → uːniːfrziːtɛt
✅ Pronunciation similarity: 100.0%


In [33]:
!apt update
!apt install -y espeak-ng


[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 14.2 kB/129[0m                                                                               Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 116 kB/129 [0m[33m0% [Waiting for headers] [Connected to cloud.r-project.org (3.171.85.81)] [Conn[0m                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
[33m0% [Waiting for headers] [Connected to cloud.r-project.org (3.171.85.81)] [Conn[0m[33m0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.[0m                                                                               Get:4 https://cloud.r-project.org/b

In [57]:
from gtts import gTTS
from IPython.display import Audio

tts = gTTS(target_audio, lang="de")
tts.save("correct_pronunciation.mp3")
Audio("correct_pronunciation.mp3", autoplay=True)


In [58]:
from IPython.display import Audio
!espeak-ng -v de+f3 -s 100 -w output.wav "{target_audio}"
from IPython.display import Audio
Audio("output.wav", autoplay=True)
