In [6]:
%pip install simalign

Collecting simalign
  Downloading simalign-0.4-py3-none-any.whl.metadata (6.2 kB)
Downloading simalign-0.4-py3-none-any.whl (8.1 kB)
Installing collected packages: simalign
Successfully installed simalign-0.4


In [7]:
!pip install transformers sentencepiece torch --quiet

In [8]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm


Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [10]:
import json
import pandas as pd
from simalign import SentenceAligner
import spacy
from transformers import MarianMTModel, MarianTokenizer

# === Load SpaCy models ===
nlp_de = spacy.load("de_core_news_sm")  # German lemmatizer
nlp_en = spacy.load("en_core_web_sm")   # English lemmatizer

# === Load dictionary JSON (German → English mappings) ===
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/MyDrive/german_english.json"

with open(path, "r", encoding="utf-8") as f:
    vocab_dict = json.load(f)


# === MarianMT Models ===
de_en_model = "Helsinki-NLP/opus-mt-de-en"
en_de_model = "Helsinki-NLP/opus-mt-en-de"

tokenizer_de_en = MarianTokenizer.from_pretrained(de_en_model)
translator_de_en = MarianMTModel.from_pretrained(de_en_model)

tokenizer_en_de = MarianTokenizer.from_pretrained(en_de_model)
translator_en_de = MarianMTModel.from_pretrained(en_de_model)

# === SimAlign ===
aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")

# === Utility functions ===
def normalize_word(word, lang="de"):
    """Lemmatize word for dictionary lookup"""
    nlp = nlp_de if lang == "de" else nlp_en
    doc = nlp(word)
    return doc[0].lemma_

def get_literal_and_root(word, lang="de"):
    """Return lemma + dictionary meaning"""
    lemma = normalize_word(word, lang)
    if lang == "de":
        literal = vocab_dict.get(lemma, "N/A")  # German → English
    else:
        # reverse lookup for English → German
        reverse_dict = {v: k for k, v in vocab_dict.items()}
        literal = reverse_dict.get(lemma, "N/A")
    return lemma, literal

def translate_sentence(sentence, src_lang="de"):
    """Translate using MarianMT"""
    if src_lang == "de":
        inputs = tokenizer_de_en(sentence, return_tensors="pt", padding=True, truncation=True)
        translated = translator_de_en.generate(**inputs)
        return tokenizer_de_en.decode(translated[0], skip_special_tokens=True)
    else:
        inputs = tokenizer_en_de(sentence, return_tensors="pt", padding=True, truncation=True)
        translated = translator_en_de.generate(**inputs)
        return tokenizer_en_de.decode(translated[0], skip_special_tokens=True)

def build_vocab_chart(src_sentence_str, src_lang="de"):
    """Build vocab chart for either German→English or English→German"""
    nlp = nlp_de if src_lang == "de" else nlp_en
    src_sentence = [token.text for token in nlp(src_sentence_str)]

    trg_sentence_str = translate_sentence(src_sentence_str, src_lang=src_lang)
    trg_sentence = trg_sentence_str.split()

    alignments = aligner.get_word_aligns(src_sentence, trg_sentence)
    aligned_pairs = alignments["itermax"]

    vocab_chart = []
    seen = []
    for src_idx, trg_idx in aligned_pairs:
        src_word = src_sentence[src_idx]
        if not src_word.isalpha() or src_word in seen:
            continue

        context_meaning = trg_sentence[trg_idx]
        root_word, literal_meaning = get_literal_and_root(src_word, lang=src_lang)

        vocab_chart.append({
            f"{'German' if src_lang=='de' else 'English'} Word": src_word,
            "Context Meaning": context_meaning,
            "Root Word": root_word,
            "Literal Meaning": literal_meaning
        })
        seen.append(src_word)

    return src_sentence_str, trg_sentence_str, pd.DataFrame(vocab_chart)


# === Example German → English ===
src_de = "Frau Meier geht heute in den Supermarkt."
german, english, df_de_en = build_vocab_chart(src_de, src_lang="de")

print("=== DE → EN ===")
print("German:  ", german)
print("English: ", english)
print(df_de_en.to_string(index=False))

# === Example English → German ===
#src_en = "Mrs. Meier is going to the supermarket today."
english, german, df_en_de = build_vocab_chart(english, src_lang="en")

print("\n=== EN → DE ===")
print("English: ", english)
print("German:  ", german)
print(df_en_de.to_string(index=False))


Mounted at /content/drive




config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

2025-10-24 17:41:43,081 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO:simalign.simalign:Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


=== DE → EN ===
German:   Frau Meier geht heute in den Supermarkt.
English:  Mrs. Meier is going to the supermarket today.
German Word Context Meaning  Root Word Literal Meaning
       Frau            Mrs.       Frau    woman (irr.)
      Meier           Meier      Meier             N/A
       geht              is      gehen       go (irr.)
      heute          today.      heute           today
         in              to         in              in
        den             the        der             the
 Supermarkt     supermarket Supermarkt     supermarket

=== EN → DE ===
English:  Mrs. Meier is going to the supermarket today.
German:   Mrs. Meier geht heute in den Supermarkt.
English Word Context Meaning   Root Word Literal Meaning
       Meier           Meier       meier             N/A
          is            geht          be             N/A
          to              in          to              zu
         the             den         the             die
 supermarket     Supermarkt.

In [11]:
import spacy
import pandas as pd

# Load German spaCy model
nlp = spacy.load("de_core_news_sm")

# Expanded mapping dictionary for abbreviations → full forms
morph_map = {
    "Nom": "Nominative (Subject)",
    "Acc": "Accusative (Direct Object)",
    "Dat": "Dative (Indirect Object)",
    "Gen": "Genitive (Possessive)",
    "Sing": "Singular",
    "Plur": "Plural",
    "Masc": "Masculine",
    "Fem": "Feminine",
    "Neut": "Neuter",
    "1": "1st Person",
    "2": "2nd Person",
    "3": "3rd Person",
    "Ind": "Indicative",
    "Subj": "Subjunctive",
    "Imp": "Imperative",
    "Pres": "Present",
    "Past": "Past",
    "Fut": "Future",
    "Prs": "Personal Pronoun",
    "Rel": "Relative Pronoun",
    "Dem": "Demonstrative Pronoun",
    "Int": "Interrogative Pronoun",
    "Art": "Article",
    "Neg": "Negative Pronoun",
    "Fin": "Finite Verb",
    "Inf": "Infinitive",
    "Part": "Participle",
    "Sup": "Supine"
}

def expand_morphology(morph_dict):
    """Expand morphological abbreviations into full forms"""
    expanded = []
    for k, v in morph_dict.items():
        expanded.append(f"{k}: {morph_map.get(v, v)}")
    return ", ".join(expanded)

def pos_morphology_chart(sentence):
    """Generate POS + Morphology chart for a German sentence"""
    doc = nlp(sentence)
    rows = []
    for token in doc:
        if token.is_alpha:  # skip punctuation
            morph = token.morph.to_dict()
            expanded_morph = expand_morphology(morph)
            rows.append({
                "Word": token.text,
                "Lemma": token.lemma_,
                "POS": token.pos_,
                "Morphology": expanded_morph if expanded_morph else "—"
            })
    return pd.DataFrame(rows)

# Example

df = pos_morphology_chart(src_de)

print(df.to_string(index=False))


      Word      Lemma   POS                                                                                              Morphology
      Frau       Frau  NOUN                                          Case: Nominative (Subject), Gender: Feminine, Number: Singular
     Meier      Meier PROPN                                          Case: Nominative (Subject), Gender: Feminine, Number: Singular
      geht      gehen  VERB           Mood: Indicative, Number: Singular, Person: 3rd Person, Tense: Present, VerbForm: Finite Verb
     heute      heute   ADV                                                                                                       —
        in         in   ADP                                                                                                       —
       den        der   DET Case: Accusative (Direct Object), Definite: Def, Gender: Masculine, Number: Singular, PronType: Article
Supermarkt Supermarkt  NOUN                                   Case: Accusati

In [12]:
!pip install phonemizer epitran gTTS


Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting epitran
  Downloading epitran-1.34.0-py3-none-any.whl.metadata (36 kB)
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting panphon>=0.20 (from epitran)
  Downloading panphon-0.22.2-py2.py3-none-any.whl.metadata (15 kB)
Collecting jamo (from epitran)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting un

In [13]:
!pip install git+https://github.com/openai/whisper.git
!apt install ffmpeg -y


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_np4dpgu
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_np4dpgu
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=67982cdc15d6dedbb31b159e96fc101540bc373ff3d4cc50ecec3609c73bf327
  Stored in directory: /tmp/pip-ephem-wheel-cache-v5lluzpr/wheels/c3/03/25/5e0ba78bc27a3a089f137c9f1d92fdfce16d06996c071a016c
Successfully built openai-whisper
Installing collec

In [16]:
from google.colab import files
uploaded = files.upload()


Saving Universität.m4a to Universität.m4a


In [17]:
import whisper

model = whisper.load_model("tiny")  # you can use "base" or "small" for higher accuracy
file_name = "Universität.m4a"  # change this to your uploaded file name

target_audio = "Universität"

result = model.transcribe(file_name, language="de")  # "en" for English
spoken_text = result["text"].strip()

print("🗣️ Recognized speech:", spoken_text)




🗣️ Recognized speech: Universität


In [18]:
from phonemizer import phonemize
import epitran
from difflib import SequenceMatcher

epi_de = epitran.Epitran("deu-Latn")
epi_en = epitran.Epitran("eng-Latn")

def get_pronunciation(word, lang="de"):
    try:
        ipa = epi_de.transliterate(word) if lang == "de" else epi_en.transliterate(word)
        if not ipa.strip():
            ipa = phonemize(
                word,
                language="de" if lang == "de" else "en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True
            )
        return ipa
    except Exception as e:
        return f"Error: {e}"

def compare_pronunciation(spoken_word, target_word, lang="de"):
    ipa_spoken = get_pronunciation(spoken_word, lang)
    ipa_target = get_pronunciation(target_word, lang)

    print(f"\n🎯 Target: {target_word} → {ipa_target}")
    print(f"🗣 You said: {spoken_word} → {ipa_spoken}")

    score = SequenceMatcher(None, ipa_spoken, ipa_target).ratio()
    print(f"✅ Pronunciation similarity: {score*100:.1f}%")

compare_pronunciation(spoken_text,target_audio, lang="de")



🎯 Target: Universität → uːniːfrziːtɛt
🗣 You said: Universität → uːniːfrziːtɛt
✅ Pronunciation similarity: 100.0%


In [19]:
!apt update
!apt install -y espeak-ng


Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,816 kB]
Get:13 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,594 kB]
Get:14 https://r

In [20]:
from gtts import gTTS
from IPython.display import Audio

tts = gTTS(target_audio, lang="de")
tts.save("correct_pronunciation.mp3")
Audio("correct_pronunciation.mp3", autoplay=True)


In [21]:
from IPython.display import Audio
!espeak-ng -v de+f3 -s 100 -w output.wav "{target_audio}"
from IPython.display import Audio
Audio("output.wav", autoplay=True)


In [27]:
# === German Question Generation via Translation + FLAN-T5 ===


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

from transformers import MarianMTModel, MarianTokenizer

de_en_model = "Helsinki-NLP/opus-mt-de-en"
en_de_model = "Helsinki-NLP/opus-mt-en-de"

tokenizer_de_en = MarianTokenizer.from_pretrained(de_en_model)
translator_de_en = MarianMTModel.from_pretrained(de_en_model)

tokenizer_en_de = MarianTokenizer.from_pretrained(en_de_model)
translator_en_de = MarianMTModel.from_pretrained(en_de_model)

# FLAN-T5 (question generation in English)
qg_model_name = "google/flan-t5-base"
qg_tokenizer = AutoTokenizer.from_pretrained(qg_model_name)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(qg_model_name)


# ---- Helper functions ----
def translate_de_to_en(text):
    inputs = tokenizer_de_en(text, return_tensors="pt", padding=True, truncation=True)
    outputs = translator_de_en.generate(**inputs)
    return tokenizer_de_en.decode(outputs[0], skip_special_tokens=True)

def translate_en_to_de(text):
    inputs = tokenizer_en_de(text, return_tensors="pt", padding=True, truncation=True)
    outputs = translator_en_de.generate(**inputs)
    return tokenizer_en_de.decode(outputs[0], skip_special_tokens=True)

def generate_questions_en(text, num_questions=3):
    prompt = f"Generate {num_questions} comprehension questions in English about the following text:\n{text}\nQuestions:"
    inputs = qg_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    outputs = qg_model.generate(
        **inputs,
        max_length=128,
        num_return_sequences=num_questions,
        do_sample=True,
        temperature=0.8,
        top_p=0.9
    )
    return [qg_tokenizer.decode(o, skip_special_tokens=True).strip() for o in outputs]


def generate_german_questions(paragraph, num_questions=3):
    # Translate German → English
    english_text = translate_de_to_en(paragraph)

    # Generate English questions
    english_questions = generate_questions_en(english_text, num_questions=num_questions)

    # Translate back to German
    german_questions = [translate_en_to_de(q) for q in english_questions]
    return list(zip(english_questions, german_questions))



paragraph = """
Heute ist der erste Schultag. Lena steht mit ihrer Schultüte vor der Schule. Sandra, Susanne und Paul sind auch da. Die Kinder kennen sich aus dem Kindergarten. Jetzt gehen sie in die gleiche Klasse. Sie freuen sich schon auf den Unterricht. Lena freut sich besonders auf das Rechnen. Sandra und Susanne aufs Schreiben. Und Paul? Paul sagt, er freut sich besonders auf die Pausen.
In der Klasse lernen sie ihren Lehrer, Herrn Mayer, kennen. Herr Mayer ist noch sehr jung und lustig. In der ersten Stunde lernen die Kinder das ABC-Lied. Alle singen begeistert mit.
Danach schreibt der Lehrer die ersten Buchstaben an die Tafel: A wie Affe, B wie Banane. Herr Mayer zeichnet einen Affen dazu, der einen Banane frisst. Die Kinder lachen laut. Dann läutet auch schon die Schulglocke.
Der erste Tag in der Schule ist vorbei. Vor der Schule warten die Eltern auf die Kinder. Die Kinder erzählen vom ersten Tag. Sie freuen sich schon auf morgen
"""

results = generate_german_questions(paragraph, num_questions=10)

print("*******Fragen:*******\n")
for i, (en_q, de_q) in enumerate(results, 1):
    print(f"{i}. 🇩🇪: {de_q}\n   🇬🇧: {en_q}\n")


*******Fragen:*******

1. 🇩🇪: Wie heißen die Kinder?
   🇬🇧: What are the names of the children?

2. 🇩🇪: Wie viele Schüler gibt es in der Klasse?
   🇬🇧: How many students are there in the class?

3. 🇩🇪: Was ist der erste Schultag?
   🇬🇧: What is the first day of school?

4. 🇩🇪: Wie heißt die Schule?
   🇬🇧: What is the name of the school?

5. 🇩🇪: Wie heißt der Lehrer?
   🇬🇧: What is the name of the teacher?

6. 🇩🇪: Was sind die ersten Dinge, die die Kinder in der Klasse lernen?
   🇬🇧: What are the first things the children learn in the class?

7. 🇩🇪: Wer ist Sandra?
   🇬🇧: Who is Sandra?

8. 🇩🇪: Wie heißt der Lehrer?
   🇬🇧: What is the name of the teacher?

9. 🇩🇪: Was ist Sandras Lieblingsthema?
   🇬🇧: What is Sandra's favorite subject?

10. 🇩🇪: Wie viele Kinder gibt es in der Schule?
   🇬🇧: How many children are there in the school?

