In [1]:
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Read the dataset from the project-relative CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/data.csv")
df.head(n=5)

Unnamed: 0,text,llm_corrected_text
0,Die Männerrunde am Brunnen\n\nHerr Jakob steht...,Die Männerrunde am Brunnen Herr Jakob steht am...
1,Herr Jakob guckt den Männern zu wie sie\nDie S...,Herr Jakob guckt den Männern zu wie sie die Sc...
2,Die Männerrunde am Brunnen \nHerr Jakob beobac...,Die Männerrunde am Brunnen Herr Jakob beobacht...
3,Die Männerrunden am Brunnen\n\nHerr Jakob beob...,Die Männerrunden am Brunnen. Herr Jakob beobac...
4,Herr Jakob\n\nHerr jakob beobadete die Boote.\...,Herr Jakob Herr Jakob beobachtete die Boote. D...


# Clean Text

In [7]:
import re

def clean_text(text: str) -> str:
    """Normalise a raw text before it is analysed.

    Every character that is not a word character, whitespace, one of
    ``. , ! ? -`` or a German letter (``äöüÄÖÜß``) is deleted, which drops
    emojis and most symbols. Whitespace is collapsed only *after* that
    deletion: removing a symbol that stood between two spaces (or at either
    end of the text) would otherwise leave a double space or a stray
    leading/trailing blank behind.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text, with whitespace runs (including newlines) reduced
        to single spaces and no surrounding whitespace.
    """
    text = re.sub(r'[^\w\s.,!?äöüÄÖÜß-]', '', text)  # Remove non-alphabetic/emojis except German chars
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs and trim the ends
    return text

# Spelling Correction

In [5]:
# OpenAI client pointed at the DeepSeek endpoint; the key is read from the
# environment (a missing DEEPSEEK_API_KEY raises KeyError here).
client = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=os.environ["DEEPSEEK_API_KEY"],
)

In [6]:
def correct_spelling(text):
    """Ask the chat model to fix the spelling mistakes in ``text``.

    The prompt asks for spelling corrections only: other kinds of errors
    should be left in place and no formatting should be applied.

    Args:
        text: The text to send for correction.

    Returns:
        The message content of the first choice in the (non-streamed) response.
    """
    prompt = f"Return the below text after spelling correction. Leave the rest of errors as it is. Do not do any formatting.\n\n{text}"
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt}],
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
# corrected_text = []
# for i in df.text:
#     text_ = clean_text(i)
#     text_ = correct_spelling(text_)
#     corrected_text.append(text_)
# df['llm_corrected_text'] = corrected_text
# df.to_csv("data/data.csv", index=False)

# Count Verbs

In [3]:
import spacy
from spacy import displacy
from spacy.tokens import Span
from IPython.display import display, HTML
# spaCy German pipeline used for the POS / tense analysis.
# Install it once beforehand with: python -m spacy download de_dep_news_trf
SPACY_MODEL = "de_dep_news_trf"
nlp = spacy.load(SPACY_MODEL)

In [6]:
# for label in nlp.get_pipe("tagger").labels:
#     print(label, " -- ", spacy.explain(label))

In [4]:
def count_verbs(text):
    """Count verbs by tense and mark each one as an entity span.

    Every token whose coarse POS tag is ``VERB`` or ``AUX`` is counted. The
    ``'VERB'`` entry of the counter is the total number of such tokens, while
    ``'PRES'`` and ``'PAST'`` count the subset whose morphology contains
    ``Tense=Pres`` or ``Tense=Past``. Each counted token also gets a
    one-token span labelled ``PRES``, ``PAST`` or, when neither tense
    matched, ``VERB``.

    Args:
        text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        A ``(verb_counter, doc)`` tuple: the counter dict and the parsed doc
        whose ``ents`` have been replaced by the verb spans.
    """
    doc = nlp(text)
    verb_counter = dict.fromkeys(('PRES', 'PAST', 'VERB'), 0)
    tense_labels = (("Tense=Pres", "PRES"), ("Tense=Past", "PAST"))
    verb_spans = []

    for tok in doc:
        if tok.pos_ not in ("VERB", "AUX"):
            continue

        verb_counter['VERB'] += 1  # running total, regardless of tense

        # The first matching tense feature wins; anything else keeps the generic label.
        label = "VERB"
        for feature, tense in tense_labels:
            if feature in tok.morph:
                label = tense
                verb_counter[tense] += 1
                break

        verb_spans.append(Span(doc, tok.i, tok.i + 1, label=label))

    # Overwrite the doc's entities with the verb spans.
    doc.ents = verb_spans
    return verb_counter, doc

In [5]:
# Visualization options: one (label, gradient start, gradient end) entry per span label.
label_gradients = [
    ("PRES", "#9cffac", "#49e85d"),  # green
    ("PAST", "#ff9c9c", "#e84949"),  # red
    ("VERB", "#ffd799", "#ffa500"),  # orange
]
options = {
    "ents": [label for label, _, _ in label_gradients],
    "colors": {
        label: f"linear-gradient(90deg, {start}, {end})"
        for label, start, end in label_gradients
    },
}

## Write output to HTML

In [15]:
# Build a standalone HTML report: for every row, the verb counts and the
# highlighted verbs of the original text and of its spelling-corrected version.
# The charset is declared explicitly because the file is written as UTF-8 and
# contains umlauts; without it a browser may guess the wrong encoding.
html_parts = ["""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Language Evaluation Results</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .section { margin: 30px 0; border-bottom: 1px solid #ccc; padding-bottom: 20px; }
        .text-index { font-size: 24px; font-weight: bold; color: #333; }
        .counts { font-size: 16px; margin: 10px 0; }
        .original { color: #d32f2f; }
        .corrected { color: #388e3c; }
    </style>
</head>
<body>
    <h1>Language Evaluation Results</h1>
"""]

# Address the two columns by name rather than by position in `df.values`, so an
# extra leading column in the CSV (e.g. a saved index) cannot shift which text
# is analysed.
for ind, (original_text, llm_text) in enumerate(zip(df["text"], df["llm_corrected_text"])):
    # Only the original is cleaned here; the corrected column is used as stored.
    ori_count, ori_doc = count_verbs(clean_text(original_text))
    corr_count, corr_doc = count_verbs(llm_text)

    html_parts.append('<div class="section">')
    html_parts.append(f'<div class="text-index">Text: {ind + 1}</div>')

    # Both versions are rendered the same way; only CSS class, caption and data differ.
    for css_class, caption, section_count, section_doc in (
        ("original", "Original", ori_count, ori_doc),
        ("corrected", "Spellings Corrected(LLM)", corr_count, corr_doc),
    ):
        html_parts.append(f'<div class="counts {css_class}">{caption}: {section_count}</div>')
        # page=False / jupyter=False: return just the markup fragment as a string.
        html_parts.append(
            displacy.render(section_doc, style="ent", options=options, page=False, minify=True, jupyter=False)
        )

    html_parts.append('</div>')

html_parts.append("""
</body>
</html>
""")
# Combine all HTML parts and save to file
complete_html = '\n'.join(html_parts)
with open('data/language_evaluation_results.html', 'w', encoding='utf-8') as f:
    f.write(complete_html)


## Original Text

# Take the text at row position 2 and normalise it; the bare name shows the result.
text = clean_text(df["text"].iloc[2])
text

# Count its verbs, then show the text inline with the verb spans highlighted.
verb_counter, doc = count_verbs(text)
print("Verb count: " + str(verb_counter))
html = displacy.render(
    doc,
    style="ent",
    options=options,
    page=True,
    jupyter=False,
)
display(HTML(html))

## After Spelling Correction

# Send the cleaned text through the spelling-correction model (live API call).
spell_corrected_text = correct_spelling(text)

# Repeat the verb count and highlighting on the corrected version.
verb_counter, doc = count_verbs(spell_corrected_text)
print("Verb count: " + str(verb_counter))
html = displacy.render(
    doc,
    style="ent",
    options=options,
    page=True,
    jupyter=False,
)
display(HTML(html))