In [1]:
import os
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Load the texts; later cells read them from the `text` column.
# NOTE(review): the recorded head() output shows an "Unnamed: 0" column, which looks
# like an index that was written into the CSV -- if so, index_col=0 would drop it.
# Confirm against the file before changing.
df = pd.read_csv("data/data.csv")
df.head()

Unnamed: 0,text
0,Die Männerrunde am Brunnen\n\nHerr Jakob steht...
1,Herr Jakob guckt den Männern zu wie sie\nDie S...
2,Die Männerrunde am Brunnen \nHerr Jakob beobac...
3,Die Männerrunden am Brunnen\n\nHerr Jakob beob...
4,Herr Jakob\n\nHerr jakob beobadete die Boote.\...


# Clean Text

In [3]:
import re

def clean_text(text: str) -> str:
    """Normalise a raw text string into a single clean line.

    Drops every character that is not a word character, whitespace, one of
    ``. , ! ? -`` or a German umlaut/eszett, then collapses each whitespace
    run (including newlines) into one space and trims both ends.

    The character filter runs before the whitespace pass so that deleting a
    symbol that stood between two spaces, or at the start/end of the text,
    cannot leave a double, leading or trailing space in the result.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, single-line text.
    """
    # Remove symbols/emojis. \w is already Unicode-aware for str patterns; the
    # German letters are listed explicitly to keep the intent visible.
    text = re.sub(r'[^\w\s.,!?äöüÄÖÜß-]', '', text)
    # Collapse whitespace last so gaps opened by the removal above are closed too.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Spelling Correction

In [None]:
# OpenAI SDK client pointed at the DeepSeek base URL. The key is read from the
# DEEPSEEK_API_KEY environment variable, so a missing variable raises KeyError here.
client = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")

In [4]:
def correct_spelling(text):
    """Return ``text`` with spelling mistakes corrected by the chat model.

    Sends a single user message to the ``deepseek-chat`` model through the
    module-level ``client`` and returns the content of the first choice. The
    prompt asks the model to fix spelling only and to leave other errors and
    the formatting untouched.

    A blank string (empty or whitespace-only) is returned unchanged without
    calling the API: there is nothing to correct, and a prompt that carries no
    text would likely produce a reply that is not a correction at all.

    Args:
        text: Text to correct.

    Returns:
        The model's reply (``message.content`` of the first choice), or
        ``text`` itself when it is a blank string.
    """
    # Only short-circuit genuine blank strings; every other input is sent as before.
    if isinstance(text, str) and not text.strip():
        return text

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "user", "content": f"Return the below text after spelling correction. Leave the rest of errors as it is. Do not do any formatting.\n\n{text}"},
        ],
        stream=False
    )
    return response.choices[0].message.content

# Count Verbs

In [5]:
import spacy
from spacy import displacy
from spacy.tokens import Span
from IPython.display import display, HTML
# Load the spaCy German pipeline; `nlp` is the shared parser that count_verbs calls.
# Make sure to install it first with: python -m spacy download de_dep_news_trf
nlp = spacy.load("de_dep_news_trf")

In [6]:
# Optional: uncomment to print every label of the tagger together with spaCy's explanation.
# for label in nlp.get_pipe("tagger").labels:
#     print(label, " -- ", spacy.explain(label))

In [7]:
def count_verbs(text):
    """Tally verbs by tense and mark each one as an entity span.

    Runs ``text`` through the module-level ``nlp`` pipeline and inspects every
    token whose coarse part-of-speech is ``VERB`` or ``AUX``.

    Args:
        text: Text to analyse.

    Returns:
        A ``(verb_counter, doc)`` tuple. ``verb_counter`` maps ``'PRES'`` and
        ``'PAST'`` to the number of verbs carrying that tense feature and
        ``'VERB'`` to the total number of verbs found, whatever their tense.
        ``doc`` is the parsed document whose ``ents`` have been replaced by one
        single-token span per verb, labelled ``PRES``, ``PAST`` or, when
        neither tense feature is present, ``VERB``.
    """
    doc = nlp(text)
    verb_counter = dict.fromkeys(('PRES', 'PAST', 'VERB'), 0)
    tense_by_feature = (("Tense=Pres", "PRES"), ("Tense=Past", "PAST"))

    verb_spans = []
    for tok in doc:
        if tok.pos_ not in ("VERB", "AUX"):
            continue
        verb_counter['VERB'] += 1  # running total, independent of tense

        # First matching tense feature wins; present is checked before past.
        tense = next(
            (tag for feature, tag in tense_by_feature if feature in tok.morph),
            None,
        )
        if tense is None:
            label = "VERB"
        else:
            verb_counter[tense] += 1
            label = tense
        verb_spans.append(Span(doc, tok.i, tok.i + 1, label=label))

    # Store the verb spans as the document's entities.
    doc.ents = verb_spans
    return verb_counter, doc

In [8]:
# displaCy options: the entity labels to render and the background used for each.
options = dict(
    ents=["PRES", "PAST", "VERB"],
    colors=dict(
        PRES="linear-gradient(90deg, #9cffac, #49e85d)",  # green
        PAST="linear-gradient(90deg, #ff9c9c, #e84949)",  # red
        VERB="linear-gradient(90deg, #ffd799, #ffa500)",  # orange
    ),
)

## Original Text

In [9]:
# Worked example: clean the text of the third row (positional index 2) and show it.
text = clean_text(df.text.iloc[2])
text

'Die Männerrunde am Brunnen Herr Jakob beobachtete die Boote, weil er sie interresant fand. Der Mann laß die Boote zu Wasser, weil er daran Spaß hatte. Die Eisenbahn fuhr durch der Rand des Brunnens, weil er Züge mochte. Die Boote segelten übers Wasser, weil sie aus Holz gemacht wurden. Herr Jakob bastelte die Schienen zusammen, weil dann könnte die Eisenbahn nicht fahren. Herr Jakob kehrte zur Brunnen zurück, weil er neue Ideen hatte. Herr Jakob baute die Schienen auf, weil er die fuhren lassen will.'

In [10]:
# Count the verbs of the cleaned original text and show them highlighted by label.
verb_counter, doc = count_verbs(text)
print(f"Verb count: {verb_counter}")
# jupyter=False: get the markup back as a string instead of letting displaCy display it,
# so it can be wrapped in HTML() and displayed explicitly on the next line.
html = displacy.render(doc, style="ent", options=options, page=True, jupyter=False)
display(HTML(html))

Verb count: {'PRES': 1, 'PAST': 13, 'VERB': 17}


## After Spelling Correction

In [11]:
# Sends the cleaned text to the chat-completions API through `client` (network request).
spell_corrected_text = correct_spelling(text)

In [12]:
# Same count and rendering for the spelling-corrected text.
# Note: this rebinds verb_counter, doc and html, replacing the values from the original text.
verb_counter, doc = count_verbs(spell_corrected_text)
print(f"Verb count: {verb_counter}")
# jupyter=False: get the markup back as a string so it can be displayed via HTML() explicitly.
html = displacy.render(doc, style="ent", options=options, page=True, jupyter=False)
display(HTML(html))

Verb count: {'PRES': 1, 'PAST': 13, 'VERB': 17}
