# de.toml
# Abbreviation expansions, written as [abbreviation, expansion] pairs.
# They are applied to the text before it is split into sentences.
replacements = [
    ["z.B.", "zum Beispiel"],
    ["z. B.", "zum Beispiel"],
    ["ca.", "circa"],
    ["bzw.", "beziehungsweise"],
    ["gem.", "gemäß"],
    ["sog.", "sogenannt"],
    ["Art.", "Artikel"],
    ["Abs.", "Absatz"],
    ["geb.", "geboren"],
    ["allg.", "allgemein"],
    ["bes.", "besonders"],
    ["bez.", "bezüglich"],
    ["eigtl.", "eigentlich"],
    ["gegr.", "gegründet"],
    ["ugs.", "umgangssprachlich"],
    ["urspr.", "ursprünglich"],
    ["usw.", "und so weiter"],
    ["zz.", "zurzeit"],
    ["ggf.", "gegebenenfalls"],
    ["bspw.", "beispielsweise"],
    ["mind.", "mindestens"],
    ["evtl.", "eventuell"],
    ["bzgl.", "bezüglich"],
    ["Nr.", "Nummer"],
]
# Sentence segmenter selection.
segmenter = "python"

# Length and word-count limits. Exact units and semantics are defined by the
# consuming tool — TODO confirm against its documentation.
min_trimmed_length = 3
min_characters = 2
min_word_count = 2
max_word_count = 14

# Constraints on how a sentence may start.
quote_start_with_letter = true
needs_letter_start = true
needs_uppercase_start = true

# Constraints on how a sentence may end.
may_end_with_colon = false
needs_punctuation_end = false
# Regex character class listing the permitted symbols.
# NOTE(review): "," and ";" are not part of this class even though
# broken_whitespace lists " ," and " ;" — confirm this is intended.
allowed_symbols_regex = "[\u0020A-Za-zäöüßÄÖÜ\"„“‚‘’–\\.?!()]"
disallowed_symbols = []

# Whitespace sequences regarded as broken (presumably a sentence containing
# one of them is rejected — TODO confirm). The first entry is a DOUBLE space:
# a single space would match every multi-word sentence, which contradicts
# min_word_count = 2.
broken_whitespace = ["  ", " ,", " .", " ?", " !", " ;"]

# Presumably symbols that must occur an even number of times — TODO confirm.
even_symbols = ['"']

# Pairs of [opening, closing] symbols.
matching_symbols = [
    ["„", "“"],
    ["(", ")"],
]
# Abbreviation patterns (regular expressions, written as TOML literal strings
# so that backslashes need no doubling):
# - capitals with optional dots in between, e.g. A.B, Z.B., BÄM, or SMS
#   (abbreviations with lowercase letters are taken care of further down)
# - lowercase letters directly followed by a capital, e.g. AktG
#   (usually abbreviations of law text names)
abbreviation_patterns = [
    '[A-ZÄÖÜ]+\.*[A-ZÄÖÜ]',
    '[a-zäöü]+[A-ZÄÖÜ]',
]
# Other patterns (regular expressions, written as TOML literal strings so that
# backslashes need no doubling). In the order listed below:
# - Jahrhundert and similar nouns at the beginning of the sentence (catches
#   wrongly split sentences such as "Im 3. Jahrhundert begann ...")
# - Months at the beginning of the sentence (usually wrongly split because of
#   the date number before them, e.g. "3. März")
#   NOTE(review): "August" is not listed — possibly deliberate because it is
#   also a given name; confirm.
# - A sentence delimiter can only be at the end of a sentence. This also takes
#   care of abbreviations.
# - No words with only one letter (" a.", " a", " a ", "a ", " ä")
# - Mixed upper/lowercase in words (LaSi - mostly chemical elements?)
# - Abbreviations which are not easily replaced or detected (" hl." at the end)
# NOTE(review): an earlier comment also mentioned "Geburtstag and Titel which
# are usually followed by the number", but no pattern below contains those
# words — confirm whether a pattern is missing or the remark is stale.
#
# Inside a character class "|" is a literal pipe, not alternation, so the
# delimiter classes are written as [.?!]. The dot after "hl" is escaped so
# that it matches only a literal period instead of any character.
other_patterns = [
    '^(Jahrhundert|Liga|Bundesliga|Klasse|Platz|Grades|Runde|Division|Rang)',
    '^(Januar|Februar|März|April|Mai|Juni|Juli|September|Oktober|November|Dezember)',
    '[.?!].+$',
    '(\s[A-ZÄÖÜa-zäöü][.?!]*$)|(^[A-ZÄÖÜa-zäöü]\s)|\s[A-ZÄÖÜa-zäöü]\s',
    '[a-zäöü][A-ZÄÖÜ][a-zäöü]',
    '\shl\.$',
]