In [2]:
from pathlib import Path
import re

import pandas as pd
import numpy as np

from compgen2 import Gov, Matcher, const, Preprocessing

In [3]:
# %load_ext autoreload
# %autoreload 2

# Showcase Preprocessing

#### Kreiere Testset mit typischen Fehlern

In [4]:
# Directory that holds the input data, relative to the notebook's working directory
data_root = Path("..") / "data"

In [5]:
# Load the Verlustliste (casualty list) from its parquet file under data_root
vl_path = data_root / const.FILENAME_VL
vl = pd.read_parquet(vl_path)

In [6]:
# Build a small test set from the Verlustliste: for each regex below, draw a
# fixed number of rows whose `location` matches it, then shuffle the result.
#
# The patterns are raw strings so that regex escapes such as `\.` and `\[`
# reach the regex engine as written; in a normal string literal they are
# invalid Python escape sequences (a SyntaxWarning on recent Python versions).
SAMPLE_SEED = 10  # one seed for every draw and for the final shuffle

error_patterns = [
    (r"[A-Za-zäöüÄÖÜßẞ]+\.", 12),  # one or more letters followed by a literal dot
    (r"kr\.", 3),                   # literal "kr."
    (r"[\[{(]", 5),                 # an opening bracket: "[", "{" or "("
    # NOTE(review): the dot is NOT escaped here, so this matches " verm"
    # followed by any character (e.g. " vermutl."). Kept exactly as before so
    # the sampled rows do not change - confirm this is intended.
    (r" verm.", 2),
    (r" nicht", 3),                 # " nicht" (with leading space)
    (r"[#^]", 2),                   # a "#" or "^" character
]

testset = (
    pd.concat(
        [
            vl[vl.location.str.contains(pattern)].sample(n=n_rows, random_state=SAMPLE_SEED)
            for pattern, n_rows in error_patterns
        ]
    )
    # only the location strings are needed for the showcase
    .drop(["loc_parts_count", "loc_count"], axis=1)
    # shuffle so the error classes are interleaved, then renumber the rows
    .sample(frac=1, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
testset

Unnamed: 0,location
0,"Ostrowine, Oels, Schles."
1,"Wald b. Neuburg, Schwaben"
2,"Rüstenhart, Gebweiler (Els.)"
3,"Obkaß, vermutl."
4,"Gr. Grävendorf, Merseburg"
5,"Bühl, Ob. Els."
6,"Uhlbach, O.-A. Cannstatt"
7,"Wettin, Sallkr."
8,"Pietenhofen, nicht Putenkofen"
9,"Friedrichroda, Mansf. Gebgskr."


### *Sonderzeichen*: 1. Entfernen der Korrekturen

Function replace_corrections_vl() for removing historical corrections '()[]' and modern-day corrections '{}' and variants
1. brackets including their content 
2. the word 'nicht' plus related content
3. the word 'korrigiert' and its variants plus related content
4. the word 'vermutlich' and its variants plus related content

In [7]:
# Step 1: run replace_corrections_vl on the raw `location` strings and keep the
# result in a new column `replace_corrs` next to the input.
testset = testset.assign(
    replace_corrs=Preprocessing.replace_corrections_vl(testset["location"])
)
testset

Unnamed: 0,location,replace_corrs
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles."
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler"
3,"Obkaß, vermutl.",Obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els."
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr."
8,"Pietenhofen, nicht Putenkofen",Pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr."


### *Sonderzeichen*: 2. Entfernen weiterer Sonderzeichen

Function for removing special characters: 
1. simply removed: ?^_"#*\:{}()[]!
2. replaced with ': ´`

In [8]:
# Step 2: run replace_characters_vl on the output of the previous step
# (`replace_corrs`) and store it as column `replace_chars`.
testset = testset.assign(
    replace_chars=Preprocessing.replace_characters_vl(testset["replace_corrs"])
)
testset

Unnamed: 0,location,replace_corrs,replace_chars
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles."
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler","Rüstenhart, Gebweiler"
3,"Obkaß, vermutl.",Obkaß,Obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els.","Bühl, Ob. Els."
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr.","Wettin, Sallkr."
8,"Pietenhofen, nicht Putenkofen",Pietenhofen,Pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr."


### *Abkürzungen*: 1. Ersetzen allgemeiner Wortendungen

Function no 1. for substituting abbreviations:
    
Flexibly substitutes abbreviations that are part of a longer word (e.g. Seekr./Gebirgskr. -> kreis)

In [9]:
# Step 3: run substitute_partial_words on `replace_chars`; the function is also
# given data_root. The result is stored as column `subst_partial`.
testset = testset.assign(
    subst_partial=Preprocessing.substitute_partial_words(testset.replace_chars, data_root)
)
testset

Unnamed: 0,location,replace_corrs,replace_chars,subst_partial
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","ostrowine, oels, schles."
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","wald b. neuburg, schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler","Rüstenhart, Gebweiler","rüstenhart, gebweiler"
3,"Obkaß, vermutl.",Obkaß,Obkaß,obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","gr. grävendorf, merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els.","Bühl, Ob. Els.","bühl, ob. els."
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","uhlbach, o.-a. cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr.","Wettin, Sallkr.","wettin, sallkreis"
8,"Pietenhofen, nicht Putenkofen",Pietenhofen,Pietenhofen,pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","friedrichroda, mansf. gebgskreis"


### *Abkürzungen*: 2. Löschen von nicht benötigten Typeninformationen

Function no 2. for substituting abbreviations: 
   
Removes unnecessary abbreviations and extra words that relate to types (e.g. Kr., Kreis, Amtshauptmannschaft)

In [10]:
# Step 4: run substitute_delete_words on `subst_partial` (again passing
# data_root) and store the result as column `subst_delete`.
testset = testset.assign(
    subst_delete=Preprocessing.substitute_delete_words(testset.subst_partial, data_root)
)
testset

Unnamed: 0,location,replace_corrs,replace_chars,subst_partial,subst_delete
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","ostrowine, oels, schles.","ostrowine, oels, schles."
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler","Rüstenhart, Gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler"
3,"Obkaß, vermutl.",Obkaß,Obkaß,obkaß,obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","gr. grävendorf, merseburg","gr. grävendorf, merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els.","Bühl, Ob. Els.","bühl, ob. els.","bühl, ob. els."
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","uhlbach, o.-a. cannstatt","uhlbach, cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr.","Wettin, Sallkr.","wettin, sallkreis","wettin, sallkreis"
8,"Pietenhofen, nicht Putenkofen",Pietenhofen,Pietenhofen,pietenhofen,pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","friedrichroda, mansf. gebgskreis","friedrichroda, mansf. gebgskreis"


### *Abkürzungen*: 3. Spezifische Ersetzungen

Function no 3. for substituting abbreviations:
    
Substitutes specific abbreviations

In [11]:
# Step 5: run substitute_full_words on `subst_delete` (again passing data_root)
# and store the result as column `subst_full`.
testset = testset.assign(
    subst_full=Preprocessing.substitute_full_words(testset.subst_delete, data_root)
)
testset

Unnamed: 0,location,replace_corrs,replace_chars,subst_partial,subst_delete,subst_full
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","ostrowine, oels, schles.","ostrowine, oels, schles.","ostrowine, oels, schlesien"
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler","Rüstenhart, Gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler"
3,"Obkaß, vermutl.",Obkaß,Obkaß,obkaß,obkaß,obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","gr. grävendorf, merseburg","gr. grävendorf, merseburg","groß grävendorf, merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els.","Bühl, Ob. Els.","bühl, ob. els.","bühl, ob. els.","bühl, ober elsaß"
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","uhlbach, o.-a. cannstatt","uhlbach, cannstatt","uhlbach, cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr.","Wettin, Sallkr.","wettin, sallkreis","wettin, sallkreis","wettin, sallkreis"
8,"Pietenhofen, nicht Putenkofen",Pietenhofen,Pietenhofen,pietenhofen,pietenhofen,pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","friedrichroda, mansf. gebgskreis","friedrichroda, mansf. gebgskreis","friedrichroda, mansfelder gebgskreis"


### Zusätzlich optionale Funktion: " i." (für in) durch Komma ersetzen 

(Achtung: n+1 Bestandteile)

In [12]:
# Optional step: replace every " i." (space, "i", literal dot) in `subst_full`
# with a comma and store the result as column `subst_i`.
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
testset['subst_i'] = testset.subst_full.replace(to_replace=r" i\.", value=",", regex=True)
testset

Unnamed: 0,location,replace_corrs,replace_chars,subst_partial,subst_delete,subst_full,subst_i
0,"Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","Ostrowine, Oels, Schles.","ostrowine, oels, schles.","ostrowine, oels, schles.","ostrowine, oels, schlesien","ostrowine, oels, schlesien"
1,"Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","Wald b. Neuburg, Schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben","wald b. neuburg, schwaben"
2,"Rüstenhart, Gebweiler (Els.)","Rüstenhart, Gebweiler","Rüstenhart, Gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler","rüstenhart, gebweiler"
3,"Obkaß, vermutl.",Obkaß,Obkaß,obkaß,obkaß,obkaß,obkaß
4,"Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","Gr. Grävendorf, Merseburg","gr. grävendorf, merseburg","gr. grävendorf, merseburg","groß grävendorf, merseburg","groß grävendorf, merseburg"
5,"Bühl, Ob. Els.","Bühl, Ob. Els.","Bühl, Ob. Els.","bühl, ob. els.","bühl, ob. els.","bühl, ober elsaß","bühl, ober elsaß"
6,"Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","Uhlbach, O.-A. Cannstatt","uhlbach, o.-a. cannstatt","uhlbach, cannstatt","uhlbach, cannstatt","uhlbach, cannstatt"
7,"Wettin, Sallkr.","Wettin, Sallkr.","Wettin, Sallkr.","wettin, sallkreis","wettin, sallkreis","wettin, sallkreis","wettin, sallkreis"
8,"Pietenhofen, nicht Putenkofen",Pietenhofen,Pietenhofen,pietenhofen,pietenhofen,pietenhofen,pietenhofen
9,"Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","Friedrichroda, Mansf. Gebgskr.","friedrichroda, mansf. gebgskreis","friedrichroda, mansf. gebgskreis","friedrichroda, mansfelder gebgskreis","friedrichroda, mansfelder gebgskreis"
