# Preprocessing

## normalize texts

- use CAB (the orthographic normalisation web service of Deutsches Textarchiv)

## split corpus into sentences

- use pysbd
- save these as .csv if (and only if) possible, otherwise use .jsonl

## create pattern for Prodigy

- use se-list from Berenike and Giulia

In [1]:
import pandas as pd

In [2]:
# Load the corpus metadata table (semicolon-separated, UTF-8 encoded).
df = pd.read_csv("metadata.csv", sep=";", encoding="utf8")

### normalize texts

In [3]:
import time
from typing import Union
import warnings
import requests
from more_itertools import divide

# Endpoint of the CAB web service; all query parameters (clean, qname, a, fmt,
# file) are fixed in this URL and are not changed per request.
CAB_URL = 'https://www.deutschestextarchiv.de/public/cab/query?clean=1&qname=q&a=default1.1800-1900&fmt=raw&file=C%3A%5Cfakepath%5Ctest.txt'
# HTTP headers sent with every CAB request (the body is posted as plain text).
# NOTE: the chained assignment also binds the same dict to the name `headers`.
CAB_HEADERS = headers = {'Content-Type': 'text/plain'}

def cab(text: str, delay: Union[float, None] = None,
        timeout: Union[float, None] = None) -> Union[str, None]:
    """
    Query the CAB web service of Deutsches Textarchiv for orthographic normalisation.

    If you call this function repeatedly (e.g. in a loop), please pass a
    ``delay`` to avoid overloading the server.

    If the UTF-8 encoded text is one megabyte or larger, it is split at single
    spaces into parts with (almost) equal word counts; the parts are sent to
    the service one after another and the responses are re-joined with a
    single space. The split is based on word counts, so an individual part is
    only approximately bounded in size.

    Args:
        text: The text to normalise.
        delay: Seconds to sleep before the first request; ``None`` disables
            the pause.
        timeout: Timeout in seconds handed to ``requests.post`` for each
            request; ``None`` (the default) waits indefinitely.

    Returns:
        The normalised text, or ``None`` if any request did not return
        HTTP status 200 (a warning is issued in that case).
    """

    if delay is not None:
        time.sleep(delay)

    n_megabytes = len(text.encode('utf-8')) // 1000000

    if n_megabytes >= 1:
        # One part more than the number of full megabytes, so that each part
        # ends up at roughly one megabyte or less.
        parts = [' '.join(part)
                 for part in divide(n=n_megabytes + 1, iterable=text.split(' '))]
    else:
        parts = [text]

    normed_parts = []
    for part in parts:
        r = requests.post(url=CAB_URL, headers=CAB_HEADERS,
                          data=part.encode('UTF-8'), timeout=timeout)

        if r.status_code != 200:
            # requests.Response has no `body` attribute; the payload of the
            # error reply is available as `text`.
            warnings.warn(
                f'Request returned with error code {r.status_code}\nError Message\n{r.text}')
            return None

        normed_parts.append(r.text.strip())

    return ' '.join(normed_parts)

In [14]:
# To-Do: 
# - Mouse-Wiggler
# look here: https://stackoverflow.com/questions/1181464/controlling-mouse-with-python

from tqdm import tqdm
import os

# Files named in the metadata table that have no counterpart in the
# cleaned-corpus directory yet.
corpus_files = list(df["filename"])
cleaned_files = os.listdir("src/corpus_cleaned/")

files_to_CAB = list(set(corpus_files).difference(cleaned_files))

# Iterate over the files that still need normalisation. A file that already
# exists in src/corpus_cleaned/ (even an empty or truncated one) is skipped;
# delete it there to have it processed again.
for file in tqdm(files_to_CAB):
    with open("src/corpus/" + file, "r", encoding="utf8") as f:
        text = f.read()

    try:
        # Pause 3 seconds before each request to go easy on the server.
        cleaned_text = cab(text, 3)

        # cab() returns None when the service answered with an error. Skip the
        # file *before* opening the target so that no empty file is created.
        if cleaned_text is None:
            continue

        with open("src/corpus_cleaned/" + file, "w", encoding="utf8") as f:
            f.write(cleaned_text)
    except Exception as err:
        # Best effort: report the failure and go on with the next file.
        # KeyboardInterrupt is deliberately not caught, so the run can be stopped.
        warnings.warn(f"Normalisation of {file} failed: {err!r}")
        continue

    # Record every successfully normalised file.
    with open("log.txt", "a", encoding="utf8") as f:
        f.write(file + "\n")

100%|██████████| 1/1 [01:06<00:00, 66.80s/it]


In [15]:
# Collect cleaned files that contain fewer than five whitespace-separated
# tokens; these are treated as broken.
broken_files = []

for cleaned_name in cleaned_files:
    with open("src/corpus_cleaned/" + cleaned_name, "r", encoding="utf8") as handle:
        token_count = len(handle.read().split())

    if token_count >= 5:
        continue
    broken_files.append(cleaned_name)

len(broken_files)

0

In [16]:
# Show the names of the cleaned files flagged as broken (notebook cell output).
broken_files

[]

### split into sentences

In [41]:
# Start a fresh annotation file that contains only the column header.
header = "sentence\n"

with open("annotation_data.tsv", mode="w", encoding="utf8") as tsv_file:
    tsv_file.write(header)

In [12]:
import pysbd
import win32api
import time
import math
import ctypes
from tqdm import tqdm
import os



# Initialise the sentence segmenter.
seg = pysbd.Segmenter(language="de",  # language is German
                      clean=True)  # cleans unnecessary punctuation, which reduces file size (a little). Since we only look at tokens, this is ok

# NOTE(review): the hard-coded offset 503 presumably resumes an interrupted
# run -- confirm it is still correct before re-running this cell.
for file in tqdm(cleaned_files[503:]):
    with open("src/corpus_cleaned/" + file, "r", encoding="utf8") as f:
        text = f.read()

    doc_sentences = seg.segment(text)

    # Open the output once per document rather than once per sentence;
    # the appended content (one sentence per line) is unchanged.
    with open("annotation_data.tsv", "a", encoding="utf8") as f:
        f.writelines(sentence + "\n" for sentence in doc_sentences)

 12%|█▏        | 170/1405 [14:53:19<108:09:43, 315.29s/it]


KeyboardInterrupt: 

In [16]:
# sentences[5:10]

In [None]:
# Put the sentences into a DataFrame with a single "text" column.
# NOTE(review): this rebinds `df`, which held the metadata table loaded at the
# top of the notebook, and `sentences` is not defined in any visible cell --
# confirm where it is supposed to come from before running this.
df=pd.DataFrame()
df["text"]=sentences

In [None]:
# Display the sentence DataFrame (notebook cell output).
df

In [None]:
# Write the DataFrame as tab-separated UTF-8 text (the file keeps the .csv name).
df.to_csv(path_or_buf="annotation_data.csv", encoding="utf8", sep="\t")

In [3]:
# Load the spatial-entity list from the Excel workbook.
se_df = pd.read_excel(io="all_entities_int_EN.xlsx")

In [4]:
# Preview the first rows of the spatial-entity table (notebook cell output).
se_df.head()

Unnamed: 0,category,word,type,Austria,Italy,Germany,France,Switzerland,type_grouped
0,furniture,Abe,interior,,,,,,INTERIOR
1,furniture,Abort,interior,,,1.0,,,INTERIOR
2,architecture,Abstellkammer,interior,,,,,,INTERIOR
3,architecture,Arbeitszimmer,interior,,,,,,INTERIOR
4,architecture,Attika,interior,,,,,,INTERIOR


In [5]:
# Count how many entries each category contains (notebook cell output).
se_df["category"].value_counts()

category
village         124812
mountain         25561
stream_lake      13908
city              5487
forest            2553
valley             532
nat_terms          531
rural              271
urban              258
furniture           74
architecture        48
Name: count, dtype: int64

In [15]:
# Build one token pattern per spatial entity; entries of the category
# "furniture" are excluded.
# Rows without a word are skipped as well: str(NaN).lower() would otherwise
# yield the bogus pattern 'nan', and a later dropna() on the pattern frame
# cannot remove it because every cell there holds a (non-null) list.
is_pattern_word = (se_df["category"] != "furniture") & se_df["word"].notna()

spatial_entities = []

for entity in se_df.loc[is_pattern_word, "word"]:
    # Each dict in a pattern describes exactly one token, so an entity that
    # consists of several whitespace-separated words needs one dict per word.
    # NOTE(review): hyphenated names stay a single dict; whether they match
    # depends on how the tokenizer treats hyphens -- verify.
    tokens = str(entity).lower().split()
    if tokens:
        spatial_entities.append([{"lower": token} for token in tokens])

In [16]:
# Every pattern gets the same label: one "SpatialEntity" per entry.
label = ["SpatialEntity" for _ in spatial_entities]

In [17]:
# Preview the first five patterns (notebook cell output).
spatial_entities[:5]

[[{'lower': 'abstellkammer'}],
 [{'lower': 'arbeitszimmer'}],
 [{'lower': 'attika'}],
 [{'lower': 'bad'}],
 [{'lower': 'badezimmer'}]]

In [18]:
# Combine labels and patterns into a two-column frame (label first, then pattern).
patterns_df = pd.DataFrame().assign(
    label=label,
    pattern=spatial_entities,
)

In [19]:
# Display the pattern table (notebook cell output).
patterns_df

Unnamed: 0,label,pattern
0,SpatialEntity,[{'lower': 'abstellkammer'}]
1,SpatialEntity,[{'lower': 'arbeitszimmer'}]
2,SpatialEntity,[{'lower': 'attika'}]
3,SpatialEntity,[{'lower': 'bad'}]
4,SpatialEntity,[{'lower': 'badezimmer'}]
...,...,...
173972,SpatialEntity,[{'lower': 'niderbauen-chulm'}]
173973,SpatialEntity,[{'lower': 'kronberg'}]
173974,SpatialEntity,[{'lower': 'farneren'}]
173975,SpatialEntity,[{'lower': 'schwändiblueme'}]


In [20]:
# Export the patterns as JSON Lines: one {"label": ..., "pattern": ...}
# record per line, after dropping rows with missing values.
patterns_df.dropna().to_json(
    path_or_buf="pattern.jsonl",
    lines=True,
    orient="records",
)