In [1]:
import time
from typing import Union
import warnings
import requests
from more_itertools import divide

# Endpoint of the CAB web service hosted by Deutsches Textarchiv. All query
# parameters are fixed in the URL (clean=1, qname=q, a=default, fmt=raw); the
# `file` parameter is the URL-encoded placeholder path C:\fakepath\test.txt.
CAB_URL = 'https://www.deutschestextarchiv.de/public/cab/query?clean=1&qname=q&a=default&fmt=raw&file=C%3A%5Cfakepath%5Ctest.txt'
# HTTP headers sent with every request: the request body is declared as plain text.
# NOTE(review): the chained assignment also binds a module-level name `headers`
# to the same dict — looks unintentional; confirm nothing relies on it.
CAB_HEADERS = headers = {'Content-Type': 'text/plain'}

def cab(text: str, delay: Union[float, None] = None) -> Union[str, None]:
    """
    Query the CAB web service provided by Deutsches Textarchiv for orthographic normalisation.

    If you use this function repeatedly (e.g. in a loop), please pass ``delay``
    to avoid overloading the server. If a text is one megabyte (1,000,000
    UTF-8 bytes) or larger, it is split on spaces into roughly equal parts
    which are sent to the service one after another.

    Parameters
    ----------
    text : str
        The text to normalise.
    delay : float or None
        Seconds to sleep once before the first request is sent. ``None``
        (the default) disables the pause.

    Returns
    -------
    str or None
        The stripped responses of all parts joined by single spaces, or
        ``None`` if any request came back with a status code other than 200
        (a warning is emitted in that case).
    """

    if delay is not None:
        time.sleep(delay)

    # Number of whole megabytes in the UTF-8 encoded payload; 0 means the
    # text fits into a single request.
    n_megabytes = len(text.encode('utf-8')) // 1000000

    if n_megabytes >= 1:
        # Split on spaces so that words are never cut in half, then
        # distribute the words evenly over n_megabytes + 1 parts.
        parts = [' '.join(words)
                 for words in divide(n=n_megabytes + 1, iterable=text.split(' '))]
    else:
        parts = [text]

    normed_parts = []
    for part in parts:
        r = requests.post(url=CAB_URL, headers=CAB_HEADERS,
                          data=part.encode('UTF-8'))

        if r.status_code != 200:
            # requests.Response has no `body` attribute; the response payload
            # is exposed as `text`. Using `r.body` here raised AttributeError
            # instead of producing the intended warning.
            warnings.warn(
                f'Request returned with error code {r.status_code}\nError Message\n{r.text}')
            return None

        normed_parts.append(r.text.strip())

    return ' '.join(normed_parts)

In [2]:
import pandas as pd
# Read the metadata table from meta_drama.csv (UTF-8) and display its first rows.
meta = pd.read_csv("meta_drama.csv", encoding = "utf8")
meta.head()

Unnamed: 0.1,Unnamed: 0,id,title,author,period,type,genre,date,file,source,annotation,tokens_cleaned
0,0,1,Der sterbende Cato,Gottsched,Aufklaerung,Tragedy,drama,1731,gottsched-der-sterbende-cato.xml,https://github.com/dracor-org/gerdracor,,22047
1,1,2,Ein Deutsches Vorspiel,Neuber,Aufklaerung,,drama,1734,neuber-ein-deutsches-vorspiel.xml,https://github.com/dracor-org/gerdracor,,6480
2,2,3,Die Pietisterey im Fischbein-Rocke oder Die Do...,Gottsched,Aufklaerung,Comedy,drama,1736,gottschedin-die-pietisterey-im-fischbein-rocke...,https://github.com/dracor-org/gerdracor,,27691
3,3,4,Die von der Weisheit wider die Unwissenheit be...,Neuber,Aufklaerung,,drama,1736,neuber-die-beschuetzte-schauspielkunst.xml,https://github.com/dracor-org/gerdracor,,7377
4,4,5,Die Verehrung der Vollkommenheit durch die geb...,Neuber,Aufklaerung,,drama,1737,neuber-die-verehrung-der-vollkommenheit.xml,https://github.com/dracor-org/gerdracor,,7091


In [11]:
from os import listdir

def load_corpus(path):
    """
    Read every file in a directory as UTF-8 text.

    Parameters
    ----------
    path : str
        Directory containing the corpus files. A trailing slash is allowed.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(texts, filenames)``: the file contents and the matching file
        names, both in alphabetical file-name order, so that index ``i``
        refers to the same document in both lists.
    """
    # Local import keeps the function usable regardless of which notebook
    # cells have been executed.
    import os

    texts = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # indices stable across runs (index-based resuming relies on this).
    for filename in sorted(os.listdir(path)):
        # os.path.join avoids a doubled separator when `path` ends in "/".
        full_path = os.path.join(path, filename)
        if not os.path.isfile(full_path):
            # Skip subdirectories (e.g. Jupyter's .ipynb_checkpoints), which
            # cannot be opened as text files.
            continue
        # The context manager closes the file; no explicit close() is needed.
        with open(full_path, 'r', encoding="utf8") as f:
            texts.append(f.read())
        filenames.append(filename)
    return texts, filenames

In [13]:
# Load all corpus files; texts[i] is the content of the file named filenames[i].
texts, filenames = load_corpus("corpora/corpus_lyrik_authors/")

In [14]:
# Presumably the minimum runtime in minutes of the normalisation run: one
# 3-second delay per text (cf. delay=3.0 in the loop), divided by 60.
len(texts)*3/60

6.7

In [15]:
# Number of texts that were loaded.
len(texts)

134

In [None]:
# Index of the first text to process. The cell originally skipped index 0
# (presumably already normalised in an earlier run — adjust when resuming).
start_index = 1

for i in range(start_index, len(texts)):
    print(i)
    normalized_text = cab(texts[i], delay=3.0)
    if normalized_text is None:
        # cab() returns None when the service answers with a non-200 status.
        # Skip before opening the output file: otherwise the file would be
        # truncated and f.write(None) would raise a TypeError.
        print(f"skipped {filenames[i]}: normalisation failed")
        continue
    # NOTE(review): the output is written as UTF-16 although the input is read
    # as UTF-8 — confirm downstream readers expect this.
    with open("corpora/raw_normalized/corpus_lyrik/" + filenames[i], "w", encoding="utf-16") as f:
        f.write(normalized_text)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
