In [2]:
"""
Script to annotate plain text with spacy for annotation. 

Script written by Christof Schöch (Trier), January 2023, adapted by Agnes Hilger (Würzburg), Februar 2023.
"""


# === Imports ===

#== Basics
#== Basics
import os
import random
import re
from os.path import join
import numpy as np
import glob

#== Data
import pandas as pd
import seaborn as sns

# Linguistic annotation
import spacy
import de_core_news_sm

In [3]:
# === Global variables ===

# Local data
workdir = "/Users/agneshilger/hardseeds_eltec/"
textfolder = "/Users/agneshilger/hardseeds_eltec/data/ELTeC-deu/txt_files_deu/*.txt"
annotatedfolder = join("/", workdir, "data", "annotated_ELTeC-deu", "")


# === Functions === 


def read_textfile(file): 
    """
    Reads a plain text file from the text repository. 
    Returns: string (containing the complete plain text). 
    """
    with open(file, "r", encoding="utf8") as infile: 
        text = infile.read()
    text = re.sub("\t", "", text)
    #text = text[0:2000] # for testing
    return text


def annotate_text(text, nlp): 
    """
    Annotates the text using the spacy NLP model activated initially.
    Returns: list (list of tokens with annotation according to spacy data model)
    """
    nlp.max_length = len(text) + 1000
    annotated = nlp(text)
    #print([(w.text, w.pos_, w.lemma_, w.tag_, w.dep_, w.morph) for w in annotated[0:25]])
    return annotated


def save_annotated(basename, annotated): 
    serialized = [t.text + "\t" + t.pos_ + "\t" + t.lemma_ + "\t" + t.dep_ + "\t" + str(t.morph) for t in annotated if t.pos_ != "SPACE" and t.text != " "]
    serialized = "\n".join(serialized)
    serialized = "wordform\tpos\tlemma\tdep\tmorph\n" + serialized
    annotatedfile = join(annotatedfolder, basename + ".csv")
    with open(annotatedfile, "w", encoding="utf8") as outfile: 
        outfile.write(serialized)


# === Coordination function === 

def main():
    """
    Coordinates the process. 
    Loads nlp model from spacy. 
    Then loops over each text to annotate, and saves annotation to disk. 
    """
    nlp = spacy.load("de_core_news_sm", disable=["ner"])
    spacy.prefer_gpu()
    progress = 0
    plain_files = [os.path.basename(file).split(".")[0] for file in glob.glob(join(textfolder))]
    already_annotated = [os.path.basename(file).split(".")[0] for file in glob.glob(join(annotatedfolder, "*.csv"))]
    print("Total plain files:", len(plain_files), "| already annotated:", len(already_annotated))
    for file in glob.glob(textfolder): 
        basename, ext = os.path.basename(file).split(".")
        if basename not in already_annotated: 
            text = read_textfile(file)
            annotated = annotate_text(text, nlp)
            save_annotated(basename, annotated)
            print(progress, basename, str(len(re.split("\W+", text))) + ": done.")
        else: 
            print(progress, basename, ": already annotated.")
        progress +=1

In [None]:
main()

Total plain files: 100 | already annotated: 74
0 DEU027 : already annotated.
1 DEU033 : already annotated.
2 DEU032 : already annotated.
3 DEU026 : already annotated.
4 DEU030 : already annotated.
5 DEU024 : already annotated.
6 DEU018 : already annotated.
7 DEU019 : already annotated.
8 DEU025 : already annotated.
9 DEU031 : already annotated.
10 DEU009 : already annotated.
11 DEU035 : already annotated.
12 DEU021 : already annotated.
13 DEU020 : already annotated.
14 DEU034 : already annotated.
15 DEU008 : already annotated.
16 DEU022 : already annotated.
17 DEU036 : already annotated.
18 DEU037 : already annotated.
19 DEU023 : already annotated.
20 DEU044 : already annotated.
21 DEU050 : already annotated.
22 DEU078 : already annotated.
23 DEU087 : already annotated.
24 DEU093 : already annotated.
25 DEU092 : already annotated.
26 DEU086 : already annotated.
27 DEU079 : already annotated.
28 DEU051 : already annotated.
29 DEU045 : already annotated.
30 DEU053 : already annotated.
31

In [63]:
textfolder = "/Users/agneshilger/innerlife-main/data/*.txt"
annotatedfolder = "/Users/agneshilger/innerlife-main/data/annotated/"

In [64]:
main()

Total plain files: 1 | already annotated: 0
0 k00200001663 104190: done.
