In [1]:
"""
Script to count verbs of "inner life" in collections of unannotated files. 
Uses spacy for annotation. (Optimization: use TreeTagger with PRESTO model for 18th century.)
Outputs verb count information in the same style as Lou's and/or Diana's scripts, for compatibility. 

Script written by Christof Schöch (Trier), January 2023, adapted by Agnes Hilger (Würzburg), Februar 2023.
"""


# === Imports ===

#== Basics
import os
import random
import re
from os.path import join
import numpy as np
import glob

#== Data
import pandas as pd
import seaborn as sns

# Linguistic annotation
import spacy
import de_core_news_sm

In [2]:
#== global variables

workdir = "/Users/agneshilger/innerlife-main/pipeline-deu/"
#textfolder = "/Users/agneshilger/Documents/Korpora/korpus_maba_gesaeubert_2.0/*.txt"
annotatedfolder = join("/", workdir, "data", "annotated", "", "*.csv")
results = "/Users/agneshilger/innerlife-main/pipeline-deu/results.csv"

In [16]:
# === Functions === 


def read_verbsfile(verbsfile): 
    """
    Reads the file with the list of verbs of inner life, 
    along with the different categories of verbs. 
    Returns: Three lists (lemmas, categories, combined labels)
    """
    with open(verbsfile, "r", encoding="utf8") as infile: 
        data = pd.read_csv(infile, sep=";")
    verblemmas = list(data["lemma"])
    verbcats = list(data["category"])
    verblabels = []
    for i in range(0, len(verblemmas)): 
        verblabels.append(verblemmas[i] + ":" + verbcats[i])
    #print(verblabels)
    return verblemmas, verbcats, verblabels


def read_metadatafile(metadatafile): 
    """
    Reads the metadatafile of the collection. 
    Returns: DataFrame. 
    """
    with open(metadatafile, "r", encoding="utf8") as infile: 
        metadata = pd.read_csv(infile, sep=";", index_col="filename")
    return metadata


def get_pubyear(metadata, basename): 
    """
    Extracts the year of first publication for the current text from the metadata table.
    Returns: int (year)
    """
    try: 
        pubyear = int(metadata.loc[basename, "Jahr"])
    except: 
        pubyear = metadata.loc[basename, "Jahr"]
    return pubyear


def get_authorgender(metadata, basename): 
    """
    Extracts the gender of the author.
    Returns: str (M/F/U)
    """
    authorgender = str(metadata.loc[basename, "Geschlecht"])
    return authorgender



def read_annotated(file): 
    """
    Reads an annotated text file from the folder. 
    Returns: list (each item containing one line / token). 
    """
    with open(file, "r", encoding="utf8") as infile: 
        annotated = pd.read_csv(infile, sep="\t")
    try: 
        annotated.columns = ["wordform", "pos", "lemma", "dep", "morph"]
    except: 
        annotated.columns = ["wordform", "pos", "lemma"]
    annotated = annotated.loc[:,["wordform", "pos", "lemma"]]
    return annotated


def count_verbs(annotated, verblemmas): 
    """
    Establishes the number of tokens marked as a verb in the annotated text (=allverbcounts). 
    Establishes the count of each individual verb from the list of verbs of inner life (=indverbcounts). 
    Calculates the sum of counts of verbs of inner life (=innerverbcounts). 
    Returns: int, int, dict
    """
    verbs = annotated[annotated["pos"] == "VERB"]
    #print(verbs.head(10))
    allverbscount = len(verbs)
    innerverbs = verbs[verbs["lemma"].isin(verblemmas)]
    innerverbscount = len(innerverbs)
    indverbcounts = {}
    for lemma in verblemmas: 
        indverbcounts[lemma] = len([verb for verb in list(verbs["lemma"]) if lemma in verb])
    #print(indverbcounts)
    return allverbscount, innerverbscount, indverbcounts


def save_verbcounts(data, verblabels, resultsfile): 
    """
    Saves the combined data from each text to disc. 
    Renames the columns to include the verb category for each individual verb. 
    Returns: nothing (but saves CSV to disk)
    """
    labels = ["0TextId", "year", "author-gender", "verbs", "innerVerbs"]
    labels.extend(verblabels)
    data_df = pd.DataFrame.from_dict(data, orient="columns").T
    data_df.columns = labels
    with open(resultsfile, "w", encoding="utf8") as outfile: 
        data_df.to_csv(outfile, sep=";", index=None)


# === Coordination function === 

def main(verbsfile, annotatedfolder, metafile):
    """
    Coordindates the entire process. 
    Some things need to be done only once (get metadata, get verbs). 
    Then loops over each text to get year of publication and establish verb counts.
    Verb count information is collected and saved to disk in the end. 
    """
    verblemmas, verbcats, verblabels = read_verbsfile(verbsfile)
    progress = 0
    data = {}
    
    
    metadata = read_metadatafile(metafile)
    
    for file in glob.glob(annotatedfolder): 
            basename, ext = os.path.basename(file).split(".")
            pubyear = get_pubyear(metadata, basename)
            print("Now:", progress, basename, pubyear, end=" ")
            authorgender = get_authorgender(metadata, basename)
            annotated = read_annotated(file)
            allverbcounts, innerverbcounts, indverbcounts = count_verbs(annotated, verblemmas)
            verbdata = {"0TextId" : basename, "pubyear" : pubyear, "author-gender" : authorgender, "verbs" : allverbcounts, "innerVerbs" : innerverbcounts} 
            verbdata.update(indverbcounts)
            progress +=1
            print("Done.")
            data[basename] = verbdata
    save_verbcounts(data, verblabels, results)

In [11]:
files = glob.glob(annotatedfolder)

In [9]:
metadata = read_metadatafile("metadata.csv")

In [58]:
main("innerverbs_deu.csv", annotatedfolder, "metadata.csv")

Now: 0 k00200002395 1775 Done.
Now: 1 k00100000095 1801 Done.
Now: 2 k00100000081 1889 Done.
Now: 3 k00200000743 1908 Done.
Now: 4 k00100000268 1909 Done.
Now: 5 k00200002140 1881 Done.
Now: 6 k00200000031 1871 Done.
Now: 7 k00100000254 1896 Done.
Now: 8 k00100000240 1908 Done.
Now: 9 k00200002197 1900 Done.
Now: 10 k00200003289 1882 Done.
Now: 11 k00100000297 1773 Done.
Now: 12 k00200000964 1908 Done.
Now: 13 k00200000970 1891 Done.
Now: 14 k00100000308 1781 Done.
Now: 15 k00200000145 1906 Done.
Now: 16 k00200002752 1903 Done.
Now: 17 k00100000446 1795 Done.
Now: 18 k00200003302 1900 Done.
Now: 19 k00100000320 1879 Done.
Now: 20 k00100000334 1910 Done.
Now: 21 k00200001501 1869 Done.
Now: 22 k00200002785 1917 Done.
Now: 23 k00200002791 1901 Done.
Now: 24 k00200001298 1904 Done.
Now: 25 k00200002961 1900 Done.
Now: 26 k00100000122 1812 Done.
Now: 27 k00200001065 1911 Done.
Now: 28 k00200003114 1910 Done.
Now: 29 k00200002593 1906 Done.
Now: 30 k00100000137 1819 Done.
Now: 31 k001000001

Now: 256 k00200000343 1890 Done.
Now: 257 k00100000132 1899 Done.
Now: 258 k00100000126 1868 Done.
Now: 259 k00200000182 1911 Done.
Now: 260 k00200001288 1919 Done.
Now: 261 k00200002965 1868 Done.
Now: 262 k00100000318 1867 Done.
Now: 263 k00200002742 1890 Done.
Now: 264 k00200001505 1854 Done.
Now: 265 k00100000330 1912 Done.
Now: 266 k00200001511 1920 Done.
Now: 267 k00100000324 1896 Done.
Now: 268 k00200000169 1902 Done.
Now: 269 k00100000442 1909 Done.
Now: 270 k00200002187 1912 Done.
Now: 271 k00200002805 1912 Done.
Now: 272 k00200000974 1898 Done.
Now: 273 k00100000287 1786 Done.
Now: 274 k00200000960 1915 Done.
Now: 275 k00200002811 1910 Done.
Now: 276 k00200000035 1869 Done.
Now: 277 k00100000278 1891 Done.
Now: 278 k00200000747 1838 Done.
Now: 279 k00200000009 1854 Done.
Now: 280 k00200002178 1884 Done.
Now: 281 k00100000244 1857 Done.
Now: 282 k00100000250 1885 Done.
Now: 283 k00200002391 1913 Done.
Now: 284 k00200000586 1899 Done.
Now: 285 k00100000085 1788 Done.
Now: 286 k

Now: 505 k00100000436 1766 Done.
Now: 506 k00200000082 1920 Done.
Now: 507 k00200002695 1901 Done.
Now: 508 k00200002859 1897 Done.
Now: 509 k00200000096 1915 Done.
Now: 510 k00200000900 1864 Done.
Now: 511 k00200002865 1914 Done.
Now: 512 k00200000041 1919 Done.
Now: 513 k00200000733 1897 Done.
Now: 514 k00100000218 1804 Done.
Now: 515 k00100000230 1833 Done.
Now: 516 k00100000224 1888 Done.
Now: 517 k00200001411 1879 Done.
Now: 518 k00200002454 1897 Done.
Now: 519 k00200002468 1914 Done.
Now: 520 k00200001161 1891 Done.
Now: 521 k00200000519 1920 Done.
Now: 522 k00100000026 1853 Done.
Now: 523 k00100000030 1890 Done.
Now: 524 k00200001163 1917 Done.
Now: 525 k00100000024 1846 Done.
Now: 526 k00200002318 1917 Done.
Now: 527 k00200000527 1905 Done.
Now: 528 k00200002330 1905 Done.
Now: 529 k00200000533 1789 Done.
Now: 530 k00100000018 1854 Done.
Now: 531 k00200001361 1911 Done.
Now: 532 k00200001407 1889 Done.
Now: 533 k00200001413 1890 Done.
Now: 534 k00100000226 1854 Done.
Now: 535 k

Now: 754 k00200001418 1885 Done.
Now: 755 k00100000239 1784 Done.
Now: 756 k00100000211 1886 Done.
Now: 757 k00200001424 1874 Done.
Now: 758 k00200001342 1838 Done.
Now: 759 k00200001356 1909 Done.
Now: 760 k00100000205 1809 Done.
Now: 761 k00200001430 1905 Done.
Now: 762 k00200002313 1887 Done.
Now: 763 k00200000510 1860 Done.
Now: 764 k00200002461 1812 Done.
Now: 765 k00200001140 1889 Done.
Now: 766 k00200002449 1895 Done.
Now: 767 k00200001626 1915 Done.
Now: 768 k00100000013 1840 Done.
Now: 769 k00200000538 1904 Done.
Now: 770 k00100000007 1815 Done.
Now: 771 k00200002311 1909 Done.
Now: 772 k00200000506 1889 Done.
Now: 773 k00200001618 1895 Done.
Now: 774 k00200002463 1880 Done.
Now: 775 k00100000039 1897 Done.
Now: 776 k00200001426 1885 Done.
Now: 777 k00100000213 1907 Done.
Now: 778 k00200000704 1876 Done.
Now: 779 k00200002661 1899 Done.
Now: 780 k00200002852 1907 Done.
Now: 781 k00200000937 1826 Done.
Now: 782 k00200001397 1882 Done.
Now: 783 k00200000089 1882 Done.
Now: 784 k

Now: 1006 k00200000188 1875 Done.
Now: 1007 k00200002953 1912 Done.
Now: 1008 k00200000177 1804 Done.
Now: 1009 k00200000163 1881 Done.
Now: 1010 k00100000448 1877 Done.
Now: 1011 k00200002774 1919 Done.
Now: 1012 k00100000306 1781 Done.
Now: 1013 k00100000312 1828 Done.
Now: 1014 k00200002990 1859 Done.
Now: 1015 k00100000299 1802 Done.
Now: 1016 k00200003287 1900 Done.
Now: 1017 k00200000003 1908 Done.
Now: 1018 k00200000765 1861 Done.
Now: 1019 k00200001321 1909 Done.
Now: 1020 k00200001447 1901 Done.
Now: 1021 k00100000266 1903 Done.
Now: 1022 k00200000995 1912 Done.
Now: 1023 k00200001453 1916 Done.
Now: 1024 k00200003091 1907 Done.
Now: 1025 k00200000598 1899 Done.
Now: 1026 k00200002370 1895 Done.
Now: 1027 k00200000567 1897 Done.
Now: 1028 k00100000058 1888 Done.
Now: 1029 k00100000070 1894 Done.
Now: 1030 k00200002358 1916 Done.
Now: 1031 k00200001121 1909 Done.
Now: 1032 k00200000559 1911 Done.
Now: 1033 k00100000072 1899 Done.
Now: 1034 k00200003044 1900 Done.
Now: 1035 k002

In [23]:
files = glob.glob(annotatedfolder)
files

['/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00100000056.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200001663.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200001111.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00100000042.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200000569.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200002395.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00100000095.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00100000081.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200000743.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00100000268.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200002140.csv',
 '/Users/agneshilger/innerlife-main/pipeline-fra/data/annotated/k00200000031.csv',
 '/U