### Getting Text Passages from WOI

In [1]:
import re
from collections import defaultdict
from typing import Iterator

import numpy as np
import pandas as pd
import syntok.segmenter as segmenter
import tqdm
import umap.plot
from optimum.onnxruntime import ORTModelForFeatureExtraction
from torch import Tensor
from transformers import AutoTokenizer, pipeline
from transformers.tokenization_utils_base import BatchEncoding
from umap import UMAP

In [2]:
df = pd.read_csv("df_cleaned_with_punc_years.csv")
df.sample(10)

Unnamed: 0.1,Unnamed: 0,year,text
20,20,1966.0,"Bonn, im Mai. Der Mai war längst gekommen. Unt..."
34,34,1980.0,"Von Andreas Kohlschütter Im Kunartal, Ende Mä..."
53,53,1999.0,Seit Jahren wiederholen sich die Grabschändung...
4,4,1950.0,Daß manchen Kreisen Großbritanniens die Erfolg...
62,62,2008.0,Der Preis für einen Liter Benzin ist in Deutsc...
47,47,1993.0,Von Dietrich Willier UlmSarajevo Mißtrauisch s...
68,68,2014.0,Bei der Parlamentswahl in Nordkorea hat die Pa...
3,3,1949.0,"Von Erich Trunz Dämon Wie an dem Tag, der dich..."
21,21,1967.0,Von Josef MüllerMarein Konstantin II. ist nich...
14,14,1960.0,Bundeskanzler Dr. Adenauer gestand vor wenigen...


In [125]:
MODEL_NAME = "bert-base-german-cased"

SUBWORD_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
MODEL = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, from_transformers=True)

TOKEN_PATTERN = re.compile(r"\w+")

# insert word(s) of interest here

WOI = {
 
   "tierschutz" 
 # ...............   
}

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [126]:
def split_into_sentences(text: str) -> Iterator[str]:
    """Split the given text into sentences.
    
    Parameters
    ----------
    text : str
        Text to split into sentences.
        
    Yields
    ------
    Iterator[str]
        One sentence at a time.
    """
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            yield "".join(str(token) for token in sentence).strip()


def split_into_tokens(text: str) -> list[str]:
    """Split the given text into tokens.
    
    Parameters
    ----------
    text : str
        Text to split into tokens.
        
    Returns
    -------
    list[str]
        List of tokens.
    """
    return TOKEN_PATTERN.findall(text)


def split_into_subword_tokens(tokens: list[str]) -> BatchEncoding:
    """Split the given tokens into subword tokens.
    
    Parameters
    ----------
    tokens : list[str]
        Tokens to split into subword tokens.
        
    Returns
    -------
    BatchEncoding
        Subword tokens.
    """
    return SUBWORD_TOKENIZER(tokens, is_split_into_words=True, return_tensors="pt")


def is_relevant(tokens: list[str]) -> bool:
    """Checks if the given tokens are relevant, i.e. contain at least one WOI.
    
    Parameters
    ----------
    tokens : list[str]
        List of tokens to check.
        
    Returns
    -------
    list[str]
        List of subword tokens.
    """
    return len(set(token.lower() for token in tokens).intersection(WOI)) > 0


def get_embeddings(subword_tokens: BatchEncoding) -> Tensor:
    """Gets the embedding vector for each subwork token.
    
    Parameters
    ----------
    subword_tokens : BatchEncoding
        Subword tokens to get embeddings for.
        
    Returns
    -------
    Tensor
        Gets embedding vector for each subword token.    
    """
    return MODEL(**subword_tokens)[0][0]


def get_indices(tokens: list[str]) -> Iterator[dict[str, str | int]]:
    """Gets subword indices for the set of relevant words.
    
    Given the tokens:
    
    ["Demokratie", "und", "Globalisierung"]
    
    will result in the subword tokens:
    
    ["Demokratie", "und", "Global", "##isierung"]
    
    This function returns the subword token indices of the original tokens, i.e.:
    
    [{"token": "Demokratie", "index": 0}, {"token": "Globalisierung", "index": 2}]

    Paramters
    ---------
    tokens : list[str]
        List of tokens.
        
    Yields
    ------
    Iterator[dict[str, str | int]]
        One relevant token with respective subword token index at a time.
    """
    index = 0
    for token in tokens:
        if token.lower() in WOI:
            yield {"token": token, "index": index}
        index += len(SUBWORD_TOKENIZER.tokenize(token))


def filter_relevant_sentences(df: pd.DataFrame) -> list[str]:
    """Yields one releveant sentence (i.e. containin at least one WOI) at a time.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with one document per row.
        
    Yields
    ------
    Iterator[list[str]]
        One relevant sentence at a time.
    """
    for document in tqdm.tqdm(df.itertuples(), total=len(df), desc="Preprocessing"):
        for sentence in split_into_sentences(document.text):
            tokens = split_into_tokens(sentence)
            if is_relevant(tokens):
                yield document.year, sentence, tokens

def process_dataset(df: pd.DataFrame) -> dict[str, list[str]]:
    """Process the full data set, i.e. get sentences containing each WOI.
    
    Parameters
    ----------
    df : pd.DataFrame
        Data set.
        
    Returns
    -------
    dict[str, list[str]]
        Dictionary where the keys are the years and the values are lists of sentences containing the WOI.
    """
    results = defaultdict(list)
    sentences = list(filter_relevant_sentences(df))
    
    for year, sentence, tokens in tqdm.tqdm(sentences, desc="Filtering sentences"):
        results[year].append(sentence)
                
    return dict(results)

In [127]:
results = process_dataset(df)


Preprocessing: 100%|████████████████████████████████████████████████████████████████████| 69/69 [21:06<00:00, 18.35s/it]
Filtering sentences: 100%|██████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 146312.93it/s]


In [119]:
print(results)

{1946.0: ['Seit dem Tage des Bastillesturms geht durch Europa der gleiche revolutionäre Schwung, der anderthalb Jahrzehnte vorher die nordamerikanischen Besitzungen der britischen Krone veranlaßte, die eigene Unabhängigkeit zu fordern.', 'Es mag auf den ersten Blick überraschen, daß zu den Vertretern dieses Frankreichs die Fortschrittlichen Katholiken, obgleich sie katholisch sind, und die Kommunisten, obgleich sie eine revolutionäre Partei sind, gerechnet werden.', 'Die Kommunisten sind zwar ebensosehr eine revolutionäre Partei wie die Fortschrittlichen Katholiken eine katholische, aber sie haben ihre Fernziele zurückgestellt gegenüber den Gegenwartsaufgaben des Wiederaufbaus.', 'Und eben damit waren diese Revolutionäre weit mehr als nur die Antipoden von Hitler und seinem unseligen System ihr Kampf ist darum neben der aktuellen Bedeutung für das Zeitgeschehen unserer Tage auf einer höheren Ebene der Versuch gewesen, das .', 'Er verachtete die Politiker, sei es die Parlamentarier in A

In [121]:
results.keys()

dict_keys([1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0])

In [128]:
df_text_tierschutz= pd.DataFrame(columns=["year", "text"])

for year, texts in results.items():
    if isinstance(texts, list):
        for text in texts:
            df_text_tierschutz = df_text_tierschutz.append({
                "year": year,
                "text": text
            }, ignore_index=True)


  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_tierschutz.append({
  df_text_tierschutz = df_text_

In [23]:
#safe_df= df_text_arbeitsschutz.to_csv("df_text_arbeitsschutz.csv", index=False)