In [1]:
import re
from collections import defaultdict
from typing import Iterator

import numpy as np
import pandas as pd
import syntok.segmenter as segmenter
import tqdm
import umap.plot
from optimum.onnxruntime import ORTModelForFeatureExtraction
from torch import Tensor
from transformers import AutoTokenizer, pipeline
from transformers.tokenization_utils_base import BatchEncoding
from umap import UMAP

In [2]:
df = pd.read_csv("df_cleaned_with_punc_years.csv")
df.sample(10)

Unnamed: 0.1,Unnamed: 0,year,text
24,24,1970.0,Von Wolfgang Löhde Der Deckname dieses Falles...
7,7,1953.0,Von Martin Rabe Im Lüneburger Museum hat man d...
33,33,1979.0,Darin sind sich Zuschauer und Kritik erstaunli...
67,67,2013.0,"Die Weltgemeinschaft stellt , Milliarden Dolla..."
10,10,1956.0,"Das Montageband von Fichtel Sachs, Schweinfur..."
47,47,1993.0,Von Dietrich Willier UlmSarajevo Mißtrauisch s...
25,25,1971.0,Nikita Sergejewitsch Chruschtschow war groß im...
31,31,1977.0,von Dieter Buhl Auf der politischen Bühne des...
38,38,1984.0,Von Hans Schueler Der ZEITRedakeur Michael Sch...
48,48,1994.0,Von Werner A. Perger WarschauBonn Dies würde e...


In [21]:
MODEL_NAME = "bert-base-german-cased"

SUBWORD_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
MODEL = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, from_transformers=True)

TOKEN_PATTERN = re.compile(r"\w+")

# insert word(s) of interest here

WOI = {
 #"diktatur",
 # "demokratischer",
 # "demokratisches",
 # "diktatur",
 # "extremismus",
 # "freiheit",
 # "föderalismus",
 # "gerechtigkeit",
 # "globalisierung",
 # "grundeinkommen",
 # "ideologie",
 # "imperialismus",
 # "kapitalismus",
 # "klassenkampf",
 # "kommunismus",
 # "liberalismus",
 # "marktwirtschaft",
 # "nationalismus",
 # "pluralismus",
 # "rechtsstaat",
 # "revolutionäre",
 # "solidarität",
 # "sozialismus",
 # "sozialstaat",
 # "toleranz"
 # "natur"
 # "republikanismus"
 # "feindbild"
 # "neonazismus"
 #  "aktivismus" 
 # ...............   
}

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
def split_into_sentences(text: str) -> Iterator[str]:
    """Split the given text into sentences.
    
    Parameters
    ----------
    text : str
        Text to split into sentences.
        
    Yields
    ------
    Iterator[str]
        One sentence at a time.
    """
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            yield "".join(str(token) for token in sentence).strip()


def split_into_tokens(text: str) -> list[str]:
    """Split the given text into tokens.
    
    Parameters
    ----------
    text : str
        Text to split into tokens.
        
    Returns
    -------
    list[str]
        List of tokens.
    """
    return TOKEN_PATTERN.findall(text)


def split_into_subword_tokens(tokens: list[str]) -> BatchEncoding:
    """Split the given tokens into subword tokens.
    
    Parameters
    ----------
    tokens : list[str]
        Tokens to split into subword tokens.
        
    Returns
    -------
    BatchEncoding
        Subword tokens.
    """
    return SUBWORD_TOKENIZER(tokens, is_split_into_words=True, return_tensors="pt")


def is_relevant(tokens: list[str]) -> bool:
    """Checks if the given tokens are relevant, i.e. contain at least one WOI.
    
    Parameters
    ----------
    tokens : list[str]
        List of tokens to check.
        
    Returns
    -------
    list[str]
        List of subword tokens.
    """
    return len(set(token.lower() for token in tokens).intersection(WOI)) > 0


def get_embeddings(subword_tokens: BatchEncoding) -> Tensor:
    """Gets the embedding vector for each subwork token.
    
    Parameters
    ----------
    subword_tokens : BatchEncoding
        Subword tokens to get embeddings for.
        
    Returns
    -------
    Tensor
        Gets embedding vector for each subword token.    
    """
    return MODEL(**subword_tokens)[0][0]


def get_indices(tokens: list[str]) -> Iterator[dict[str, str | int]]:
    """Gets subword indices for the set of relevant words.
    
    Given the tokens:
    
    ["Demokratie", "und", "Globalisierung"]
    
    will result in the subword tokens:
    
    ["Demokratie", "und", "Global", "##isierung"]
    
    This function returns the subword token indices of the original tokens, i.e.:
    
    [{"token": "Demokratie", "index": 0}, {"token": "Globalisierung", "index": 2}]

    Paramters
    ---------
    tokens : list[str]
        List of tokens.
        
    Yields
    ------
    Iterator[dict[str, str | int]]
        One relevant token with respective subword token index at a time.
    """
    index = 0
    for token in tokens:
        if token.lower() in WOI:
            yield {"token": token, "index": index}
        index += len(SUBWORD_TOKENIZER.tokenize(token))


def filter_relevant_sentences(df: pd.DataFrame) -> list[str]:
    """Yields one releveant sentence (i.e. containin at least one WOI) at a time.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with one document per row.
        
    Yields
    ------
    Iterator[list[str]]
        One relevant sentence at a time.
    """
    for document in tqdm.tqdm(df.itertuples(), total=len(df), desc="Preprocessing"):
        for sentence in split_into_sentences(document.text):
            tokens = split_into_tokens(sentence)
            if is_relevant(tokens):
                yield document.year, sentence, tokens
                
                
def process_dataset(df: pd.DataFrame) -> list[dict[str, str | int | list[float]]]:
    """Process the full data set, i.e. get embeddings for each WOI.
    
    Parameters
    ----------
    df : pd.DataFrame
        Data set.
        
    Returns
    -------
    list[dict[str, str | int | list[float]]]
        List of vectors and metadata.
    """
    results = defaultdict(list)
    sentences = list(filter_relevant_sentences(df))
    
    for year, sentence, tokens in tqdm.tqdm(sentences, desc="Vectorizing"):
        try:
            subword_tokens = split_into_subword_tokens(tokens)
            vectors = get_embeddings(subword_tokens)

            indices = list(get_indices(tokens))
            relevant_vectors = vectors[[token["index"] for token in indices]]

            for token, vector in zip(indices, relevant_vectors):
                results[token["token"]].append({"year": year, "text": sentence, "vector": vector.tolist()})
        except Exception as error:
            print(error)
                
    return results

In [23]:
results = process_dataset(df)

Preprocessing: 100%|████████████████████████████████████████████████████████████████████| 69/69 [21:41<00:00, 18.86s/it]
Vectorizing: 100%|██████████████████████████████████████████████████████████████████████| 80/80 [00:11<00:00,  6.83it/s]


In [24]:
df_vecs_aktivismus= pd.DataFrame(columns=["year", "word", "vector"])

for word, word_data in results.items():
    for i in range(len(word_data)):
        df_vecs_aktivismus = df_vecs_aktivismus.append({
            "year": word_data[i]["year"],
            "word": word,
            "vector": word_data[i]["vector"]
        }, ignore_index=True)

  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_aktivismus.append({
  df_vecs_aktivismus = df_vecs_

In [25]:
df_vecs_aktivismus

Unnamed: 0,year,word,vector
0,1946.0,Aktivismus,"[0.3107272982597351, -0.3311808109283447, 0.17..."
1,1946.0,Aktivismus,"[-0.5113986134529114, -0.3631627559661865, 0.5..."
2,1947.0,Aktivismus,"[0.6253553628921509, -0.8608283996582031, -1.0..."
3,1947.0,Aktivismus,"[-1.0096851587295532, -0.4223272204399109, -0...."
4,1949.0,Aktivismus,"[-0.41684240102767944, -0.04460939019918442, 0..."
...,...,...,...
75,2012.0,Aktivismus,"[0.23302540183067322, -0.141241192817688, -0.0..."
76,2012.0,Aktivismus,"[0.2725354731082916, -0.10626809298992157, 0.4..."
77,2013.0,Aktivismus,"[0.46819013357162476, -0.718204915523529, -0.1..."
78,2013.0,Aktivismus,"[0.6829668879508972, -0.9057010412216187, -0.1..."


In [26]:
#df_safe= df_vecs_aktivismus.to_csv("df_vecs_aktivismus.csv", index=False)