In [1]:
# Import relevant general libraries.
import re
import pandas as pd

# Spacy imports for NER.
import spacy

In [2]:
# Load spaCy's large Dutch pipeline ("nl_core_news_lg") and raise its
# max_length setting to 10,000,000 so that very long texts can be processed.
model = spacy.load("nl_core_news_lg")
model.max_length = 10_000_000

In [3]:
# Read the source dataset; the following cells use its 'date' and 'text' columns.
df = pd.read_csv(filepath_or_buffer='kamers_text.csv')

In [5]:
# Keep only the rows in which both 'date' and 'text' are present
# (rows with a missing value in either column are dropped).
df = df.dropna(subset=['date', 'text'])

In [4]:
# Order the rows by the 'date' column, ascending (no-op if already sorted).
# NOTE(review): the CSV is read without date parsing, so this is presumably a
# string sort - it is only chronological if the dates use a lexicographically
# sortable format such as YYYY-MM-DD. Confirm against the data.
df = df.sort_values('date')

In [5]:
# Whitespace normalisation helper used to clean the text column.
def remove_unwanted_chars(text):
    """Return `text` with all whitespace normalised to single spaces.

    The input is first converted with ``str()`` (so non-string values such as
    numbers or ``None`` are turned into their string form). Every run of
    whitespace - including newlines, tabs and carriage returns - is collapsed
    into one space, and leading/trailing whitespace is removed.
    """
    # Splitting on whitespace drops all whitespace runs (and any at the edges);
    # re-joining with a single space yields the normalised string.
    tokens = str(text).split()
    return " ".join(tokens)

# Clean every value of the 'text' column with remove_unwanted_chars and
# write the cleaned strings back into the same column.
df["text"] = df["text"].map(remove_unwanted_chars)

In [6]:
# Named-entity extraction helper built on the spaCy pipeline loaded as `model`.
def get_entities(text):
    """Return the named entities found in `text`.

    The input is converted with ``str()`` and run through the module-level
    spaCy pipeline `model`. The result is a list of ``(entity_text, label)``
    tuples, one per entity in ``doc.ents``, in the order the pipeline reports
    them; the list is empty when no entities are found.
    """
    parsed = model(str(text))
    return [(entity.text, entity.label_) for entity in parsed.ents]

In [None]:
# Tag every document: run get_entities on each value of the 'text' column and
# store the resulting list of (entity text, label) tuples in a new 'entities' column.
df['entities'] = df['text'].map(get_entities)

In [9]:
# Write the entity-tagged dataset to disk as CSV. With the default settings the
# DataFrame index is written as well, as the first column of the file.
df.to_csv(path_or_buf='kamers_text_tagged.csv')