In [340]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
!python -m spacy download en_core_web_md
import numpy as np
import pandas as pd

You should consider upgrading via the '/Users/dominik/Documents/Masterarbeit-Code/nlp-satisfaction/venv/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [360]:
# Load the input data from a Feather file into the working DataFrame `df`.
# NOTE(review): the path is relative, so this presumably expects the notebook
# to be started from the project root — confirm.
filelocation = 'data/DataClean'
df = pd.read_feather(filelocation)
# Uncomment to restrict the run to the first 1000 rows while experimenting.
#df = df.head(1000)

In [361]:
# Keep only the rows that contain a comment and only the two columns needed
# below. Selecting rows and columns in a single `.loc` call and taking an
# explicit `.copy()` yields an independent frame, so the column assignments
# later in the notebook do not trigger pandas' SettingWithCopyWarning.
df = df.loc[df["Kommentar"].notna(), ["participant_id", "Kommentar"]].copy()

In [363]:
# Preview the first four rows of `df`.
# NOTE(review): the rendered output shows raw comment text; consider clearing
# the cell outputs before sharing the notebook.
df.head(4)

Unnamed: 0,participant_id,Kommentar
5,41305,"Habe schon mehrmals erlebt, dass es im Speisew..."
6,41334,"Ansteben, dass auch in gut frequentierte perip..."
9,41459,Die 1. Klasse muss deutluch aufgewertet werden...
10,41497,Bessere (neue!) Züge in der Westschweiz!!! Län...


In [354]:
# create a small test df by hand
#d = {'id': [1, 2,3,4,5,6], 'Kommentar': ["hallo mein Name ist Dominik und meine email dominik@web.de", "meldet euch +41 79 290 53 02","keine info","Der Zugbegleiter Felix Böwing war sehr unfreundlich","Die SBB hat es mal wieder verkackt"," Bitte überweisen sie das Geld an AL35202111090000000001234567"]}
#df  = pd.DataFrame(data=d)

In [364]:
def anonymizer_de(list, analyzer=None, anonymizer=None):
    """
    Anonymize a sequence of German text comments.

    Every string in ``list`` is analysed with a Presidio ``AnalyzerEngine``
    for the entity types PERSON, EMAIL_ADDRESS, PHONE_NUMBER, CREDIT_CARD and
    IBAN_CODE. The findings are then handed to an ``AnonymizerEngine``, which
    returns the anonymized text (default anonymizer operators are used).

    Args:
        list: An iterable of comments, e.g. a list or a pandas Series.
            The name shadows the built-in ``list``; it is kept unchanged so
            that existing callers passing it by keyword keep working.
        analyzer: Optional, already constructed analyzer object exposing
            ``analyze(text=..., language=..., entities=...)``. When omitted,
            a new ``AnalyzerEngine`` backed by the spaCy model
            ``de_core_news_lg`` is built on every call; pass a shared
            instance to avoid rebuilding the NLP engine repeatedly.
        anonymizer: Optional, already constructed anonymizer object exposing
            ``anonymize(text=..., analyzer_results=...)``. When omitted, a
            new ``AnonymizerEngine`` is created.

    Returns:
        A list with one entry per input element, in input order: the
        anonymized text for every ``str`` element and ``None`` for every
        element that is not a ``str`` (e.g. missing values).
    """
    # Entity types that are searched for and anonymized.
    entities = ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "IBAN_CODE"]

    if analyzer is None:
        # Configuration containing the engine name and the German model.
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "de", "model_name": "de_core_news_lg"}],
        }

        # Create the NLP engine based on the configuration.
        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        # The supported languages are needed to load the matching
        # recognizers (phone numbers, IBANs, ...).
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                                  supported_languages=["de"])

    if anonymizer is None:
        anonymizer = AnonymizerEngine()

    anonymized_text_list = []
    for comment in list:
        # Non-string entries (None, NaN, numbers) cannot be analysed; keep
        # their position in the output and mark them with None.
        if not isinstance(comment, str):
            anonymized_text_list.append(None)
            continue

        findings = analyzer.analyze(text=comment, language="de", entities=entities)
        anonymized = anonymizer.anonymize(text=comment, analyzer_results=findings)
        anonymized_text_list.append(anonymized.text)

    return anonymized_text_list

In [365]:
def find_extract_ner_entities_list(lst, entitie, analyzer=None):
    """
    Extract the text of all entities of one type from a sequence of comments.

    Each string in ``lst`` is analysed with a Presidio ``AnalyzerEngine``
    (language ``"de"``) for the single entity type ``entitie``; the matched
    substrings are cut out of the comment via the ``start``/``end`` offsets
    of every finding.

    Args:
        lst: An iterable of comments, e.g. a list or a pandas Series.
        entitie: The Presidio entity type to look for, e.g.
            ``"EMAIL_ADDRESS"``, ``"PHONE_NUMBER"`` or ``"PERSON"``.
        analyzer: Optional, already constructed analyzer object exposing
            ``analyze(text=..., language=..., entities=...)``. When omitted,
            a new ``AnalyzerEngine`` backed by the spaCy model
            ``de_core_news_lg`` is built on every call; pass a shared
            instance to avoid rebuilding the NLP engine repeatedly.

    Returns:
        A list with one entry per input element, in input order. An entry is
        the list of matched substrings for that comment, or ``None`` when the
        element is not a ``str`` or when nothing was found in it.
    """
    if analyzer is None:
        # Configuration containing the engine name and the German model.
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "de", "model_name": "de_core_news_lg"}],
        }

        # Create the NLP engine based on the configuration.
        provider = NlpEngineProvider(nlp_configuration=configuration)
        nlp_engine = provider.create_engine()

        # The supported languages are needed to load the matching
        # recognizers (phone numbers, e-mail addresses, ...).
        analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["de"])

    results_list = []
    for comment in lst:
        # Non-string entries (None, NaN, numbers) cannot be analysed.
        if not isinstance(comment, str):
            results_list.append(None)
            continue

        findings = analyzer.analyze(text=comment, language="de", entities=[entitie])
        matches = [comment[res.start:res.end] for res in findings]

        # Report "nothing found" as None instead of an empty list so that
        # null checks on the resulting column work.
        results_list.append(matches or None)

    return results_list

In [366]:
def check_column_values(df, col1, col2):
    """
    Flag the rows in which at least one of two columns holds a value.

    Args:
        df: The DataFrame to inspect.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        A list of booleans, one per row of ``df``: ``True`` when ``col1`` or
        ``col2`` is non-null in that row, ``False`` when both are null.
    """
    # Boolean frame of "has a value" for the two columns, reduced row-wise.
    has_value = df[[col1, col2]].notna()
    return has_value.any(axis=1).tolist()


In [367]:
# Anonymize the comments and store the result as a new column.
df["Kommentar_anonymised"] = anonymizer_de(df.Kommentar)

# Extract e-mail addresses and phone numbers from the raw comments into new
# columns (a list of matches per row, or None when nothing was found).
df["email_address"]=find_extract_ner_entities_list(df.Kommentar, "EMAIL_ADDRESS")
df["phone_number"]=find_extract_ner_entities_list(df.Kommentar, "PHONE_NUMBER")


# Add a boolean column that is True when a phone number or an e-mail address
# was found in the comment.
df["has_contact_details"]=check_column_values(df,'phone_number','email_address')

In [305]:
# Write the frame to a CSV file, presumably for manual inspection of the
# anonymisation results.
# NOTE(review): at this point df still contains the raw "Kommentar" column as
# well as the extracted "email_address" and "phone_number" columns, so the
# export includes un-anonymised data — confirm that this is intended.
df.to_csv('anomnymisation_test.csv')