In [245]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
!python -m spacy download en_core_web_md
import numpy as np
import pandas as pd

You should consider upgrading via the '/Users/dominik/Documents/Masterarbeit-Code/nlp-satisfaction/venv/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [309]:
## Load the cleaned data from feather and keep only the first 1000 rows
filelocation = 'data/DataClean'
df = pd.read_feather(filelocation).head(1000)

In [310]:
# Keep only rows that have a comment, and only the two columns used downstream.
# A single .loc selection followed by .copy() yields an independent frame, so the
# column assignments made in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df["Kommentar"].notnull(), ["participant_id", "Kommentar"]].copy()

In [311]:
# Display the current frame as the cell output.
df

Unnamed: 0,participant_id,Kommentar
5,41305,"Habe schon mehrmals erlebt, dass es im Speisew..."
6,41334,"Ansteben, dass auch in gut frequentierte perip..."
9,41459,Die 1. Klasse muss deutluch aufgewertet werden...
10,41497,Bessere (neue!) Züge in der Westschweiz!!! Län...
11,41536,- mehr Monitore - die Monitore so platzieren...
...,...,...
988,51761,"Besser informieren in Landquart, was das zu be..."
990,51765,"Unbedingt Züge kaufen, die auch von Reisenden ..."
991,51766,"Preise senken, der Rest ist gut."
995,51773,Ich habe nie geschaft meine monates ABO zu ern...


In [306]:
# Small hand-made frame for smoke-testing the helper functions.
# It is stored under its own name (`df_test`) so that running the notebook from
# top to bottom does not replace the survey data held in `df`.
test_records = {
    'id': [1, 2, 3, 4, 5, 6],
    'Kommentar': [
        "hallo mein Name ist Dominik und meine email dominik@web.de",
        "meldet euch +41 79 290 53 02",
        "keine info",
        "Der Zugbegleiter Felix Böwing war sehr unfreundlich",
        "Die SBB hat es mal wieder verkackt",
        " Bitte überweisen sie das Geld an AL35202111090000000001234567",
    ],
}
df_test = pd.DataFrame(data=test_records)

In [257]:
#comments = list(df.Kommentar)
#comments = comments[0:1000]

In [258]:
def anonymizer_de(list):
    """Anonymise German free-text comments with Presidio.

    Each string is analysed for PERSON, EMAIL_ADDRESS, PHONE_NUMBER,
    CREDIT_CARD and IBAN_CODE entities, and the findings are passed to an
    ``AnonymizerEngine`` used with its default settings.

    Parameters
    ----------
    list : iterable
        The comments to process. The name shadows the built-in ``list``; it
        is kept unchanged so existing keyword callers keep working.

    Returns
    -------
    list
        One entry per input item, in input order: the anonymised text for
        string items, ``None`` for every item that is not a ``str``.
    """
    # Configuration containing engine name and models
    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "de", "model_name": "de_core_news_lg"}],
    }

    # Create the NLP engine based on the configuration
    provider = NlpEngineProvider(nlp_configuration=configuration)
    nlp_engine = provider.create_engine()

    # The languages are needed to load country-specific recognizers
    # for finding phones, passport numbers, etc.
    analyzer = AnalyzerEngine(nlp_engine=nlp_engine,
                              supported_languages=["de"])

    # Built once and reused: constructing the engine does not depend on the
    # comment, so there is no reason to repeat it for every item.
    anonymizer = AnonymizerEngine()

    anonymized_text_list = []
    for comment in list:
        if not isinstance(comment, str):
            # Keep the output aligned with the input for non-text items.
            anonymized_text_list.append(None)
            continue

        results = analyzer.analyze(
            text=comment,
            language='de',
            entities=["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "CREDIT_CARD", "IBAN_CODE"],
        )
        anonymized_text = anonymizer.anonymize(text=comment, analyzer_results=results).text
        anonymized_text_list.append(anonymized_text)

    return anonymized_text_list

In [238]:
def find_extract_ner_entities_list(list, entitie):
    """Extract the text spans of one Presidio entity type from each comment.

    Parameters
    ----------
    list : iterable
        The comments to scan (the name shadows the built-in ``list``).
    entitie : str
        Name of the entity to look for, e.g. "EMAIL_ADDRESS" or "PHONE_NUMBER".

    Returns
    -------
    list
        One entry per input item, in input order: a list with the matched
        substrings, or ``None`` when the item is not a ``str`` or when nothing
        was detected.
    """
    # Engine name and model used for the German analysis
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "de", "model_name": "de_core_news_lg"}],
    }
    engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    # The languages are needed to load country-specific recognizers
    analyzer = AnalyzerEngine(nlp_engine=engine, supported_languages=["de"])

    def _matches(comment):
        # Non-text items yield None, just like comments without any finding.
        if not isinstance(comment, str):
            return None
        hits = analyzer.analyze(text=comment, language='de', entities=[entitie])
        found = [comment[hit.start:hit.end] for hit in hits]
        return found if found else None

    return [_matches(comment) for comment in list]

In [239]:
def check_column_values(df, col1, col2):
    """Flag the rows in which at least one of two columns holds a value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the columns to inspect (e.g. the e-mail and phone columns).

    Returns
    -------
    list of bool
        One entry per row, in row order: ``True`` when ``col1`` or ``col2``
        is not missing in that row, ``False`` otherwise.
    """
    # Series.notna() treats None and NaN alike as missing. The previous
    # `value != None` comparison counted NaN as a value, so a frame whose
    # missing cells are NaN was flagged True in every row.
    has_value = df[col1].notna() | df[col2].notna()
    return has_value.tolist()

In [312]:
# Anonymise the comments and store the result in a new column
df["Kommentar_anonymised"] = anonymizer_de(df["Kommentar"])

# Extract e-mail addresses and phone numbers into their own columns
df["email_address"] = find_extract_ner_entities_list(df["Kommentar"], "EMAIL_ADDRESS")
df["phone_number"] = find_extract_ner_entities_list(df["Kommentar"], "PHONE_NUMBER")

# Flag rows that contain contact details (True or False)
df["has_contact_details"] = check_column_values(df, 'phone_number', 'email_address')

In [313]:
# Display the frame as the cell output.
# NOTE(review): the saved output lists an `orte` column that no visible cell
# creates - presumably left over from a removed cell; confirm and re-run.
df

Unnamed: 0,participant_id,Kommentar,Kommentar_anonymised,email_address,phone_number,orte,has_contact_details
5,41305,"Habe schon mehrmals erlebt, dass es im Speisew...","Habe schon mehrmals erlebt, dass es im Speisew...",,,,False
6,41334,"Ansteben, dass auch in gut frequentierte perip...","Ansteben, dass auch in gut frequentierte perip...",,,,False
9,41459,Die 1. Klasse muss deutluch aufgewertet werden...,Die 1. Klasse muss deutluch aufgewertet werden...,,,"[Zürich, Altstetten]",False
10,41497,Bessere (neue!) Züge in der Westschweiz!!! Län...,Bessere (neue!) Züge in der Westschweiz!!! Län...,,,"[Bern, Genève]",False
11,41536,- mehr Monitore - die Monitore so platzieren...,- mehr Monitore - die Monitore so platzieren...,,,,False
...,...,...,...,...,...,...,...
988,51761,"Besser informieren in Landquart, was das zu be...","Besser informieren in Landquart, was das zu be...",,,,False
990,51765,"Unbedingt Züge kaufen, die auch von Reisenden ...","Unbedingt Züge kaufen, die auch von Reisenden ...",,,,False
991,51766,"Preise senken, der Rest ist gut.","Preise senken, der Rest ist gut.",,,,False
995,51773,Ich habe nie geschaft meine monates ABO zu ern...,Ich habe nie geschaft meine monates ABO zu ern...,,,,False


In [305]:
# Write the current frame to a CSV file.
# NOTE(review): the file name spelling ('anomnymisation') looks like a typo; it
# is left unchanged because other code may read this path - confirm before renaming.
df.to_csv('anomnymisation_test.csv')