In [0]:
from presidio_analyzer import AnalyzerEngine  
from presidio_anonymizer import AnonymizerEngine  
from presidio_anonymizer.entities import OperatorConfig  
from pyspark.sql import SparkSession  
from pyspark.sql.functions import pandas_udf, StringType  
import pandas as pd  
import spacy  
  
# Load the large English spaCy model. The returned pipeline object is
# discarded, so the only effect here is whatever loading itself does
# (e.g. failing early if the model is not installed).
# NOTE(review): AnalyzerEngine() presumably loads its own NLP model, which
# would make this an extra load -- confirm whether this line is still needed.
spacy.load('en_core_web_lg')

# Driver-side Presidio engines; anonymize_text() below calls
# analyzer.analyze(...) and anonymizer.anonymize(...) on them.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Broadcast the engines to the cluster nodes so the UDF can read them
# through `.value` instead of capturing the driver objects directly.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
broadcasted_analyzer = sc.broadcast(analyzer)
broadcasted_anonymizer = sc.broadcast(anonymizer)
  
# Define a per-string anonymization function and a pandas UDF that applies it to a whole Series.
def anonymize_text(text: str) -> str:
    """Replace every entity detected in *text* with ``<ANONYMIZED>``.

    The analyzer and anonymizer are read from the broadcast variables, so
    the function can be called from inside a Spark UDF.

    Args:
        text: The string to scan; it is analyzed as English
            (``language="en"``). ``None`` is tolerated and passed through.

    Returns:
        The input text with each detected span replaced by the literal
        ``<ANONYMIZED>``, or ``None`` when *text* is ``None``.
    """
    # Null cells of a nullable column typically surface as None inside the
    # pandas Series; pass them through rather than handing a non-string to
    # the analyzer, which would fail the whole batch.
    if text is None:
        return None

    analyzer = broadcasted_analyzer.value
    anonymizer = broadcasted_anonymizer.value
    analyzer_results = analyzer.analyze(text=text, language="en")
    anonymized_results = anonymizer.anonymize(
        text=text,
        analyzer_results=analyzer_results,
        operators={
            # "DEFAULT" applies the same replacement to every entity type.
            "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})
        },
    )
    return anonymized_results.text
  
@pandas_udf(StringType())
def anonymize_series(s: pd.Series) -> pd.Series:
    """Pandas UDF that anonymizes a batch of strings.

    Every element of *s* is passed through ``anonymize_text``; the result is
    a Series of the anonymized values aligned with the input.
    """
    anonymized = s.map(anonymize_text)
    return anonymized
  
# Example usage with a DataFrame  
#data = [("John Doe",), ("Jane Smith",)]  
#columns = ["name"]  
#df = spark.createDataFrame(data, columns)  
  
# Apply the anonymize_series function to the 'name' column  
#df = df.withColumn("anonymized_name", anonymize_series(df["name"]))  
#df.show()  




+----------+---------------+
|      name|anonymized_name|
+----------+---------------+
|  John Doe|   <ANONYMIZED>|
|Jane Smith|   <ANONYMIZED>|
+----------+---------------+

