In [0]:
%sql
-- Preview the NYC taxi trips sample table.
-- The result is capped at 10 rows (matching the PySpark preview cell) so the
-- notebook does not render and store the full table in its output.
SELECT * FROM samples.nyctaxi.trips LIMIT 10

In [0]:
# Show a ten-row preview of the samples.nyctaxi.trips table.
trips_preview = spark.read.table("samples.nyctaxi.trips").limit(10)
trips_preview.display()

In [0]:
%sql
-- Make alpha_cloud_ai_workspace the current catalog for the statements below.
use catalog alpha_cloud_ai_workspace;

-- Create the department table only when it is missing, so re-running the cell
-- keeps whatever data is already there.
create table if not exists default.department
(
  deptcode INT,
  deptname STRING,
  location STRING
);

-- Seed the table idempotently: a plain INSERT would append the same two rows
-- again on every re-run. MERGE only inserts a seed row when no row with the
-- same deptcode exists yet.
-- NOTE(review): MERGE assumes default.department is a Delta table (the
-- Databricks default table format) - confirm for this workspace.
merge into default.department as target
using (
  select * from values
    (10, 'FINANCE', 'EDINBURGH'),
    (20, 'HR', 'LONDON')
  as seed(deptcode, deptname, location)
) as source
on target.deptcode = source.deptcode
when not matched then
  insert (deptcode, deptname, location)
  values (source.deptcode, source.deptname, source.location);

In [0]:
# List the entries directly under the /databricks-datasets directory.
datasets_root = '/databricks-datasets'
dbutils.fs.ls(datasets_root)

In [0]:
# Libraries
from pyspark.sql.functions import col, current_timestamp

# Variables: source directory, target table and checkpoint/schema location.
file_path = "/databricks-datasets/structured-streaming/events"
table_name = "sylvain_etl_quickstart"
checkpoint_path = "/tmp/sylvain/_checkpoint/etl_quickstart"

# Clear data from a previous run so the cell starts from a clean state:
# drop the target table and recursively delete the checkpoint directory.
spark.sql(f"DROP TABLE IF EXISTS {table_name}")
dbutils.fs.rm(checkpoint_path, True)

# Configure Auto Loader: read the JSON files as a stream, add the source file
# path and a processing timestamp to every row, and write into `table_name`.
# The checkpoint path doubles as the Auto Loader schema location.
etl_query = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .load(file_path)
    .select(
        "*",
        col("_metadata.file_path").alias("source_file"),
        current_timestamp().alias("processing_time"),
    )
    .writeStream
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(table_name)
)

# toTable() starts the query and returns without waiting for it to finish.
# Block until the availableNow run has processed the existing files, so later
# cells that read `table_name` do not race the ingest.
etl_query.awaitTermination()


In [0]:
# Load the table named by `table_name` into a DataFrame and render it.
df = spark.read.table(table_name)
df.display()


In [0]:
# Catalog, schema and volume that the derived paths below are built from.
catalog = "alpha_cloud_ai_workspace"
schema = "default"
volume = "<volume_name>"  # placeholder value - presumably to be replaced with a real volume name

# Download source and the file/table names for the baby-names data.
# NOTE: this rebinds `table_name`, which the ETL quickstart cell earlier in
# this notebook set to a different table.
download_url = "https://health.data.ny.gov/api/views/jxy9-yhdk/rows.csv"
file_name = "baby_names.csv"
table_name = "baby_names"

# Derived locations: volume directory path and the catalog.schema prefix.
path_volume = f"/Volumes/{catalog}/{schema}/{volume}"
path_table = f"{catalog}.{schema}"

### Natural Language Processing (NLP) & Anonymization

In [0]:
pip install presidio-analyzer
pip install presidio-anonymizer
python -m spacy download en_core_web_lg

In [0]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine

# Name of the column to anonymize.
anonymized_column = "value"

# Create each Presidio engine and broadcast it to the cluster nodes right
# after it is built; the broadcast handles are kept under `broadcasted_*`.
analyzer = AnalyzerEngine()
broadcasted_analyzer = sc.broadcast(analyzer)

anonymizer = AnonymizerEngine()
broadcasted_anonymizer = sc.broadcast(anonymizer)

deanonymizer = DeanonymizeEngine()
broadcasted_deanonymizer = sc.broadcast(deanonymizer)


