In [None]:
# Variables
# Notebook-wide settings. Fill in storage_acct_name before running the remaining cells.

# --- Storage (Synapse workspace storage account) ---
# Name of the storage account attached to the Synapse workspace - it starts with "synsa".
storage_acct_name = ""
storage_container_name = "workspace"
# Folder inside the container that holds the scored (batch prediction) output.
storage_path_scored = "lab-data/scored/"

# --- Cosmos DB ---
# Synapse linked service that points at the Cosmos DB account; its connection string
# is used further down to obtain the endpoint and key.
synapse_cosmos_db_linked_service = "CosmosDbIoTLab"
# Database and container that the maintenance records are read from and written to.
cosmos_db_database = "ContosoAuto"
cosmos_db_container_maintenance = "maintenance"

In [None]:
# Build the ABFSS URI of the scored-data folder in the workspace storage account.
# Fail fast if the account name was left blank in the Variables cell: the URI would
# otherwise have an empty host and the parquet read would fail with a much less obvious error.
if not storage_acct_name.strip():
    raise ValueError(
        'storage_acct_name is empty - set it to the Synapse workspace storage account name (starts with "synsa")'
    )

# Use the configured container name instead of repeating the "workspace" literal, so
# changing storage_container_name in the Variables cell actually takes effect.
sa_uri = (
    "abfss://" + storage_container_name + "@" + storage_acct_name
    + ".dfs.core.windows.net/" + storage_path_scored
)

In [None]:
# Load the scored maintenance output (parquet files under sa_uri) into a Spark DataFrame.
scored_maintenance_df = spark.read.format("parquet").load(sa_uri)

In [None]:
# Sanity-check the load: number of rows first, then the column names and types.
scored_row_count = scored_maintenance_df.count()
print(scored_row_count)

scored_maintenance_df.printSchema()

In [None]:
# Preview the scored records (these are show()'s default arguments, spelled out).
scored_maintenance_df.show(n=20, truncate=True)

# Write scored maintenance data back to Cosmos DB maintenance container

In [None]:
# Retrieve the Cosmos DB endpoint and account key from the Synapse linked service.
import sys
import re

from pyspark.sql import SparkSession

# Get (or reuse) the active Spark session so the JVM-side TokenLibrary can be reached.
sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

# The pattern below expects:
#   AccountEndpoint=<endpoint>;Database=<db>;AccountKey="<key>";
connection_string = token_library.getConnectionString(synapse_cosmos_db_linked_service)
matchObj = re.match(r'AccountEndpoint=(.*);Database=(.*);AccountKey="(.*)";', connection_string, re.M|re.I)

# re.match returns None when the connection string does not have the expected shape.
# Raise a clear error instead of an AttributeError on .group().
# The connection string itself is deliberately kept out of the message: it contains the account key.
if matchObj is None:
    raise ValueError(
        "Could not parse the connection string returned by linked service '"
        + synapse_cosmos_db_linked_service
        + "'. Expected the form AccountEndpoint=...;Database=...;AccountKey=\"...\";"
    )

endpoint = matchObj.group(1)
# Group 2 is the database name from the linked service; the notebook uses cosmos_db_database instead.
masterkey = matchObj.group(3)

In [None]:
# Connector options for reading the maintenance container: account endpoint and key
# (taken from the linked service), plus the database and container names.
read_config_maintenance = dict(
    Endpoint=endpoint,
    Masterkey=masterkey,
    Database=cosmos_db_database,
    Collection=cosmos_db_container_maintenance,
)

In [None]:
# Load whatever maintenance records the Cosmos DB container currently holds (possibly none).
existing_maintenance_df = (
    spark.read
    .format("com.microsoft.azure.cosmosdb.spark")
    .options(**read_config_maintenance)
    .load()
)

In [None]:
# How many maintenance records already exist, followed by a preview of them.
existing_row_count = existing_maintenance_df.count()
print(existing_row_count)

existing_maintenance_df.show()

In [None]:
# Attach the existing Cosmos DB document id to each scored record (matched on VIN), so a VIN
# that already has a maintenance document carries that document's id into the write and is
# updated rather than written as a second document for the same VIN.
#
# A LEFT join keeps scored VINs that have no existing document. An inner join would silently
# drop them whenever the container holds records for only some of the VINs.
# Unmatched rows get a freshly generated UUID as their id, so the id column is never null.
# NOTE: uuid() is non-deterministic, so the ids of *new* VINs can differ between separate
# actions (count/show/write) on this DataFrame; ids of VINs that already exist are stable.
# NOTE(review): assumes at most one existing document per VIN; duplicates in the container
# would multiply the scored rows - confirm.
#
# If there are no maintenance records at all, we do not join and pass no id field.
# NOTE(review): per the original notebook comment, Cosmos DB then auto-generates the id on insert.
from pyspark.sql import functions as F

# head(1) needs only one row to answer "is it empty?", unlike a full count().
if existing_maintenance_df.head(1):
    maintenance_records_to_write_df = scored_maintenance_df\
        .join(existing_maintenance_df, scored_maintenance_df.vin == existing_maintenance_df.vin, "left")\
        .select(
            scored_maintenance_df["*"],
            F.coalesce(existing_maintenance_df["id"], F.expr("uuid()")).alias("id")
        )
else:
    maintenance_records_to_write_df = scored_maintenance_df

In [None]:
# Check what is about to be written: the row count, then a preview of the rows.
records_to_write_count = maintenance_records_to_write_df.count()
print(records_to_write_count)

maintenance_records_to_write_df.show()

In [None]:
# Connector options for writing to the maintenance container. Same account, database and
# container as the read, with the "Upsert" option set to "true".
write_config_maintenance = dict(
    Endpoint=endpoint,
    Masterkey=masterkey,
    Database=cosmos_db_database,
    Collection=cosmos_db_container_maintenance,
    Upsert="true",
)

In [None]:
# Write the prepared records to the Cosmos DB maintenance container using the write options
# defined in write_config_maintenance.
(
    maintenance_records_to_write_df.write
    .mode("overwrite")
    .format("com.microsoft.azure.cosmosdb.spark")
    .options(**write_config_maintenance)
    .save()
)