In [None]:
# Variables

# Storage: the account attached to the Synapse workspace (its name begins with "synsa"),
# the container inside it, and the folder that holds the scored output.
storage_acct_name = "synsa3uka6z6tqjlmi"
storage_container_name = "workspace"
storage_path_scored = "lab-data/scored/"

# Cosmos DB: database name and the two container names used by this lab.
cosmos_db_database = "ContosoAuto"
cosmos_db_container_metadata = "metadata"
cosmos_db_container_maintenance = "maintenance"

# Name of the Synapse linked service that points at the Cosmos DB Analytical Store,
# which is where the source data comes from.
synapse_cosmos_db_linked_service = "CosmosDbIoTLab"

StatementMeta(spark1, 31, 5, Submitted, Available)

In [None]:
# ABFSS URI of the folder that holds the scored output.
# The container segment comes from storage_container_name (set in the variables cell)
# instead of a second hard-coded "workspace", so changing the variable actually takes effect.
sa_uri = (
    "abfss://" + storage_container_name + "@" + storage_acct_name
    + ".dfs.core.windows.net/" + storage_path_scored
)

StatementMeta(spark1, 31, 6, Submitted, Available)

In [6]:
# Load the scored output as a DataFrame from the parquet files under sa_uri.
scored_maintenance_df = spark.read.format("parquet").load(sa_uri)

StatementMeta(spark1, 31, 8, Finished, Available)



In [7]:
# Show how many scored rows were loaded, then the column layout.
scored_row_count = scored_maintenance_df.count()
print(scored_row_count)

scored_maintenance_df.printSchema()

StatementMeta(spark1, 31, 9, Finished, Available)

110
root
 |-- vin: string (nullable = true)
 |-- batteryAgeDays: long (nullable = true)
 |-- batteryRatedCycles: long (nullable = true)
 |-- lifetimeBatteryCyclesUsed: double (nullable = true)
 |-- tripDurationMinutes: double (nullable = true)
 |-- result: long (nullable = true)

In [8]:
# Preview the scored rows (same limits as a bare show(): 20 rows, long values truncated).
scored_maintenance_df.show(n=20, truncate=True)

StatementMeta(spark1, 31, 10, Finished, Available)

+-----------------+--------------+------------------+-------------------------+-------------------+------+
|              vin|batteryAgeDays|batteryRatedCycles|lifetimeBatteryCyclesUsed|tripDurationMinutes|result|
+-----------------+--------------+------------------+-------------------------+-------------------+------+
|3OQE2DL51SU3H6SOA|           572|               200|        99.84783502151362|          40.333333|     0|
|NMGCY7X5J1PJ2UA94|           428|               200|        76.34864138288938|          19.233333|     0|
|F0MHIDSCCW3U5GYLM|            35|               200|        3.777722508852241|          13.666667|     0|
|C1L2DIPNXT6NMTY0Y|           240|               200|        43.90140376332281|          34.566667|     0|
|HVL5A6XYMSSLTJOGX|           799|               200|       134.34802319883033|          25.216667|     0|
|E0IFFYJHBQ9KGBL26|            40|               200|        7.037303141905602|          29.833333|     0|
|GN363ZCEXYXJDESOH|           578|   

# Write scored maintenance predictions back to the Cosmos DB maintenance container

In [None]:
# Retrieve connection string and key from linked service
import sys
import re

from pyspark.sql import SparkSession

# NOTE: despite its name, `sc` is a SparkSession (not a SparkContext). The name is kept so
# that anything else relying on it keeps working.
sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

connection_string = token_library.getConnectionString(synapse_cosmos_db_linked_service)
if not connection_string:
    raise ValueError(
        "Linked service '" + synapse_cosmos_db_linked_service + "' returned an empty connection string."
    )

# Expected layout: AccountEndpoint=<url>;Database=<name>;AccountKey="<key>";
matchObj = re.match( r'AccountEndpoint=(.*);Database=(.*);AccountKey="(.*)";', connection_string, re.M|re.I)
if matchObj is None:
    # Fail with a clear message instead of an AttributeError on None below.
    # The connection string is deliberately NOT included: it contains the account key.
    raise ValueError(
        "Connection string for linked service '" + synapse_cosmos_db_linked_service
        + "' is not in the expected 'AccountEndpoint=...;Database=...;AccountKey=\"...\";' format."
    )

# Group 2 (the database name) is not used; cosmos_db_database from the variables cell is used instead.
endpoint = matchObj.group(1)
masterkey = matchObj.group(3)

In [None]:
# Connector options for reading the maintenance container: account endpoint and key
# taken from the linked service, plus the database and container names.
read_config_maintenance = dict(
    Endpoint=endpoint,
    Masterkey=masterkey,
    Database=cosmos_db_database,
    Collection=cosmos_db_container_maintenance,
)

In [None]:
# Load whatever maintenance records the Cosmos DB container already holds (possibly none).

existing_maintenance_df = (
    spark.read
    .format("com.microsoft.azure.cosmosdb.spark")
    .options(**read_config_maintenance)
    .load()
)

In [None]:
# Show how many maintenance records already exist, then preview them.
existing_row_count = existing_maintenance_df.count()
print(existing_row_count)

existing_maintenance_df.show()

In [None]:
# If Cosmos DB already holds maintenance records, join them to the batch predictions on VIN.
# Each prediction then carries the Cosmos DB-assigned id of the document it belongs to, so
# the write can update that document instead of inserting a redundant one for the same VIN.
#
# The join is a LEFT join: a prediction whose VIN has no maintenance record yet is kept
# (an inner join would silently drop it) and gets a freshly generated id so that it is
# written as a new document. NOTE: uuid() is re-evaluated on every action, so the ids
# shown in a preview can differ from the ids actually written for these new rows.
#
# If there are no maintenance records at all, we do not join and do not pass an id field,
# which means Cosmos DB will auto-generate it and insert it with the new document.
from pyspark.sql import functions as F

# head(1) answers "is there at least one record?" without counting the whole container.
if existing_maintenance_df.head(1):
    # Only vin and id are needed from the existing documents.
    existing_ids_df = existing_maintenance_df.select("vin", "id")
    maintenance_records_to_write_df = scored_maintenance_df\
        .join(existing_ids_df, on="vin", how="left")\
        .select(*(scored_maintenance_df.columns + ["id"]))\
        .withColumn("id", F.coalesce(F.col("id"), F.expr("uuid()")))
else:
    maintenance_records_to_write_df = scored_maintenance_df

In [None]:
# Show how many records are about to be written, then preview them.
records_to_write_count = maintenance_records_to_write_df.count()
print(records_to_write_count)

maintenance_records_to_write_df.show()

In [None]:
# Connector options for writing to the maintenance container: the same account, database
# and container as the read, with the "Upsert" option set to "true".
write_config_maintenance = dict(
    Endpoint=endpoint,
    Masterkey=masterkey,
    Database=cosmos_db_database,
    Collection=cosmos_db_container_maintenance,
    Upsert="true",
)

In [None]:
# Write the prepared records to the Cosmos DB maintenance container using the write options
# defined earlier (which set "Upsert" to "true").
(
    maintenance_records_to_write_df.write
    .format("com.microsoft.azure.cosmosdb.spark")
    .mode("overwrite")
    .options(**write_config_maintenance)
    .save()
)