In [None]:
# Notebook configuration - edit these values before running the remaining cells.

# --- Source: Cosmos DB ---
# Name of the Synapse linked service that points at the Cosmos DB Analytical Store
# (the source data is read through it).
synapse_cosmos_db_linked_service = "CosmosDbIoTLab"
# Cosmos DB container that is read.
cosmos_db_container_metadata = "metadata"

# --- Destination: Synapse workspace storage account ---
# Storage account name of the Synapse workspace - it starts with "synsa".
# Left blank on purpose; fill it in for your environment.
storage_acct_name = ""
# Container and relative folder used for the raw output.
storage_container_name = "workspace"
storage_path_raw = "lab-data/raw/"

In [None]:
# Load the configured Cosmos DB container through the Synapse linked service,
# using the "cosmos.olap" data source format.
vehicle_metadata_df = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", synapse_cosmos_db_linked_service)
    .option("spark.cosmos.container", cosmos_db_container_metadata)
    .load()
)

In [None]:
# Sanity check: number of rows loaded, followed by the DataFrame's schema.
metadata_row_count = vehicle_metadata_df.count()
print(metadata_row_count)

vehicle_metadata_df.printSchema()

In [None]:
# Expose the loaded DataFrame to Spark SQL as the temp view "metadata";
# the SQL cells below select from this view.
vehicle_metadata_df.createOrReplaceTempView("metadata")

In [None]:
# Build one row per completed trip with parsed start/end timestamps and the
# trip duration in minutes.
#
# NOTE: the second argument of to_utc_timestamp() is a time-zone ID, not a
# datetime format pattern. The pattern that used to be passed here is not a
# valid zone ID; depending on the Spark version it is either silently treated
# as GMT or rejected. Passing 'UTC' makes the intent explicit: the values are
# cast to timestamps and no zone shift is applied.
# Assumes tripStarted/tripEnded are strings Spark can cast to a timestamp
# (e.g. ISO-8601) - TODO confirm against the source data.
#
# The timestamps are parsed once in a CTE and reused for the duration
# calculation instead of repeating the conversion in every expression.
trips_clean_df = spark.sql("""
    WITH trips_parsed AS (
        SELECT  vin,
                to_utc_timestamp(tripEnded, 'UTC') AS tripEnded,
                to_utc_timestamp(tripStarted, 'UTC') AS tripStarted
        FROM metadata
        WHERE entityType = 'Trip' AND status = 'Completed'
    )
    SELECT  vin,
            tripEnded,
            tripStarted,
            ((unix_timestamp(tripEnded) - unix_timestamp(tripStarted))/60.0) AS tripDurationMinutes
    FROM trips_parsed
    """)

# Register the result so later SQL cells can join against it.
trips_clean_df.createOrReplaceTempView("trips_clean")

In [None]:
# Sanity check: number of trip rows selected, followed by the resulting schema.
trips_clean_row_count = trips_clean_df.count()
print(trips_clean_row_count)

trips_clean_df.printSchema()

In [None]:
# Select the battery-related columns for every row whose entityType is 'Vehicle'.
vehicles_raw_query = """
    SELECT vin, batteryAgeDays, batteryRatedCycles, lifetimeBatteryCyclesUsed 
    FROM metadata 
    WHERE entityType ='Vehicle'
    """
vehicles_raw_df = spark.sql(vehicles_raw_query)

# Register the result as the temp view "vehicles_raw" for use in later SQL.
vehicles_raw_df.createOrReplaceTempView("vehicles_raw")

In [None]:
# Sanity check: number of vehicle rows selected, followed by the resulting schema.
vehicles_raw_row_count = vehicles_raw_df.count()
print(vehicles_raw_row_count)

vehicles_raw_df.printSchema()

In [None]:
# Inner-join vehicles to trips on vin: each output row carries the vehicle's
# battery columns plus the duration of one matching trip.
batch_query = """
    SELECT  v.vin,
            v.batteryAgeDays, 
            v.batteryRatedCycles, 
            v.lifetimeBatteryCyclesUsed,
            t.tripDurationMinutes
    FROM    vehicles_raw v 
    INNER JOIN trips_clean t 
        ON v.vin = t.vin
    """
batch_df = spark.sql(batch_query)

In [None]:
# Sanity check: number of joined rows, followed by the resulting schema.
batch_row_count = batch_df.count()
print(batch_row_count)

batch_df.printSchema()

In [None]:
# Preview rows of the joined result before it is written out.
batch_df.show()

In [None]:
# Write the joined batch data as Parquet to the workspace storage account.

# Fail fast with a clear message: an empty account name would otherwise yield a
# malformed URI ("abfss://<container>@.dfs.core.windows.net/...") and an
# obscure storage error at write time.
if not storage_acct_name:
    raise ValueError(
        "storage_acct_name is empty - set it in the variables cell before writing."
    )

# Build the target URI from the configured container, account and path
# (previously the container was hard-coded and storage_container_name was ignored).
sa_uri = f"abfss://{storage_container_name}@{storage_acct_name}.dfs.core.windows.net/{storage_path_raw}"

# 'overwrite' replaces any existing data at the destination path.
batch_df.write.mode('overwrite').parquet(sa_uri)