In [None]:
# Variables - notebook-wide configuration; fill these in before running the remaining cells.

# Storage account name for the Synapse WS storage account - starts with "synsa".
# Must be set to a non-empty value: it becomes the host part of the abfss:// URI
# that the final cell writes the Parquet output to.
storage_acct_name = ""
# Container name inside the storage account.
storage_container_name = "workspace"
# Path (relative to the container root) that receives the Parquet output.
storage_path_raw = "lab-data/raw/"

# Cosmos DB container that is read through the analytical store (passed as "spark.cosmos.container").
cosmos_db_container_metadata = "metadata"

# Synapse linked service pointing to Cosmos DB Analytical Store - this is where we get the source data
# (passed as "spark.synapse.linkedService" when reading).
synapse_cosmos_db_linked_service = "CosmosDbIoTLab"

StatementMeta(spark1, 30, 4, Submitted, Available)

In [None]:
# Load the whole metadata container from the Cosmos DB analytical store
# ("cosmos.olap") through the configured Synapse linked service.
vehicle_metadata_df = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", synapse_cosmos_db_linked_service)
    .option("spark.cosmos.container", cosmos_db_container_metadata)
    .load()
)

StatementMeta(spark1, 30, 5, Finished, Available)

In [14]:
# Sanity check on the raw load: total document count, then the inferred schema.
metadata_row_count = vehicle_metadata_df.count()
print(metadata_row_count)

vehicle_metadata_df.printSchema()

StatementMeta(spark1, 30, 16, Finished, Available)

173707
root
 |-- _rid: string (nullable = true)
 |-- _ts: long (nullable = true)
 |-- id: string (nullable = true)
 |-- _etag: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- entityType: string (nullable = true)
 |-- vin: string (nullable = true)
 |-- lastServiceDate: string (nullable = true)
 |-- batteryAgeDays: long (nullable = true)
 |-- batteryRatedCycles: long (nullable = true)
 |-- lifetimeBatteryCyclesUsed: double (nullable = true)
 |-- averageDailyTripDuration: double (nullable = true)
 |-- batteryFailurePredicted: boolean (nullable = true)
 |-- stateVehicleRegistered: string (nullable = true)
 |-- customer: string (nullable = true)
 |-- description: integer (nullable = true)
 |-- status: string (nullable = true)
 |-- deliveryDueDate: string (nullable = true)
 |-- packages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- timestamp: string (nullable = true)
 |-- tripId: string (nullable = true)
 |-- consignmentId: string (nu

In [15]:
# Register the raw DataFrame as the temp view "metadata" so the following
# cells can query it with spark.sql(...).
vehicle_metadata_df.createOrReplaceTempView("metadata")

StatementMeta(spark1, 30, 17, Finished, Available)



In [16]:
# Build the cleaned trips dataset from completed Trip documents in the "metadata" view.
#
# Output columns: vin, tripEnded, tripStarted (timestamps) and tripDurationMinutes
# (difference between the two, in minutes).
#
# tripStarted / tripEnded are parsed with to_timestamp(), which follows Spark's
# string-to-timestamp casting rules when no format is supplied. The earlier version
# passed a datetime pattern as the second argument of to_utc_timestamp(); that
# argument is a time-zone ID, not a format, so the pattern was never used for parsing
# and is rejected as an invalid zone on Spark versions that validate zone IDs.
# NOTE(review): assumes the source values are ISO-8601 strings that Spark can cast
# directly (the original relied on the same implicit cast) - confirm against the data.
trips_clean_df = spark.sql("""
    WITH completed_trips AS (
        SELECT  vin,
                to_timestamp(tripEnded)   AS tripEnded,
                to_timestamp(tripStarted) AS tripStarted
        FROM metadata
        WHERE entityType = 'Trip' AND status = 'Completed'
    )
    SELECT  vin,
            tripEnded,
            tripStarted,
            ((unix_timestamp(tripEnded) - unix_timestamp(tripStarted)) / 60.0) AS tripDurationMinutes
    FROM completed_trips
    """)

# Expose the result to later SQL cells under the name "trips_clean".
trips_clean_df.createOrReplaceTempView("trips_clean")

StatementMeta(spark1, 30, 18, Finished, Available)



In [17]:
# Sanity check: number of completed trips, then the schema of the cleaned set.
trips_row_count = trips_clean_df.count()
print(trips_row_count)

trips_clean_df.printSchema()

StatementMeta(spark1, 30, 19, Finished, Available)

110
root
 |-- vin: string (nullable = true)
 |-- tripEnded: timestamp (nullable = true)
 |-- tripStarted: timestamp (nullable = true)
 |-- tripDurationMinutes: decimal(27,6) (nullable = true)

In [18]:
# Keep only the battery-related attributes of Vehicle documents from the "metadata" view.
vehicles_query = """
    SELECT vin, batteryAgeDays, batteryRatedCycles, lifetimeBatteryCyclesUsed 
    FROM metadata 
    WHERE entityType ='Vehicle'
    """

vehicles_raw_df = spark.sql(vehicles_query)

# Expose the result to later SQL cells under the name "vehicles_raw".
vehicles_raw_df.createOrReplaceTempView("vehicles_raw")

StatementMeta(spark1, 30, 20, Finished, Available)



In [19]:
# Sanity check: number of vehicle documents, then the schema of the selection.
vehicles_row_count = vehicles_raw_df.count()
print(vehicles_row_count)

vehicles_raw_df.printSchema()

StatementMeta(spark1, 30, 21, Finished, Available)

1000
root
 |-- vin: string (nullable = true)
 |-- batteryAgeDays: long (nullable = true)
 |-- batteryRatedCycles: long (nullable = true)
 |-- lifetimeBatteryCyclesUsed: double (nullable = true)

In [20]:
# Join every completed trip to its vehicle's battery attributes (one row per trip);
# vehicles without a matching trip are dropped by the inner join.
batch_query = """
    SELECT  v.vin,
            v.batteryAgeDays, 
            v.batteryRatedCycles, 
            v.lifetimeBatteryCyclesUsed,
            t.tripDurationMinutes
    FROM    vehicles_raw v 
    INNER JOIN trips_clean t 
        ON v.vin = t.vin
    """

batch_df = spark.sql(batch_query)

StatementMeta(spark1, 30, 22, Finished, Available)



In [21]:
# Sanity check: row count of the joined dataset, then its schema.
batch_row_count = batch_df.count()
print(batch_row_count)

batch_df.printSchema()

StatementMeta(spark1, 30, 23, Finished, Available)

110
root
 |-- vin: string (nullable = true)
 |-- batteryAgeDays: long (nullable = true)
 |-- batteryRatedCycles: long (nullable = true)
 |-- lifetimeBatteryCyclesUsed: double (nullable = true)
 |-- tripDurationMinutes: decimal(27,6) (nullable = true)

In [22]:
# Preview rows of the joined dataset before it is written out.
batch_df.show()

StatementMeta(spark1, 30, 24, Finished, Available)

+-----------------+--------------+------------------+-------------------------+-------------------+
|              vin|batteryAgeDays|batteryRatedCycles|lifetimeBatteryCyclesUsed|tripDurationMinutes|
+-----------------+--------------+------------------+-------------------------+-------------------+
|T8DNDN5UDCWL7M72H|           112|               200|        12.56852760294626|          19.750000|
|0ZGVI20GIS84M1B4D|           664|               200|        73.11092063018555|          33.433333|
|9K3NPUOHFCGGMDO9G|           275|               200|       29.630602915264014|          20.816667|
|V5V483U7H0713ZAQ0|           814|               200|       103.54608638921663|          23.866667|
|ZUM234INX6MPOJ1D6|          1071|               200|       123.67412813528725|          36.950000|
|5FY0WL5EQPVD9LKYW|          1276|               200|       215.44704130502672|          36.250000|
|9ZMA2FR6LFGXAB4YJ|           361|               200|        56.91020845529577|          20.116667|


In [23]:
# Persist the joined batch dataset as Parquet in the workspace storage account.

# Fail fast with a clear message: an empty account name would otherwise produce a
# malformed URI ("abfss://workspace@.dfs.core.windows.net/...") and an obscure
# error at write time.
if not storage_acct_name:
    raise ValueError(
        "storage_acct_name is empty - set it in the Variables cell before writing the output."
    )

# Build the target URI from the configured container, account and path instead of
# hard-coding the container name.
sa_uri = (
    "abfss://" + storage_container_name + "@" + storage_acct_name
    + ".dfs.core.windows.net/" + storage_path_raw
)

# mode('overwrite') replaces whatever already exists at the target path.
batch_df.write.mode('overwrite').parquet(sa_uri)

StatementMeta(spark1, 30, 25, Finished, Available)

