In [24]:
# Azure storage access info
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "green"
blob_sas_token = r""

# Allow SPARK to read from Blob remotely
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (blob_container_name, blob_account_name, blob_relative_path)
spark.conf.set(
  'fs.azure.sas.%s.%s.blob.core.windows.net' % (blob_container_name, blob_account_name),
  blob_sas_token)
print('Remote blob path: ' + wasbs_path)

# SPARK read parquet, note that it won't load any data yet by now
df = spark.read.parquet(wasbs_path)
print('Register the DataFrame as a SQL temporary view: source')
df.createOrReplaceTempView('source')

# Display the schema of the DataFrame
df.printSchema

StatementMeta(, c31f16b3-4830-4e36-87c6-45a42cf958ea, 27, Finished, Available)

Remote blob path: wasbs://nyctlc@azureopendatastorage.blob.core.windows.net/green
Register the DataFrame as a SQL temporary view: source
Displaying top 10 rows: 


<bound method DataFrame.printSchema of DataFrame[vendorID: int, lpepPickupDatetime: timestamp, lpepDropoffDatetime: timestamp, passengerCount: int, tripDistance: double, puLocationId: string, doLocationId: string, pickupLongitude: double, pickupLatitude: double, dropoffLongitude: double, dropoffLatitude: double, rateCodeID: int, storeAndFwdFlag: string, paymentType: int, fareAmount: double, extra: double, mtaTax: double, improvementSurcharge: string, tipAmount: double, tollsAmount: double, ehailFee: double, totalAmount: double, tripType: int, puYear: int, puMonth: int]>

In [29]:
# The targets where this data has to be written
import trident_token_library_wrapper
kustoUri = "https://<cluster>.kusto.data.microsoft.com"
database="Stocks"
table="GreenTaxiData"

StatementMeta(, c31f16b3-4830-4e36-87c6-45a42cf958ea, 32, Finished, Available)

In [31]:
#This is an example of writing data to Kusto. The source data is read as a blob into a dataframe from Azure Open Data for GreenTaxi / Limousines in NYC.
# The access token in the current case is the user's credential that is used to write the data to the GreenTaxiData table in Kusto.
df.write.format("com.microsoft.kusto.spark.synapse.datasource").\
option("kustoCluster",kustoUri).\
option("kustoDatabase",database).\
option("kustoTable", table).\
option("accessToken", trident_token_library_wrapper.PyTridentTokenLibrary.get_access_token(kustoUri)).\
option("tableCreateOptions", "CreateIfNotExist").mode("Append").save()

StatementMeta(, c31f16b3-4830-4e36-87c6-45a42cf958ea, 34, Finished, Available)

In [34]:
#This is an example of Reading data from Kusto. Here the query retrieves the max,min fares and distances that the taxi recorded every month from the years 2014 to 2020
kustoQuery = "GreenTaxiData |  where puYear between (2014 .. 2020 ) | summarize  MaxDistance=max(tripDistance) , MaxFare = max(fareAmount) ,MinDistance=min(tripDistance) , MinFare = min(fareAmount) by puYear,puMonth | order by puYear,puMonth desc"
kustoDf  = spark.read\
            .format("com.microsoft.kusto.spark.synapse.datasource")\
            .option("accessToken", trident_token_library_wrapper.PyTridentTokenLibrary.get_access_token(kustoUri))\
            .option("kustoCluster", kustoUri)\
            .option("kustoDatabase", database) \
            .option("kustoQuery", kustoQuery).load()

StatementMeta(, c31f16b3-4830-4e36-87c6-45a42cf958ea, 37, Finished, Available)

In [35]:
kustoDf.show()

StatementMeta(, c31f16b3-4830-4e36-87c6-45a42cf958ea, 38, Finished, Available)

+------+-------+-----------+--------+
|puYear|puMonth|MaxDistance| MaxFare|
+------+-------+-----------+--------+
|  2020|     11|        3.8|    12.5|
|  2019|      8|        0.0|     1.0|
|  2019|      7|       7.73|    25.5|
|  2019|      6|      202.1|  2109.0|
|  2019|      5|     117.23|  1540.5|
|  2019|      4|     101.53|  2438.5|
|  2019|      3|      603.8|  944.98|
|  2019|      2|      666.6|  4011.5|
|  2019|      1|     117.99|   400.0|
|  2018|     12|     105.13|   500.0|
|  2018|     11|     130.47|   584.5|
|  2018|     10|      87.13|   900.0|
|  2018|      9|    8005.68|  8011.5|
|  2018|      8|     151.13|  2872.0|
|  2018|      7|     159.01|  2113.0|
|  2018|      6|      143.1|  2703.0|
|  2018|      5|      621.1| 2126.69|
|  2018|      4|      482.8|10445.84|
|  2018|      3|       87.4|  1234.5|
|  2018|      2|     120.47|  2624.5|
+------+-------+-----------+--------+
only showing top 20 rows

