In [None]:
# Location of the NYC TLC "yellow" dataset in Azure blob storage.
blob_account_name = "azureopendatastorage"
blob_container_name = "nyctlc"
blob_relative_path = "yellow"
# NOTE(review): token value kept exactly as found; confirm it is the intended placeholder.
blob_sas_token = "r"

# Both the wasbs:// URL and the SAS configuration key are built on the same
# storage-account host name, so derive it once.
account_host = f'{blob_account_name}.blob.core.windows.net'
wasbs_path = f'wasbs://{blob_container_name}@{account_host}/{blob_relative_path}'
sas_conf_key = f'fs.azure.sas.{blob_container_name}.{account_host}'

# Hand the SAS token to Spark so it can read the container remotely.
spark.conf.set(sas_conf_key, blob_sas_token)
print(f'Remote blob path: {wasbs_path}')

# Point a DataFrame at the parquet files; no data is loaded at this stage.
df = spark.read.parquet(wasbs_path)
print('Register the DataFrame as a SQL temporary view: nyc_tlc')
df.createOrReplaceTempView('nyc_tlc')

# Turn on the Databricks IO cache so repeated full-dataset queries avoid re-reading.
spark.conf.set("spark.databricks.io.cache.enabled", "true")

# Preview the first 10 rows of the view.
print('Displaying top 10 rows: ')
top_rows = spark.sql('SELECT * FROM nyc_tlc LIMIT 10')
display(top_rows)

In [None]:
# Columns of interest for the analysis; preview the first 10 rows of each.
preview_columns = [
    'vendorID',
    'paymentType',
    'puYear',
    'puMonth',
    'passengerCount',
    'fareAmount',
    'improvementSurcharge',
    'extra',
    'mtaTax',
    'tollsAmount',
    'tipAmount',
    'totalAmount',
]

# Assemble the query text: one indented column per line, comma-separated.
select_list = ',\n'.join('    ' + column for column in preview_columns)
sql = '\nSELECT \n' + select_list + '\nFROM nyc_tlc\nLIMIT 10;\n'

# Run the query against the nyc_tlc view and render the result.
preview_df = spark.sql(sql)
display(preview_df)

In [None]:
# Calculate mean and median costs, prices, and passenger counts,
# aggregated by normalized payment type, pickup year, and pickup month.
#
# Payment-type normalization uses a searched CASE with IN lists. The earlier
# form, `CASE UPPER(paymentType) WHEN 'CREDIT' || 'CRE' || '1' THEN ...`, did
# not express "any of these values": in Spark SQL `||` concatenates strings,
# so each branch compared against one literal such as 'CREDITCRE1' and every
# row fell through to 'UNKNOWN'.
# Each IN list also accepts the canonical output code itself, so rows that
# already carry the normalized label keep it.
#
# GROUP BY / ORDER BY use select-list ordinals: 1 = normalized payment type,
# 2 = Year, 3 = Month. Rows are therefore ordered by month, then year, then
# payment type.
sql = '''
SELECT
    CASE
        WHEN UPPER(paymentType) IN ('CREDIT', 'CRE', 'CRD', '1') THEN 'CRD'
        WHEN UPPER(paymentType) IN ('CASH', 'CAS', 'CSH', '2') THEN 'CSH'
        WHEN UPPER(paymentType) IN ('NO CHARGE', 'NO', 'NOC', '3') THEN 'NOC'
        WHEN UPPER(paymentType) IN ('DISPUTE', 'DIS', '4') THEN 'DIS'
        WHEN UPPER(paymentType) IN ('VOIDED TRIP', 'VOID', '6') THEN 'VOID'
        ELSE 'UNKNOWN'  -- also covers NULL payment types
      END AS paymentType,
    puYear AS Year,
    puMonth AS Month,
    concat(string(puMonth),'/', string(puYear)) AS month_year,
    AVG(passengerCount) AS avg_passenger_count,
    MEDIAN(passengerCount) AS median_passenger_count,
    AVG(fareAmount) AS avg_fareAmount,
    MEDIAN(fareAmount) AS median_fareAmount,
    AVG(improvementSurcharge) AS avg_improvementSurcharge,
    MEDIAN(improvementSurcharge) AS median_improvementSurcharge,
    AVG(extra) AS avg_extra,
    MEDIAN(extra) AS median_extra,
    AVG(mtaTax) AS avg_mtaTax,
    MEDIAN(mtaTax) AS median_mtaTax,
    AVG(tollsAmount) AS avg_tollsAmount,
    MEDIAN(tollsAmount) AS median_tollsAmount,
    AVG(tipAmount) AS avg_tipAmount,
    MEDIAN(tipAmount) AS median_tipAmount,
    AVG(totalAmount) AS avg_totalAmount,
    MEDIAN(totalAmount) AS median_totalAmount
FROM nyc_tlc
WHERE puYear > 2008  -- few years of junk prior to 2009
GROUP BY 1,2,3
ORDER BY 3,2,1;
'''

# Display the result of the SQL query
display(spark.sql(sql))
