In [0]:
# Build a small RDD of three-field employee tuples and convert it to a DataFrame.
# No column names are passed to toDF() here; they are assigned in a later cell.
employees = sc.parallelize([
    (1, "John", 10000),
    (2, "Fred", 20000),
    (3, "Anna", 30000),
    (4, "James", 40000),
    (5, "Mohit", 50000),
])

employeesDF = employees.toDF()

In [0]:
# Print the DataFrame rows to the cell output as a plain-text table
employeesDF.show()

In [0]:
# Rename the three columns to id, name and salary; toDF(...) returns a new DataFrame,
# which is bound back to the same variable name.
employeesDF = employeesDF.toDF("id", "name", "salary")

# Show the rows again, now with the new column names
employeesDF.show()

In [0]:
# Render the DataFrame with the notebook's display() helper instead of plain-text show()
display(employeesDF)

In [0]:
# Keep only employees whose salary exceeds 20000 and whose id is 4.
# filter() is an alias of where(); the two conditions are applied one after the other.
newdf = employeesDF.filter("salary > 20000").filter("ID = 4")

display(newdf)

### Databricks Utilities

In [0]:
# Show the built-in help for Databricks Utilities (dbutils)
dbutils.help()

In [0]:
# Show the built-in help for the file system utilities (dbutils.fs)
dbutils.fs.help()

### Accessing Data Lake with Access Key

#### Step 1: Get Access Key of Data Lake
1. Go to Data Lake account
2. From left pane, go to Access Keys
3. Copy Key1

#### Step 2: Replace values
1. In the two cells below, replace the Data Lake name (it appears twice), the Access Key, and the Container Name with your own values

In [0]:
# Set the storage account access key in the Spark session configuration.
# Replace ***DataLakeName*** and ***Access Key*** with your own values before running.
# NOTE(review): pasting a real key into a notebook exposes it to every reader; consider
# loading it from a secret scope (e.g. dbutils.secrets.get) instead - confirm what is available.
spark.conf.set(
    "fs.azure.account.key.***DataLakeName***.dfs.core.windows.net",
  
    "***Access Key***")

In [0]:
# List the contents of the Raw folder in the container to confirm that access works.
# Replace ***ContainerName*** and ***DataLakeName*** with your own values.
display(dbutils.fs.ls("abfss://***ContainerName***@***DataLakeName***.dfs.core.windows.net/Raw/"))

## Working with DataFrames

In [0]:
# Full abfss:// path of the Green Taxi CSV file in the Raw folder.
# Replace ***ContainerName*** and ***DataLakeName*** to match the file location in your
# Data Lake, then use the head command to verify that the file is accessible.

greenTaxisFilePath = "abfss://***ContainerName***@***DataLakeName***.dfs.core.windows.net/Raw/GreenTaxis_201911.csv"

In [0]:
# Preview the beginning of the file to confirm that the path is correct and readable

dbutils.fs.head(greenTaxisFilePath)

In [0]:
# Load the CSV with no reader options set
greenTaxiDF = spark.read.csv(greenTaxisFilePath)

display(greenTaxiDF)

In [0]:
# Load the CSV again, this time treating the first line as the header row.
# header=True is the keyword form of .option("header", "true").
greenTaxiDF = spark.read.csv(greenTaxisFilePath, header=True)

display(greenTaxiDF)

In [0]:
# Load the CSV with a header row and let Spark infer each column's type.
# The keyword arguments are equivalent to the matching .option(...) calls.
greenTaxiDF = spark.read.csv(
    greenTaxisFilePath,
    header=True,
    inferSchema=True,
)

display(greenTaxiDF)

In [0]:
# Print the column names and types of the DataFrame
greenTaxiDF.printSchema()

## Applying Schemas

In [0]:
# Create schema for Green Taxi Data

from pyspark.sql.functions import *
from pyspark.sql.types import *
  
# Explicit schema for the Green Taxi CSV: one nullable field per column, in file order.
# The type classes come from the pyspark.sql.types wildcard import above.
greenTaxiSchema = StructType([
    # Trip identification and locations
    StructField("VendorId", IntegerType(), True),
    StructField("lpep_pickup_datetime", TimestampType(), True),
    StructField("lpep_dropoff_datetime", TimestampType(), True),
    StructField("store_and_fwd_flag", StringType(), True),
    StructField("RatecodeID", IntegerType(), True),
    StructField("PULocationID", IntegerType(), True),
    StructField("DOLocationID", IntegerType(), True),

    # Passenger, distance and base charges
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("extra", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),

    # Remaining charges, totals and trip classification
    StructField("tolls_amount", DoubleType(), True),
    StructField("ehail_fee", DoubleType(), True),
    StructField("improvement_surcharge", DoubleType(), True),
    StructField("total_amount", DoubleType(), True),
    StructField("payment_type", IntegerType(), True),
    StructField("trip_type", IntegerType(), True),
])

In [0]:
# Load the CSV with a header row, applying the explicit greenTaxiSchema instead of inferring types.
# The keyword arguments are equivalent to .schema(...) and .option("header", "true").
greenTaxiDF = spark.read.csv(
    greenTaxisFilePath,
    schema=greenTaxiSchema,
    header=True,
)

display(greenTaxiDF)

## Analyzing Data

In [0]:
# Summary statistics for the passenger count and trip distance columns
display(greenTaxiDF.describe("passenger_count", "trip_distance"))

## Cleaning Raw Data

In [0]:
# Row count before removing invalid trips
print("Before = " + str(greenTaxiDF.count()))

# Keep only trips that have at least one passenger and a positive distance.
# Both conditions are combined into one Column expression; a row must satisfy both to be kept.
greenTaxiDF = greenTaxiDF.filter(
    (col("passenger_count") > 0) & (col("trip_distance") > 0.0)
)

# Row count once the invalid trips are gone
print("After = " + str(greenTaxiDF.count()))

In [0]:
# Drop rows in which every column is null ('all'); rows with only some nulls are kept
greenTaxiDF = (
                  greenTaxiDF
                          .na.drop('all')
              )

In [0]:
# Default values used to replace nulls, keyed by column name.
# The keys use the exact casing declared in greenTaxiSchema ("RatecodeID", not "RateCodeID"),
# so the lookup also succeeds if Spark is configured for case-sensitive column resolution.
defaultValueMap = {'payment_type': 5, 'RatecodeID': 1}

# Replace nulls in the listed columns with their default values
greenTaxiDF = (
                  greenTaxiDF
                      .na.fill(defaultValueMap)
              )

In [0]:
# Remove rows that are exact duplicates of another row (all columns compared)
greenTaxiDF = greenTaxiDF.dropDuplicates()

In [0]:
# Keep only trips picked up on or after 2019-11-01 and dropped off before 2019-12-01
greenTaxiDF = (
                  greenTaxiDF
                          .where("lpep_pickup_datetime >= '2019-11-01' AND lpep_dropoff_datetime < '2019-12-01'")
              )

In [0]:
# Print the number of rows remaining after all cleanup steps.
# count() is a Spark action, so this runs the cleanup pipeline defined in the cells above.

print("Final count after cleanup = " + str(greenTaxiDF.count()))

In [0]:
# Show the cleaned DataFrame
display(greenTaxiDF)

## Applying Transformations

In [0]:
# Keep only the columns needed downstream, giving most of them PascalCase names.
# VendorID and RatecodeID are selected without an alias.
greenTaxiDF = greenTaxiDF.select(
    col("VendorID"),
    col("passenger_count").alias("PassengerCount"),
    col("trip_distance").alias("TripDistance"),
    col("lpep_pickup_datetime").alias("PickupTime"),
    col("lpep_dropoff_datetime").alias("DropTime"),
    col("PUlocationID").alias("PickupLocationId"),
    col("DOlocationID").alias("DropLocationId"),
    col("RatecodeID"),
    col("total_amount").alias("TotalAmount"),
    col("payment_type").alias("PaymentType"),
)

greenTaxiDF.printSchema()

In [0]:
# Derived column TripTimeInMinutes: seconds elapsed between PickupTime and DropTime,
# divided by 60 and rounded. `round` here is the pyspark.sql.functions version made
# available by the wildcard import in the schema cell, not Python's built-in.
greenTaxiDF = greenTaxiDF.withColumn(
    "TripTimeInMinutes",
    round((unix_timestamp(col("DropTime")) - unix_timestamp(col("PickupTime"))) / 60),
)

greenTaxiDF.printSchema()

In [0]:
# Derived column TripType: "SharedTrip" when RatecodeID equals 6, "SoloTrip" for every other row
greenTaxiDF = greenTaxiDF.withColumn(
    "TripType",
    when(col("RatecodeID") == 6, "SharedTrip").otherwise("SoloTrip"),
)

greenTaxiDF.printSchema()

In [0]:
# Split the pickup timestamp into separate year, month and day-of-month columns
pickupTimeColumn = col("PickupTime")
greenTaxiDF = (
    greenTaxiDF
    .withColumn("TripYear", year(pickupTimeColumn))
    .withColumn("TripMonth", month(pickupTimeColumn))
    .withColumn("TripDay", dayofmonth(pickupTimeColumn))
)

greenTaxiDF.printSchema()

In [0]:
# Show the DataFrame with the renamed and derived columns
display(greenTaxiDF)

In [0]:
# Total of TotalAmount per day of month, listed from the highest day number to the lowest.
# `sum` here is the pyspark.sql.functions aggregate made available by the wildcard import
# in the schema cell, not Python's built-in.
greenTaxiGroupedDF = (
    greenTaxiDF
    .groupBy("TripDay")
    .agg(sum("TotalAmount").alias("total"))
    .orderBy("TripDay", ascending=False)
)

display(greenTaxiGroupedDF)

## Joining with another dataset

In [0]:
# Full abfss:// path of the Taxi Zones CSV file in the Raw folder.
# Replace ***ContainerName*** and ***DataLakeName*** with your own values, then use the
# head command to verify that the file is accessible.

taxiZonesFilePath = "abfss://***ContainerName***@***DataLakeName***.dfs.core.windows.net/Raw/TaxiZones.csv"

In [0]:
# Preview the beginning of the Taxi Zones file to confirm that the path is correct and readable
dbutils.fs.head(taxiZonesFilePath)

In [0]:
# Load the Taxi Zones CSV with a header row and inferred column types.
# The keyword arguments are equivalent to the matching .option(...) calls.
taxiZonesDF = spark.read.csv(
    taxiZonesFilePath,
    header=True,
    inferSchema=True,
)

display(taxiZonesDF)

In [0]:
# Inner-join each trip (alias "g") to the taxi zone (alias "t") whose LocationId
# equals the trip's PickupLocationId. All columns from both sides are kept.
greenTaxiWithZonesDF = greenTaxiDF.alias("g").join(
    taxiZonesDF.alias("t"),
    on=col("g.PickupLocationId") == col("t.LocationId"),
    how="inner",
)

display(greenTaxiWithZonesDF)

### Exercise

In [0]:
# EXERCISE - Join greenTaxiWithZonesDF with TaxiZones on DropLocationId, then group by PickupZone and DropZone and compute the average of TotalAmount.

## Working with Spark SQL

In [0]:
# Register the DataFrame as a temp view named GreenTaxiTripData so it can be queried with SQL.
# createOrReplaceTempView overwrites any existing temp view with the same name.
greenTaxiDF.createOrReplaceTempView("GreenTaxiTripData")

In [0]:
# Query the temp view from Python: passenger count and pickup time of trips picked up at location 1
display(
  spark.sql("SELECT PassengerCount, PickupTime FROM GreenTaxiTripData WHERE PickupLocationID = 1")
)

In [0]:
%sql

-- Same query as the previous cell, written directly in a SQL cell
SELECT PassengerCount, PickupTime 
FROM GreenTaxiTripData 
WHERE PickupLocationID = 1

## Writing Output to Data Lake

In [0]:
# Base output path in the Output folder; later cells append a suffix such as ".csv" or ".parquet".
# Replace ***ContainerName*** and ***DataLakeName*** with your own values.

outputFilePath = "abfss://***ContainerName***@***DataLakeName***.dfs.core.windows.net/Output/GreenTaxis"

In [0]:
# Write the DataFrame as CSV (with a header row) to <outputFilePath>.csv, replacing any existing output.
# PickupTime and DropTime are timestamp columns, so their text format is controlled by
# "timestampFormat"; the previously used "dateFormat" only applies to date-typed columns
# and therefore had no effect on this DataFrame.
(
    greenTaxiDF   
        .write
        .option("header", "true")
        .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.S")
        .mode("overwrite")
        .csv(outputFilePath + ".csv")
)

In [0]:
# Write the DataFrame as Parquet to <outputFilePath>.parquet, replacing any existing output.
# The "header" and "dateFormat" options are CSV settings that the Parquet writer does not use,
# so they have been removed; the written data is unchanged.
(
    greenTaxiDF  
      .write
      .mode("overwrite")
      .parquet(outputFilePath + ".parquet")
)

### Reading JSON

In [0]:
# Full abfss:// path of the Payment Types JSON file in the Raw folder.
# Replace ***ContainerName*** and ***DataLakeName*** with your own values and verify
# that the file is accessible (for example with dbutils.fs.head) before reading it.

paymentTypesFilePath = "abfss://***ContainerName***@***DataLakeName***.dfs.core.windows.net/Raw/PaymentTypes.json"

In [0]:
# Load the Payment Types JSON file into a DataFrame using the default reader options
paymentTypes = spark.read.json(paymentTypesFilePath)

display(paymentTypes)

### Working with Spark SQL and Tables

In [0]:
%sql

-- Create the database used by the table examples below; does nothing if it already exists
CREATE DATABASE IF NOT EXISTS MyDatabase

In [0]:
# Save the DataFrame as a managed table (no explicit path is given),
# overwriting the table if it already exists.
greenTaxiDF.write.saveAsTable("MyDatabase.GreenTaxis", mode="overwrite")

In [0]:
%sql

-- Preview the first 10 rows of the saved table
SELECT *
FROM MyDatabase.GreenTaxis
LIMIT 10

In [0]:
%sql

-- Show the table's columns followed by its detailed metadata
DESCRIBE EXTENDED MyDatabase.GreenTaxis

In [0]:
%sql

-- Remove the table created above.
-- IF EXISTS keeps this cell re-runnable: a second run no longer fails once the table is gone.
DROP TABLE IF EXISTS MyDatabase.GreenTaxis

In [0]:
# Save the DataFrame as an unmanaged table: the explicit "path" option makes the table's
# files live at that location. The table is overwritten if it already exists.
greenTaxiDF.write.saveAsTable(
    "MyDatabase.GreenTaxis",
    mode="overwrite",
    path=outputFilePath + "1.parquet",  # no format is specified, so the default (parquet) applies
)

In [0]:
%sql

-- Show the table's columns followed by its detailed metadata
DESCRIBE EXTENDED MyDatabase.GreenTaxis

In [0]:
%sql

-- Remove the table created above.
-- IF EXISTS keeps this cell re-runnable: a second run no longer fails once the table is gone.
DROP TABLE IF EXISTS MyDatabase.GreenTaxis