In [0]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Create the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("Spark DataFrames").getOrCreate()

#define the schema
# (column name, Spark type) pairs in the order the columns are declared; every
# field is nullable. shippedDate is read as a plain string here and is only
# converted to a date in the transformation step further down.
order_fields = [
    ("orderID", IntegerType()),
    ("customerID", StringType()),
    ("employeeID", IntegerType()),
    ("orderDate", TimestampType()),
    ("requiredDate", TimestampType()),
    ("shippedDate", StringType()),
    ("shipVia", IntegerType()),
    ("freight", DoubleType()),
    ("shipName", StringType()),
    ("shipAddress", StringType()),
    ("shipCity", StringType()),
    ("shipRegion", StringType()),
    ("shipPostalCode", StringType()),
    ("shipCountry", StringType()),
]
orderschema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in order_fields]
)

# Read the orders CSV with the header option enabled and the explicit schema above.
df_order = (
    spark.read
    .schema(orderschema)
    .option("header", True)
    .csv("/Volumes/dbckwork/aibiwork/datafiles/csv/orders.csv")
)
#display(df_order.limit(5))

#perform the data validations
# A row is treated as misaligned when its shipCountry value contains a digit.
# For those rows each ship* value is taken from the column one position to the
# right of where it would normally be read.
# NOTE(review): presumably the shift comes from an unquoted comma inside
# shipAddress in the CSV - confirm against the source file.
is_shifted = col("shipCountry").rlike("[0-9]")


def _realign(shifted_value, regular_value):
    """Pick a value depending on whether the row is misaligned.

    Args:
        shifted_value: Column (or literal) to use when ``is_shifted`` is true.
        regular_value: Column to use for every other row, including rows where
            shipCountry is null (a null predicate falls through to otherwise).

    Returns:
        A Column expression: ``when(is_shifted, shifted_value).otherwise(regular_value)``.
    """
    return when(is_shifted, shifted_value).otherwise(regular_value)


df_order_transform = df_order.select(
    col("orderID"),
    col("customerID"),
    col("employeeID"),
    # try_cast yields null instead of failing when a value cannot be converted to a date.
    col("orderDate").try_cast("date").alias("orderDate"),
    col("requiredDate").try_cast("date").alias("requiredDate"),
    col("shippedDate").try_cast("date").alias("shippedDate"),
    col("shipVia"),
    col("freight"),
    col("shipName"),
    # Misaligned rows: the address is shipAddress followed directly by shipCity.
    # NOTE(review): concat adds no separator and returns null if either part is
    # null - confirm this is the intended way to rebuild the address.
    _realign(concat(col("shipAddress"), col("shipCity")), col("shipAddress")).alias("shipAddress_T"),
    _realign(col("shipRegion"), col("shipCity")).alias("shipCity_t"),
    _realign(col("shipPostalCode"), col("shipRegion")).alias("shipRegion_t"),
    # Strip every hyphen and space from the postal code in a single pass.
    regexp_replace(
        _realign(col("shipCountry"), col("shipPostalCode")), "[- ]", ""
    ).alias("shipPostalCode_t"),
    # NOTE(review): every misaligned row is labelled "Brazil"; this is only correct
    # if all affected rows really are Brazilian orders - verify against the data.
    _realign("Brazil", col("shipCountry")).alias("shipCountry_t"),
)
display(df_order_transform)

#the transformed output is saved to a table in the next cell

#check for groups Aggregations by Country
#df_grp=df_order_transform.groupBy("shipcountry_t").agg(count("*").alias("count"),
#                                 max("freight").alias("max_freight"),
#                                 min("freight").alias("min_freight"),
#                                 avg("freight").alias("avg_freight"),
#                                 sum("freight").alias("sum_freight")
#                                 ).orderBy("count",ascending=False)
#display(df_grp)
                                   

Databricks data profile. Run in Databricks to view.

In [0]:
#save the dataframe output to table
# Columns written under the same name they have in df_order_transform.
passthrough_columns = [
    "orderID",
    "customerID",
    "employeeID",
    "orderDate",
    "requiredDate",
    "shippedDate",
    "shipVia",
    "freight",
    "shipName",
]

# (name in df_order_transform, name in the target table) for the columns whose
# suffix is dropped on the way out.
renamed_columns = [
    ("shipAddress_T", "shipAddress"),
    ("shipCity_t", "shipCity"),
    ("shipRegion_t", "shipRegion"),
    ("shipPostalCode_t", "shipPostalCode"),
    ("shipCountry_t", "shipCountry"),
]

orders_output = df_order_transform.select(
    *[col(column_name) for column_name in passthrough_columns],
    *[col(source_name).alias(target_name) for source_name, target_name in renamed_columns],
)

# Overwrite mode replaces whatever the target table currently holds.
orders_output.write.mode("overwrite").saveAsTable("dbckwork.dwh_sql.orders")