diff --git a/avro_sample.py b/avro_sample.py index 32d4f27..3c69199 100644 --- a/avro_sample.py +++ b/avro_sample.py @@ -1,5 +1,7 @@ # Databricks notebook source +from pyspark.sql.types import * +from pyspark.sql.functions import * dbutils.widgets.text("year", "2019", "Year") dbutils.widgets.text("month", "03", "Month") dbutils.widgets.text("day", "01", "Day") @@ -12,8 +14,6 @@ # COMMAND ---------- -from pyspark.sql.functions import * -from pyspark.sql.types import * # COMMAND ---------- @@ -22,7 +22,8 @@ # COMMAND ---------- # Path format: partition/year/month/day/hour/minute/ -input_loc = "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/ehnrtanalytics/ehnrtanalytics-output/*/{}/{}/{}/*/*/".format(y, m, d) +input_loc = "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/ehnrtanalytics/ehnrtanalytics-output/*/{}/{}/{}/*/*/".format( + y, m, d) # COMMAND ---------- @@ -47,19 +48,23 @@ # COMMAND ---------- -df2.write.json('wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/', mode="overwrite") +df2.write.json( + 'wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/', mode="overwrite") # COMMAND ---------- -dbutils.fs.ls("wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/") +dbutils.fs.ls( + "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/") # COMMAND ---------- -dfjs = spark.read.json("wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/") +dfjs = spark.read.json( + "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/temp_out/") # COMMAND ---------- -schema = StructType([StructField("version", StringType()), StructField("userid", StringType()), StructField("platform", StringType())]) +schema = StructType([StructField("version", StringType()), StructField( + "userid", StringType()), StructField("platform", StringType())]) dfx = dfjs.select(from_json("Body", schema).alias("B")) # COMMAND ---------- @@ -80,9 +85,8 @@ # COMMAND ---------- -out_loc = "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/csv_out/{}/{}/{}".format(year, month, day) +out_loc = "wasbs://ehcapture-analytics@storagenrtanalytics.blob.core.windows.net/csv_out/{}/{}/{}".format( + year, month, day) dfx2.write.csv(out_loc, header=True, mode="overwrite") # COMMAND ---------- - -