# Reading Avro Blobs Into Parquet Data Sets

In [7]:
import os
import string
import json
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession, SQLContext, Window
from pyspark.sql.functions import udf, mean, lit, stddev, col, expr, when, date_sub, avg, window
from pyspark.sql.types import DoubleType, ArrayType, ShortType, LongType, IntegerType, TimestampType, StructType, StringType, StructField
from pyspark.storagelevel import StorageLevel
from azureml.api.schema.dataTypes import DataTypes
from azureml.api.schema.sampleDefinition import SampleDefinition
from azureml.api.realtime.services import generate_schema

STORAGE_ACCOUNT_SUFFIX = 'core.windows.net'
TELEMETRY_STORAGE_ACCOUNT_NAME = "stguwsjc4vojbrbw"
TELEMETRY_STORAGE_ACCOUNT_KEY = "2GGDWVBtGBy+hAgl5a1uGT4NeU2zzFdocuDFKnOwR2vc5wEOP7jTfbS3/Nl5vvzEudJ4nfH6ozmSOSPXo3xETA=="
TELEMETRY_CONTAINER_NAME = "telemetry"
LOGS_ARCHIVE_CONTAINER_NAME = 'logs-archive'
STAGING_STORAGE_ACCOUNT_NAME = "stguwsjc4vojbrbw"
STAGING_STORAGE_ACCOUNT_KEY = "2GGDWVBtGBy+hAgl5a1uGT4NeU2zzFdocuDFKnOwR2vc5wEOP7jTfbS3/Nl5vvzEudJ4nfH6ozmSOSPXo3xETA=="


In [8]:
from pathlib import Path
data_dir = str(Path.home()) + '/data'

% rm -rf $data_dir
% mkdir $data_dir $data_dir/logs $data_dir/telemetry

In [9]:
wasbTelemetryUrl = "wasb://{0}@{1}.blob.{2}/*/*/*/*/*/*/*".format(TELEMETRY_CONTAINER_NAME, 
                                                                  TELEMETRY_STORAGE_ACCOUNT_NAME, 
                                                                  STORAGE_ACCOUNT_SUFFIX)
sc = SparkSession.builder.getOrCreate()
hc = sc._jsc.hadoopConfiguration()
hc.set("avro.mapred.ignore.inputs.without.extension", "false")
if TELEMETRY_STORAGE_ACCOUNT_KEY:
     hc.set("fs.azure.account.key.{}.blob.core.windows.net".format(TELEMETRY_STORAGE_ACCOUNT_NAME), TELEMETRY_STORAGE_ACCOUNT_KEY)
hc.set("fs.azure.account.key.{}.blob.core.windows.net"
    .format(STAGING_STORAGE_ACCOUNT_NAME), STAGING_STORAGE_ACCOUNT_KEY)
sql = SQLContext.getOrCreate(sc)
avroblob = sql.read.format("com.databricks.spark.avro").load(wasbTelemetryUrl)
avroblob.show()

+--------------------+----------+--------------------+--------------------+
|     EnqueuedTimeUtc|Properties|    SystemProperties|                Body|
+--------------------+----------+--------------------+--------------------+
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T16:41:...|     Map()|Map(connectionAut...|[7B 22 6D 61 63 6...|
|2018-06-20T

In [10]:
#Convert byteformat to string format in pyspark dataframe
from json import loads as Loads
column = avroblob['Body']
string_udf = udf(lambda x: x.decode("utf-8"))
avroblob=avroblob.withColumn("BodyString", string_udf(column))
avroblob.printSchema()

root
 |-- EnqueuedTimeUtc: string (nullable = true)
 |-- Properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- SystemProperties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- Body: binary (nullable = true)
 |-- BodyString: string (nullable = true)



In [11]:
json_udf = udf(lambda x: Loads(x))
avroblob=avroblob.withColumn("BodyJson", json_udf(avroblob["BodyString"]))
avroblob.head()

Row(EnqueuedTimeUtc='2018-06-20T16:41:39.0980000Z', Properties={}, SystemProperties={'connectionDeviceGenerationId': '636650280485038545', 'connectionDeviceId': 'Machine-004', 'connectionAuthMethod': '{"scope":"device","type":"sas","issuer":"iothub","acceptingIpFilterRule":null}', 'enqueuedTime': '2018-06-20T16:41:39.0980000Z'}, Body=bytearray(b'{"machineID": "Machine-004", "timestamp": "2018-06-20T16:41:39.086812", "speed_desired": 1000, "ambient_temperature": 20.06, "ambient_pressure": 100.92, "speed": 1133.27, "temperature": 174.62, "pressure": 655.58, "vibration": null}'), BodyString='{"machineID": "Machine-004", "timestamp": "2018-06-20T16:41:39.086812", "speed_desired": 1000, "ambient_temperature": 20.06, "ambient_pressure": 100.92, "speed": 1133.27, "temperature": 174.62, "pressure": 655.58, "vibration": null}', BodyJson='{speed_desired=1000, ambient_temperature=20.06, machineID=Machine-004, ambient_pressure=100.92, temperature=174.62, pressure=655.58, vibration=null, speed=1133

In [None]:
input_columns = [
    ("ambient_temperature", DoubleType()),
    ("ambient_pressure", DoubleType()),
    ("speed", DoubleType()),
    ("temperature", DoubleType()),
    ("pressure", DoubleType()),
    ("vibration", ArrayType(ShortType()))]

def extract_value(c):
    """Returns a PySpark UDF that un-pickles
    the payload, then extracts a value by its key
    performing a type conversion if needed.
    """
    def f(x):
        v = pickle.loads(x)[c[0]]
        if isinstance(v, np.ndarray):
            v = v.tolist()
        if isinstance(c[1], DoubleType):            
            v = float(v)
        return v
    return udf(f, c[1])

telemetry_df = (telemetry_df
      .withColumn('machineID', telemetry_df.SystemProperties['connectionDeviceId'])
      .withColumn("timestamp", telemetry_df['EnqueuedTimeUtc'].cast(TimestampType())))

# Note: The reduce function provides a concise way of adding many columns to the dataframe at once.
# We will be using this pattern a couple of times throughout this notebook.
telemetry_df = (reduce(lambda _df, ic: _df.withColumn(ic[0], extract_value(ic)(telemetry_df.Body)), input_columns, telemetry_df)
      .drop('Properties', 'SystemProperties', 'Body', 'EnqueuedTimeUtc'))

In [160]:
telemetry_df.write.parquet('{0}/telemetry/telemetry.parquet'.format(data_dir), mode='overwrite')

AttributeError: 'DataFrame' object has no attribute 'write'