In [None]:
import azureml
from azureml.core import Run
from azureml.core import Workspace
from azureml.core.model import Model
from azureml.core.run import Run
from azureml.core.experiment import Experiment

import scipy

# Print the installed versions of the key libraries so they are recorded in the notebook output.
# Azure ML SDK version history: https://pypi.org/project/azureml-sdk/#history
print("Azure ML SDK Version:", azureml.core.VERSION)
print("SciPy Version: ", scipy.__version__)

# Configure access to the Azure Machine Learning resources

## Configure Service Principal authentication following the instructions here: [Setup Service Principal Authentication](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?view=azure-ml-py#set-up-service-principal-authentication).

Use the JSON output from the commands in the above link to retrieve the values needed for `tenant_id`, `service_principal_id`, and `service_principal_password` in the next cell.

Note: if the Azure account you are using has access to multiple Azure subscriptions, **make sure you run CLI commands in the correct Azure subscription**. You can set the default subscription to the one you are using for the lab/demo with the Azure CLI command `az account set`.

Reference: https://docs.microsoft.com/cli/azure/account#az-account-set

# Variables

Provide values for the following variables which will be used throughout the rest of this notebook.

In [None]:
# ---------------------------------------------------------------------------
# Azure subscription / resource group
# ---------------------------------------------------------------------------
# Subscription ID of the Azure subscription used for the lab/demo
subscription_id = ""

# Name of the resource group in which the lab/demo resources are deployed
resource_group = ""

# ---------------------------------------------------------------------------
# Azure Machine Learning workspace
# ---------------------------------------------------------------------------
# Workspace name and Azure region, as shown on the workspace Overview page
# in your resource group
workspace_name = ""
workspace_region = "East US"

# ---------------------------------------------------------------------------
# Service principal credentials
# ---------------------------------------------------------------------------
# Values come from `Setup Service Principal Authentication` in the above cell.
# For reference, SP name you created (not needed in a variable): pz-ml-auth
# Do not share or commit this notebook with these values filled in.
tenant_id = ""  # "tenantId" value
service_principal_id = ""  # "clientId" value
service_principal_password = ""  # "clientSecret" value

# ---------------------------------------------------------------------------
# Pre-trained ML model
# ---------------------------------------------------------------------------
# TODO: Update for final release
# pkl_url = "https://github.com/AzureCosmosDB/scenario-based-labs/blob/master/IoT/deploy/modelv3.pkl?raw=true"
pkl_url = "https://github.com/plzm/scenario-based-labs/blob/iot-2020/IoT/deploy/modelv3.pkl?raw=true"
local_folder = "models"
# File the model is downloaded to, inside local_folder
local_path = "{}/modelv3.pkl".format(local_folder)
# Name under which the model is registered in the workspace
model_name = "batt-cycles-7"

# ---------------------------------------------------------------------------
# Cosmos DB
# ---------------------------------------------------------------------------
cosmos_db_database = "ContosoAuto"
cosmos_db_container_metadata = "metadata"
cosmos_db_container_maintenance = "maintenance"
# Same region as the Azure Machine Learning workspace
cosmos_db_region = workspace_region

# Name of the Synapse linked service that points at the Cosmos DB account
synapse_cosmos_db_linked_service = "CosmosDbIoTLab"


# Batch Scoring data
In this notebook, you will use a forecasting model to determine if the battery will need replacement within the next 30 days.

In [None]:
from azureml.core.authentication import ServicePrincipalAuthentication

# The credential variables default to empty placeholders. Fail fast, naming the
# variables that are still unset, instead of surfacing a less obvious
# authentication error later. Only variable names are reported, never values.
_missing_sp_values = [
    label
    for label, value in (
        ("tenant_id", tenant_id),
        ("service_principal_id", service_principal_id),
        ("service_principal_password", service_principal_password),
    )
    if not value
]
if _missing_sp_values:
    raise ValueError(
        "Provide a value for the following variable(s) before authenticating: "
        + ", ".join(_missing_sp_values)
    )

# Non-interactive credential object, passed as `auth` when accessing the workspace.
sp = ServicePrincipalAuthentication(
    tenant_id=tenant_id,
    service_principal_id=service_principal_id,
    service_principal_password=service_principal_password)

In [None]:
# Get a reference to the existing Azure Machine Learning workspace, authenticating
# with the service principal. Workspace.get only looks a workspace up; it does not
# create one.
from azureml.core import Workspace

ws = Workspace.get(
    name=workspace_name,
    auth=sp,
    subscription_id=subscription_id,
    # Restrict the lookup to the configured resource group so that a workspace with
    # the same name in another resource group cannot make the lookup ambiguous.
    # An empty value falls back to searching the whole subscription.
    resource_group=resource_group or None)

# Last expression of the cell: displays the workspace details
ws.get_details()

## Retrieve the pre-trained model
A pre-trained model has been made available at a public URL (see `pkl_url` in the Variables cell). Run the following cell to download the model and then register it as a model within your Azure Machine Learning workspace.

In [None]:
import os
import urllib.request
from azureml.core import Model

print("Downloading the pre-trained model...")
# Create the directory that local_path actually points into, rather than a
# hard-coded folder name, so the download still works if the path is changed.
os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)

# Fetch the model file from pkl_url and store it at local_path
urllib.request.urlretrieve(pkl_url, local_path)

print("Download complete.")

print("Uploading and registering model...")
# Upload the downloaded file and register it in the workspace under model_name
registered_model = Model.register(
    model_path=local_path,
    model_name=model_name,
    workspace=ws)

Run the following to retrieve the model from your Azure Machine Learning workspace, and inspect some of its properties.

In [None]:
from azureml.core.model import Model
try:
    # Import path originally used by this notebook
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn 0.23 removed sklearn.externals.joblib; use the standalone package
    import joblib
# NOTE(review): automl is not referenced by name below; presumably it is imported so
# that classes referenced by the serialized model can be resolved on load - confirm.
from azureml.train import automl

# Resolve the local path of the registered model
model_path = Model.get_model_path(model_name=model_name, _workspace=ws)
print("Model saved to ", model_path)
# joblib.load unpickles the file, which can execute arbitrary code:
# only load model files from a source you trust.
model = joblib.load(model_path)
print("Model loaded.")

## Load the data from Cosmos DB to batch score it
Run the following cells to query Cosmos DB Analytical store, prepare the data using SQL queries and then surface the data as temporary views.

### Register Temp View
Now we register the view required to create the dataset that will be used to make the predictions. Notice how you are now able to join data from multiple Cosmos DB containers.


In [None]:
# Load the metadata container from the Cosmos DB analytical store ("cosmos.olap"),
# reaching the account through the configured Synapse linked service.
vehicle_metadata_df = (
    spark.read
    .format("cosmos.olap")
    .option("spark.synapse.linkedService", synapse_cosmos_db_linked_service)
    .option("spark.cosmos.container", cosmos_db_container_metadata)
    .load()
)

In [None]:
# Print the number of rows read from the metadata container
print(vehicle_metadata_df.count())

# Print the column names and types of the DataFrame
vehicle_metadata_df.printSchema()

In [None]:
# Expose the DataFrame to Spark SQL as the temporary view "metadata"
vehicle_metadata_df.createOrReplaceTempView("metadata")

### Generate Scoring dataset
Now we are ready to use the previously created view to generate the final dataset.

In [None]:
# Completed trips only, with start/end converted to timestamps and the trip
# duration expressed in minutes.
#
# The second argument of to_utc_timestamp is a time-zone ID, not a datetime format
# pattern. The query previously passed a format pattern there; 'UTC' is passed
# instead so the values are interpreted as UTC and no offset is applied.
# NOTE(review): the earlier form appears to have relied on the pattern being
# silently treated as an unknown zone ID; Spark versions that validate zone IDs
# reject it - confirm against the Spark version of your pool.
trips_clean = spark.sql("""
    SELECT  vin,
            to_utc_timestamp(tripEnded, 'UTC') as tripEnded,
            to_utc_timestamp(tripStarted, 'UTC') as tripStarted,
            ((unix_timestamp(to_utc_timestamp(tripEnded, 'UTC')) -
                unix_timestamp(to_utc_timestamp(tripStarted, 'UTC')))/60.0) as tripDurationMinutes
    FROM metadata
    WHERE entityType = 'Trip' AND status = 'Completed'
    """)

# Register the result as a temp view so later SQL can refer to it by name
trips_clean.createOrReplaceTempView("trips_clean")
# Row count and schema, printed for inspection
print(trips_clean.count())
trips_clean.printSchema()

In [None]:
# Select the battery-related columns of every row in the metadata view whose
# entityType is 'Vehicle'.
vehicles_raw = spark.sql("""
    SELECT vin, batteryAgeDays, batteryRatedCycles, lifetimeBatteryCyclesUsed 
    FROM metadata 
    WHERE entityType ='Vehicle'
    """)

# Register the result as a temp view so later SQL can refer to it by name
vehicles_raw.createOrReplaceTempView("vehicles_raw")
# Row count and schema, printed for inspection
print(vehicles_raw.count())
vehicles_raw.printSchema()

In [None]:
# Build the scoring dataset: inner-join vehicles to their trips on vin, which
# yields one row per matching (vehicle, trip) pair. tripEnded is reduced to a date.
vehicles_batch = spark.sql("""
    SELECT  v.vin as vin, 
            to_date(t.tripEnded, 'yyyy-MM-dd') as tripEnded, 
            t.tripDurationMinutes, 
            v.batteryAgeDays, 
            v.batteryRatedCycles, 
            v.lifetimeBatteryCyclesUsed 
    FROM    vehicles_raw v 
    INNER JOIN trips_clean t 
        ON v.vin = t.vin
    """)

# Register the result as a temp view so later SQL can refer to it by name
vehicles_batch.createOrReplaceTempView("vehicles_batch")
# Row count and schema, printed for inspection
print(vehicles_batch.count())
vehicles_batch.printSchema()

In [None]:
# Print the first rows of the scoring dataset for a visual check
vehicles_batch.show()

Run the following cells to convert the Spark DataFrame to a Pandas DataFrame for use with the pre-created model.

In [None]:
import pandas as pd

# Project the scoring view onto the column names used by the scoring helper.
# tripEnded is cast to a string here and parsed back into a datetime on the
# pandas side (see below).
spark_df = spark.sql("\
    SELECT\
        vin,\
        cast(tripEnded as string) as date,\
        tripDurationMinutes as daily_Trip_Duration,\
        batteryAgeDays as battery_Age_Days,\
        batteryRatedCycles,\
        lifetimeBatteryCyclesUsed\
    FROM vehicles_batch v")

# Collect to pandas, then convert the textual date column to pandas datetimes
# (addresses the Spark Date to Pandas date conversion).
pd_df = spark_df.toPandas().assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

## Define the scoring logic
The following cell defines a helper function that takes one row of the scoring dataset and returns a prediction (1 or 0) for whether or not maintenance is required.

Run the following cell to define the helper function.

In [None]:
def predict_maintenance(row, horizon_days=30):
    """Predict whether a battery will need service within the projection horizon.

    Uses a simple arithmetic projection: the average number of battery cycles
    consumed per day so far is assumed to continue for ``horizon_days`` more
    days, and the projected lifetime total is compared with the rated cycles.

    Args:
        row: Mapping-like record (for example a pandas Series or a dict) providing
            "battery_Age_Days", "lifetimeBatteryCyclesUsed" and
            "batteryRatedCycles". Other keys, such as "daily_Trip_Duration",
            are not used by this projection.
        horizon_days: Number of days to project forward. Defaults to 30.

    Returns:
        1 if the projected cycle count exceeds the rated lifetime cycles,
        otherwise 0.
    """
    battery_age_days = row["battery_Age_Days"]
    current_cycles = row["lifetimeBatteryCyclesUsed"]
    rated_lifetime_cycles = row["batteryRatedCycles"]

    # A battery with no recorded age has no usage history to extrapolate from.
    # Treat its daily usage as zero instead of dividing by zero, so the result
    # depends only on the cycles already used.
    if battery_age_days > 0:
        cycles_per_day = current_cycles / battery_age_days
    else:
        cycles_per_day = 0

    projected_cycles = current_cycles + horizon_days * cycles_per_day

    return 1 if projected_cycles > rated_lifetime_cycles else 0

In [None]:
# Calculate the predictions: call predict_maintenance once per row (axis=1)
# of the pandas DataFrame and collect the returned values.

predictions = pd_df.apply(predict_maintenance, axis=1)

Now, run the following cells to examine the predictions by `VIN`.

In [None]:
import pandas as pd
# Pair each row's vin with its prediction in a two-column pandas DataFrame
batch_predictions_pdf = pd.DataFrame({"vin": pd_df["vin"], "serviceRequired":predictions})

In [None]:
# Last expression of the cell: display the predictions DataFrame
batch_predictions_pdf

## Write the predictions back to Cosmos DB
Now you will save the previously created predictions DataFrame back to the `maintenance` collection in Cosmos DB.

Run the following cells to do so.

In [None]:
# Retrieve the Cosmos DB endpoint and account key from the Synapse linked service
import sys
import re

from pyspark.sql import SparkSession
# NOTE: despite its name, sc holds a SparkSession here; the name is kept unchanged
# in case other cells refer to it.
sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

connection_string = token_library.getConnectionString(synapse_cosmos_db_linked_service)
# Expected shape: AccountEndpoint=<endpoint>;Database=<database>;AccountKey="<key>";
matchObj = re.match( r'AccountEndpoint=(.*);Database=(.*);AccountKey="(.*)";', connection_string or "", re.M|re.I)
if matchObj is None:
    # The connection string is deliberately left out of the message because it
    # contains the account key.
    raise ValueError(
        "The connection string returned for linked service '{}' is not in the expected "
        "AccountEndpoint=...;Database=...;AccountKey=\"...\"; format.".format(
            synapse_cosmos_db_linked_service))
endpoint = matchObj.group(1)
# Group 2 (the database name) is not needed; cosmos_db_database is used instead
masterkey = matchObj.group(3)

In [None]:
# Convert the pandas predictions DataFrame into a Spark DataFrame so it can be written with Spark.
# The Spark dataframe will be created even though this may throw an error about attempted Arrow optimization
# As of 03-Aug-2020 open issue at Apache: https://issues.apache.org/jira/browse/SPARK-30966

batch_predictions = spark.createDataFrame(batch_predictions_pdf)

In [None]:
# Print the first rows of the Spark predictions DataFrame for a visual check
batch_predictions.show()

In [None]:
# Options handed to the Cosmos DB Spark connector for writing to the
# maintenance container.
write_config_maintenance = {
    "Endpoint": endpoint,
    "Masterkey": masterkey,
    "Database": cosmos_db_database,
    "Collection": cosmos_db_container_maintenance,
    "Upsert": "true",
}

# Save the predictions through the connector using the options above
(
    batch_predictions.write
    .mode("overwrite")
    .format("com.microsoft.azure.cosmosdb.spark")
    .options(**write_config_maintenance)
    .save()
)

In [None]:
# Options handed to the Cosmos DB Spark connector for reading the
# maintenance container back.
read_config_maintenance = {
    "Endpoint": endpoint,
    "Masterkey": masterkey,
    "Database": cosmos_db_database,
    "Collection": cosmos_db_container_maintenance,
}

# Load the container through the connector using the options above
maint = (
    spark.read
    .format("com.microsoft.azure.cosmosdb.spark")
    .options(**read_config_maintenance)
    .load()
)

# Expose the loaded data to Spark SQL as the temporary view "maintenance"
maint.createOrReplaceTempView("maintenance")

In [None]:
# Print the first rows read back from the maintenance container
maint.show()