### DengAI Azure Models and Training
#### Environment and Data Prep for this Model

In [1]:
#Load the workspace from the config file
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the local config file
# and echo its name as a quick sanity check.
ws = Workspace.from_config(
    path='.azureml/ws_config.json',
)
print(ws.name, "loaded")

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Azure-ML-WS loaded


In [105]:
#set up variable to contain input data folder
inputdata_folder='inputdata'

#make sure the folder exists so the %%writefile cells that target it do not
#fail on a fresh checkout
import os
os.makedirs(inputdata_folder,exist_ok=True)

In [106]:
%%writefile $inputdata_folder/create_rfr_datasets.py

import pandas as pd
from sklearn.preprocessing import RobustScaler
from azureml.core import Workspace,Datastore,Dataset,Run
import argparse
import os

#Resolve the run this script executes under, then its workspace and default datastore
run = Run.get_context()
ws = run.experiment.workspace
default_ds = ws.get_default_datastore()

#Read the PipelineData output location handed in through --folder
parser = argparse.ArgumentParser()
parser.add_argument('--folder', dest='folder', type=str)
args = parser.parse_args()
output_folder = args.folder

#Define the fields used for each city
sj_features=[
    'year',
    'yearcount',
    'weekofyear',
    'station_max_temp_c',
    'station_min_temp_c',
    'cum_rain_prior_24_wks',
    'avg_max_temp_prior_22_wks',
    'total_cases'
]

#Number of lagged copies (suffix _shift_0 .. _shift_<n-1>) to keep per sj feature
sj_lags={
    'year':0,
    'yearcount':0,
    'weekofyear':0,
    'station_max_temp_c':0,
    'station_min_temp_c':0,
    'cum_rain_prior_24_wks':46,
    'avg_max_temp_prior_22_wks':0,
    'total_cases':0
}

iq_features=[
    'year',
    'yearcount',
    'weekofyear',
    'reanalysis_min_air_temp_k',
    'station_max_temp_c',
    'cum_rain_prior_22_wks',
    'total_cases'
]

#Number of lagged copies (suffix _shift_0 .. _shift_<n-1>) to keep per iq feature
iq_lags={
    'year':0,
    'yearcount':0,
    'weekofyear':0,
    'reanalysis_min_air_temp_k':0,
    'station_max_temp_c':0,
    'cum_rain_prior_22_wks':43,
    'total_cases':0
}

#Define a function to retrieve the features to be used in the model for each specific city
def get_feature_list(city,lag_names=True):
    """Return the list of column names used by the model for one city.

    Parameters
    ----------
    city : str
        'sj' or 'iq'.
    lag_names : bool, default True
        When true, return the city's base features followed by one
        '<feature>_shift_<i>' name for every i in range(lags[feature]).
        When false, return only the keys of the city's lag dictionary.

    Returns
    -------
    list of str
        A new list on every call; the module-level feature lists are never
        modified.

    Raises
    ------
    ValueError
        If city is neither 'sj' nor 'iq'.
    """
    if city=='sj':
        features,lags=sj_features,sj_lags
    elif city=='iq':
        features,lags=iq_features,iq_lags
    else:
        raise ValueError("city must be 'sj' or 'iq', got "+repr(city))

    if not lag_names:
        return [str(key) for key in lags]

    #Copy first: appending to the module-level list directly would make it
    #grow on every call and duplicate the lag column names
    feature_list=list(features)
    for key, value in lags.items():
        feature_list.extend(str(key)+'_shift_'+str(i) for i in range(value))

    return feature_list

#Build time-lagged copies of the leading columns of a frame
def create_lag_features(df,lag,end_col=0):
    """Append shifted copies of the first end_col columns to df.

    For every step in range(lag) the first end_col columns are shifted down by
    step rows and joined back on the index; the joined columns that clash with
    existing names receive the suffix '_shift_<step>'. The first lag rows are
    then removed and the index is renumbered from 0.

    Returns the widened, trimmed frame.
    """
    for step in range(lag):
        shifted=df.iloc[:,:end_col].shift(periods=step)
        df=df.join(shifted,rsuffix='_shift_'+str(step))

    trimmed=df.iloc[lag:,:]
    return trimmed.reset_index(drop=True)

#create sets for each city
def prep_for_model(city,lookback):
    """Build the scaled training and holdout feature matrices for one city.

    The registered train and holdout datasets for the city are stacked, reduced
    to the city's features, expanded with time-lagged columns, scaled with a
    RobustScaler fitted on the stacked data, and split back apart.

    Parameters
    ----------
    city : str
        'sj' or 'iq'; selects the 'dengue-train-all-<city>-ds' and
        'dengue-holdout-all-<city>-ds' workspace datasets.
    lookback : int
        Number of lag steps passed to create_lag_features. That function also
        removes the first lookback rows of the stacked frame.

    Returns
    -------
    (np_df, np_df_h, y)
        Scaled training features, scaled holdout features, and the training
        labels ('total_cases'). np_df and y have the same number of rows.

    Raises
    ------
    ValueError
        If city is neither 'sj' nor 'iq'.
    """
    if city not in ('sj','iq'):
        raise ValueError("city must be 'sj' or 'iq', got "+repr(city))

    #get train and holdout for the city from the registered datasets
    train_ds = ws.datasets.get('dengue-train-all-'+city+'-ds')
    holdout_ds = ws.datasets.get('dengue-holdout-all-'+city+'-ds')
    df=train_ds.to_pandas_dataframe()
    df_h=holdout_ds.to_pandas_dataframe()
    #the holdout has no labels; fill a placeholder so both frames share columns
    df_h['total_cases']=0

    #create single dataset (DataFrame.append was removed in pandas 2.0)
    df_all=pd.concat([df,df_h],ignore_index=True)

    #Reduce the df to the base features for this city
    base_features=list(get_feature_list(city,lag_names=False))
    df_all_lag=df_all[base_features].copy()

    #Create lagged data
    df_all_lag=create_lag_features(df_all_lag,lag=lookback,end_col=df_all_lag.shape[1])

    #Keep just the features needed for training plus their lagged versions
    model_features=list(get_feature_list(city,lag_names=True))
    df_all_lag=df_all_lag[model_features].copy()

    #create_lag_features removed the first `lookback` rows, all of which come
    #from the training part, so the train/holdout boundary moved up by that
    #amount. Splitting at df.shape[0] would pull placeholder-labelled holdout
    #rows into the training set and shorten the holdout.
    n_train=max(df.shape[0]-lookback,0)

    #Break out the label data so it does not get scaled; holdout labels are dropped
    y=df_all_lag['total_cases'].iloc[:n_train]
    features=df_all_lag.drop(columns=['total_cases'])

    #scale features using desired scaler
    scaler=RobustScaler()
    scaled=scaler.fit_transform(features)

    #break out the holdout rows from the training rows
    np_df=scaled[:n_train,:]
    np_df_h=scaled[n_train:,:]

    return np_df, np_df_h, y

#Create the datasets for each city and save to intermediate data files for model use
LOOKBACK_WEEKS=50

#All files go to the PipelineData location received through --folder
os.makedirs(output_folder, exist_ok=True)

for city in ('sj','iq'):
    np_train,np_holdout,y_train=prep_for_model(city=city,lookback=LOOKBACK_WEEKS)

    #scaled training features
    train_output_path = os.path.join(output_folder, 'train_'+city+'_scaled.csv')
    pd.DataFrame(np_train).to_csv(train_output_path,index=False)

    #scaled holdout features
    holdout_output_path = os.path.join(output_folder, 'holdout_'+city+'_scaled.csv')
    pd.DataFrame(np_holdout).to_csv(holdout_output_path,index=False)

    #training labels
    y_output_path = os.path.join(output_folder, 'y_'+city+'.csv')
    pd.DataFrame(y_train).to_csv(y_output_path,index=False)

#Mark the run as completed (the call parentheses were missing, so this never ran)
run.complete()

Writing inputdata/create_rfr_datasets.py


#### Create and Register the Model
##### Model for SJ

In [152]:
%%writefile $inputdata_folder/rfr_train_sj.py
#Import libraries
import os
import argparse
import warnings

import joblib
import pandas as pd
import numpy as np
from azureml.core import Run
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')

#Set run context
run=Run.get_context()

# Get PipelineData arguments: --folder holds the prepared csv files,
# --model_folder receives the trained model
parser = argparse.ArgumentParser()
parser.add_argument('--folder', type=str, dest='folder')
parser.add_argument('--model_folder',type=str,dest='model_folder')
args = parser.parse_args()
data_folder = args.folder
model_folder= args.model_folder

#load the scaled training features and the labels (the holdout file is not needed for training)
df_sj=pd.read_csv(os.path.join(data_folder,'train_sj_scaled.csv'))
df_sj_y=pd.read_csv(os.path.join(data_folder,'y_sj.csv'))

#the label file has a single column; flatten it to the 1-d target sklearn expects
y_sj=df_sj_y.values.ravel()

#split the training set into train and test
x_train, x_test, y_train, y_test = train_test_split(df_sj, y_sj, test_size=0.30, random_state=0)

#create the model; seeded so repeated runs give the same forest
rfr=RandomForestRegressor(n_estimators=300,max_depth=10,random_state=0)
rfr.fit(x_train,y_train)

#score the model (np.float was removed from NumPy, use the builtin float)
score=rfr.score(x_test,y_test)
print('SJ score: ',score)
run.log('SJ score: ',float(score))

#calculate MAE (arguments in the documented y_true, y_pred order)
y_hat=rfr.predict(x_test)
mae=mean_absolute_error(y_test,y_hat)
print('SJ MAE: ',mae)
run.log('SJ MAE: ',float(mae))

# Save the trained model
os.makedirs(model_folder, exist_ok=True)
output_path = os.path.join(model_folder, "sj_rfr_model.pkl")
joblib.dump(value=rfr, filename=output_path)

run.complete()


Overwriting inputdata/rfr_train_sj.py


In [153]:
%%writefile $inputdata_folder/register_rfr_sj.py
# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Get parameters: --model_folder is the location the training step wrote to
parser = argparse.ArgumentParser()
parser.add_argument('--model_folder', type=str, dest='model_folder')
args = parser.parse_args()
model_folder = args.model_folder
print('Model folder',str(model_folder))

# Get the experiment run context
run = Run.get_context()

# load the model (fails early if the training step did not produce a usable file)
# (a stray quote in this print made the whole script a syntax error)
print('Loading model from ' + model_folder)
model_file = model_folder + '/sj_rfr_model.pkl'
model = joblib.load(model_file)

# Register the model file in the workspace the run belongs to
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'sj_rfr_model',
               tags={'Training context':'Pipeline'})

run.complete()

Overwriting inputdata/register_rfr_sj.py


##### Model for IQ

In [156]:
%%writefile $inputdata_folder/rfr_train_iq.py
#Import libraries
import os
import argparse
import warnings

import joblib
import pandas as pd
import numpy as np
from azureml.core import Run
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')

#Set run context
run=Run.get_context()

# Get PipelineData arguments: --folder holds the prepared csv files,
# --model_folder receives the trained model
parser = argparse.ArgumentParser()
parser.add_argument('--folder', type=str, dest='folder')
parser.add_argument('--model_folder',type=str,dest='model_folder')
args = parser.parse_args()
data_folder = args.folder
model_folder= args.model_folder

#load the scaled training features and the labels (the holdout file is not needed for training)
df_iq=pd.read_csv(os.path.join(data_folder,'train_iq_scaled.csv'))
df_iq_y=pd.read_csv(os.path.join(data_folder,'y_iq.csv'))

#the label file has a single column; flatten it to the 1-d target sklearn expects
y_iq=df_iq_y.values.ravel()

#split the training set into train and test
x_train, x_test, y_train, y_test = train_test_split(df_iq, y_iq, test_size=0.30, random_state=0)

#create the model; seeded so repeated runs give the same forest
rfr=RandomForestRegressor(n_estimators=300,max_depth=10,random_state=0)
rfr.fit(x_train,y_train)

#score the model (np.float was removed from NumPy, use the builtin float)
score=rfr.score(x_test,y_test)
print('IQ score: ',score)
run.log('IQ score: ',float(score))

#calculate MAE (arguments in the documented y_true, y_pred order)
y_hat=rfr.predict(x_test)
mae=mean_absolute_error(y_test,y_hat)
print('IQ MAE: ',mae)
run.log('IQ MAE: ',float(mae))

# Save the trained model
os.makedirs(model_folder, exist_ok=True)
output_path = os.path.join(model_folder, "iq_rfr_model.pkl")
joblib.dump(value=rfr, filename=output_path)

run.complete()


Writing inputdata/rfr_train_iq.py


In [162]:
%%writefile $inputdata_folder/register_rfr_iq.py
# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Read the folder the training step wrote the model into
parser = argparse.ArgumentParser()
parser.add_argument('--model_folder', dest='model_folder', type=str)
args = parser.parse_args()
model_folder = args.model_folder
print('Model folder',str(model_folder))

# Context of the run this script executes under
run = Run.get_context()

# Load the pickled model from the training output
print('Loading model from ' + model_folder)
model_file = model_folder + '/iq_rfr_model.pkl'
model = joblib.load(model_file)

# Register the model file in the workspace that owns this run
Model.register(
    workspace=run.experiment.workspace,
    model_name='iq_rfr_model',
    model_path=model_file,
    tags={'Training context':'Pipeline'},
)

run.complete()

Overwriting inputdata/register_rfr_iq.py


#### Create Compute Environment for Model Pipeline
##### Compute Cluster

In [108]:
#Create a compute cluster if it does not exist
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Workspace

ws = Workspace.from_config(path='.azureml/ws_config.json')

cluster_name = "DS-Comp-Cluster"

try:
    #Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    #If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        #Report the failure and stop here: later cells need pipeline_cluster, and
        #continuing would only surface as a confusing NameError further down
        print(ex)
        raise

Found existing cluster, use it.


##### Python Environment on the Cluster

In [109]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# Package dependencies for the pipeline steps
conda_pkgs = ['scikit-learn','pandas']
pip_pkgs = ['azureml-defaults','azureml-dataprep[pandas]','keras','tensorflow']
dengue_packages = CondaDependencies.create(conda_packages=conda_pkgs,
                                             pip_packages=pip_pkgs)

# Python environment for the experiment: Azure ML manages the dependencies
# and the steps run inside a docker container
dengue_env = Environment("dengue-pipeline-env")
dengue_env.python.user_managed_dependencies = False
dengue_env.docker.enabled = True
dengue_env.python.conda_dependencies = dengue_packages

# Register the environment so it can be reused, then fetch the registered copy
dengue_env.register(workspace=ws)
registered_env = Environment.get(ws, 'dengue-pipeline-env')

# Run configuration for the pipeline: the cluster as target,
# the registered environment as runtime
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


#### Create Model Pipelines
##### Pipeline for SJ

In [154]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep,EstimatorStep
from azureml.train.estimator import Estimator

#Load the workspace from the local config file, like the other cells do, instead
#of hard-coding the subscription id and resource group in the notebook
ws=Workspace.from_config(path='.azureml/ws_config.json')

#Create the PipelineData objects that carry data between the steps
data_store=ws.get_default_datastore()
dengueAI_datasets=PipelineData('deng_datasets',datastore=data_store)
model_folder=PipelineData('model_folder',datastore=data_store)

#NOTE: the IQ pipeline cell reuses the names create_rfr_datasets, estimator,
#dengueAI_datasets and model_folder. Run the "Run SJ Pipeline" cell directly
#after this one, before the IQ cell overwrites them.

#create estimator to run the model
estimator = Estimator(source_directory=inputdata_folder,
                        compute_target = pipeline_cluster,
                        environment_definition=pipeline_run_config.environment,
                        entry_script='rfr_train_sj.py')

#Step 1, prepare data for the sj model by creating time-lagged features and scaling data
create_rfr_datasets = PythonScriptStep(name = 'Create SJ Datasets for RFR Model',
                                       source_directory = inputdata_folder,
                                       script_name = 'create_rfr_datasets.py',
                                       arguments = ['--folder', dengueAI_datasets],
                                       inputs=[],
                                       outputs=[dengueAI_datasets],
                                       compute_target = pipeline_cluster,
                                       runconfig = pipeline_run_config,
                                       allow_reuse = True)

#Step 2, create and train random forest regressor for sj
rfr_train_sj = EstimatorStep(name = 'Create sj random forest regressor model',
                             estimator=estimator,
                             estimator_entry_script_arguments = ['--folder',dengueAI_datasets,'--model_folder',model_folder],
                             inputs=[dengueAI_datasets],
                             outputs=[model_folder],
                             compute_target = pipeline_cluster,
                             allow_reuse = True)

#Step 3, register the model
register_rfr_sj = PythonScriptStep(name = 'Register RFR Model for SJ',
                                source_directory = inputdata_folder,
                                script_name = 'register_rfr_sj.py',
                                arguments = ['--model_folder', model_folder],
                                inputs=[model_folder],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


##### Run SJ Pipeline

In [159]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

#Assemble the three SJ steps into a pipeline
pipeline_steps=[create_rfr_datasets,rfr_train_sj,register_rfr_sj]
pipeline = Pipeline(steps=pipeline_steps,workspace=ws)
print("Pipeline is built.")

#Submit the pipeline under its own experiment, forcing every step to regenerate
#its outputs, and block until the run ends
experiment=Experiment(name='dengue-sj-randomforest-pipeline',workspace=ws)
pipeline_run=experiment.submit(pipeline,regenerate_outputs=True)
print("Pipeline submitted for execution.")
pipeline_run.wait_for_completion(show_output=False)

Pipeline is built.
Created step Create IQ Datasets for RFR Model [44367085][9e94323e-a224-41ac-8838-ac898b9d60dc], (This step will run and generate new outputs)
Created step Create sj random forest regressor model [c69a2562][ab0a3a1e-4397-481b-935d-f52ae914d9c8], (This step will run and generate new outputs)
Created step Create SJ Datasets for RFR Model [7c114c18][27169125-fbed-4eda-8530-ad9d0c877968], (This step will run and generate new outputs)
Created step Register RFR Model for SJ [665d79a3][5095e1d0-e5a2-488c-8756-7f4740e71a77], (This step will run and generate new outputs)
Submitted PipelineRun 82151df3-f78f-49cf-9c06-ec181271bd44
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/dengue-sj-randomforest-pipeline/runs/82151df3-f78f-49cf-9c06-ec181271bd44?wsid=/subscriptions/fd2d8de8-17e1-4976-9906-fdde487edd5f/resourcegroups/AzureML-Learning/workspaces/Azure-ML-WS
Pipeline submitted for execution.
PipelineRunId: 82151df3-f78f-49cf-9c06-ec181271bd44
Link to Az

'Finished'

##### Create Pipeline for IQ

In [160]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep,EstimatorStep
from azureml.train.estimator import Estimator

#Workspace and default datastore backing the intermediate pipeline data
ws=Workspace.from_config(path='.azureml/ws_config.json')
data_store=ws.get_default_datastore()

#PipelineData objects that carry data between the steps
dengueAI_datasets=PipelineData('deng_datasets',datastore=data_store)
model_folder=PipelineData('model_folder',datastore=data_store)

#Estimator that runs the iq training script
estimator = Estimator(
    source_directory=inputdata_folder,
    entry_script='rfr_train_iq.py',
    compute_target=pipeline_cluster,
    environment_definition=pipeline_run_config.environment,
)

#Step 1, prepare the datasets by creating time-lagged features and scaling data
create_rfr_datasets = PythonScriptStep(
    name='Create IQ Datasets for RFR Model',
    source_directory=inputdata_folder,
    script_name='create_rfr_datasets.py',
    arguments=['--folder', dengueAI_datasets],
    inputs=[],
    outputs=[dengueAI_datasets],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

#Step 2, create and train random forest regressor for iq
rfr_train_iq = EstimatorStep(
    name='Create iq random forest regressor model',
    estimator=estimator,
    estimator_entry_script_arguments=['--folder',dengueAI_datasets,'--model_folder',model_folder],
    inputs=[dengueAI_datasets],
    outputs=[model_folder],
    compute_target=pipeline_cluster,
    allow_reuse=True,
)

#Step 3, register the model
register_rfr_iq = PythonScriptStep(
    name='Register RFR Model for IQ',
    source_directory=inputdata_folder,
    script_name='register_rfr_iq.py',
    arguments=['--model_folder', model_folder],
    inputs=[model_folder],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=True,
)

print("Pipeline steps defined")

Pipeline steps defined


##### Run IQ Pipeline

In [161]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

#Assemble the three IQ steps into a pipeline
pipeline_steps=[create_rfr_datasets,rfr_train_iq,register_rfr_iq]
pipeline = Pipeline(steps=pipeline_steps,workspace=ws)
print("Pipeline is built.")

#Submit the pipeline under its own experiment, forcing every step to regenerate
#its outputs, and block until the run ends
experiment=Experiment(name='dengue-iq-randomforest-pipeline',workspace=ws)
pipeline_run=experiment.submit(pipeline,regenerate_outputs=True)
print("Pipeline submitted for execution.")
pipeline_run.wait_for_completion(show_output=False)

Pipeline is built.
Created step Create IQ Datasets for RFR Model [1c6abb00][068a97dd-396f-44de-8fd1-b0b84af1d930], (This step will run and generate new outputs)
Created step Create iq random forest regressor model [bd0cd3dd][42ec92e7-4a6d-4e91-b8a1-426d45964e64], (This step will run and generate new outputs)
Created step Register RFR Model for IQ [e89548c4][9733f18f-4faa-4767-a4af-9b56de17e6ea], (This step will run and generate new outputs)
Submitted PipelineRun 13625d0c-ddb2-4030-a7f9-bac9d2aa1ac5
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/dengue-iq-randomforest-pipeline/runs/13625d0c-ddb2-4030-a7f9-bac9d2aa1ac5?wsid=/subscriptions/fd2d8de8-17e1-4976-9906-fdde487edd5f/resourcegroups/AzureML-Learning/workspaces/Azure-ML-WS
Pipeline submitted for execution.
PipelineRunId: 13625d0c-ddb2-4030-a7f9-bac9d2aa1ac5
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/dengue-iq-randomforest-pipeline/runs/13625d0c-ddb2-4030-a7f9-bac9d2aa1ac5?wsid=/

'Failed'