In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [5]:
# Announce the load, then read the diabetes CSV from the local src folder
# into a DataFrame that the following cells use.
csv_path = './src/diabetes.csv'
print("Loading Data ....")
diabetes = pd.read_csv(csv_path)

Loading Data ....


In [6]:
# Columns fed to the model as features; 'Diabetic' is the label column.
feature_columns = [
    'Pregnancies',
    'PlasmaGlucose',
    'DiastolicBloodPressure',
    'TricepsThickness',
    'SerumInsulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
]
X = diabetes[feature_columns].values
y = diabetes['Diabetic'].values

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# Regularization rate; scikit-learn's C is its inverse, hence C=1/reg below.
reg = 0.01

# Fit a logistic regression classifier on the training split.
print('Training a logistic regression model with regularization rate of', reg)
model = LogisticRegression(C=1/reg, solver="liblinear")
model.fit(X_train, y_train)

# Accuracy: fraction of test rows whose predicted label equals the true label.
y_hat = model.predict(X_test)
acc = np.mean(y_hat == y_test)
print('Accuracy:', acc)

# AUC: scored from the second predict_proba column
# (presumably the positive class — confirm against model.classes_).
y_scores = model.predict_proba(X_test)
positive_scores = y_scores[:, 1]
auc = roc_auc_score(y_test, positive_scores)
print('AUC: ' + str(auc))

Training a logistic regression model with regularization rate of 0.01
Accuracy: 0.774
AUC: 0.8484437036668493


In [7]:
from azure.ai.ml import command

# Settings for a command job: the ./src folder is passed as the job's code,
# the command line runs diabetes-training.py, and the job targets the named
# environment and compute.
job_settings = dict(
    code="./src",
    command="python diabetes-training.py",
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="diabetes-pythonv2-train",
    experiment_name="diabetes-training",
)
job = command(**job_settings)

# Submit the job and print the URL where it can be monitored.
# NOTE(review): `ml_client` is not created in any cell visible here —
# presumably an authenticated MLClient from an earlier cell; confirm it
# exists on a fresh kernel.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

ModuleNotFoundError: No module named 'azure.ai'

In [None]:
import mltable
# Look up version 1 of the registered 'diabetes-table' data asset.
# NOTE(review): `ml_client` is not created in any cell visible here — confirm.
registered_data_asset = ml_client.data.get(name='diabetes-table', version=1)

# Load the table through the asset's id and convert it to a pandas DataFrame.
asset_uri = f"azureml:/{registered_data_asset.id}"
tbl = mltable.load(asset_uri)
df = tbl.to_pandas_dataframe()

# Preview the first five rows.
df.head(5)

In [None]:
import os
# Folder that holds the job's script files; the %%writefile line further down
# reuses this name through $script_folder.
script_folder = 'src'

# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(name=script_folder, exist_ok=True)
print(script_folder, 'folder created')
# NOTE(review): %%writefile is a cell magic and must be the first line of its
# own cell — confirm the line that follows starts a new cell in the notebook.
%%writefile $script_folder/move-data.py
# import libraries
import argparse
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
def main(args):
    """Read the input CSV and write it as ``diabetes.csv`` in the output folder.

    Args:
        args: Parsed arguments. ``args.input_data`` is handed to ``get_data``;
            ``args.output_datastore`` is the folder the CSV is written under.

    Returns:
        None.
    """
    # read data
    df = get_data(args.input_data)

    # DataFrame.to_csv returns None when given a path, so its result is not kept.
    df.to_csv(Path(args.output_datastore) / "diabetes.csv", index=False)
# function that reads the data
def get_data(path):
    """Load a CSV file into a DataFrame and print its row count.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame read from ``path``.
    """
    df = pd.read_csv(path)

    # Count the rows and print the result
    row_count = len(df)
    print('Analyzing {} rows of data'.format(row_count))

    return df
def parse_args():
    """Parse the script's command-line arguments.

    Returns:
        argparse.Namespace with ``input_data`` and ``output_datastore``
        string attributes (each is None when its flag is omitted).
    """
    # setup arg parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument("--input_data", dest='input_data', type=str)
    parser.add_argument("--output_datastore", dest='output_datastore', type=str)

    # parse args
    args = parser.parse_args()

    # return args
    return args
# run script
if __name__ == "__main__":
    # add space in logs, then an opening separator line
    print("\n\n")
    print("*" * 60)

    # parse args
    args = parse_args()

    # run main function
    main(args)

    # closing separator line in logs
    print("*" * 60)

In [None]:
from azure.ai.ml import Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import command
# configure input and output
# Input: the "azureml:diabetes-local:1" asset reference, typed as a single file.
my_job_inputs = {
    "local_data": Input(
        type=AssetTypes.URI_FILE,
        path="azureml:diabetes-local:1",
    )
}
# Output: a folder at the given path on the "blob_training_data" datastore.
my_job_outputs = {
    "datastore_data": Output(
        type=AssetTypes.URI_FOLDER,
        path="azureml://datastores/blob_training_data/paths/datastore-path",
    )
}

# configure job
job = command(
    code="./src",
    # A quoted string cannot span physical lines; the two adjacent literals
    # below are joined by Python into a single command line. The ${{...}}
    # placeholders refer to the keys of my_job_inputs / my_job_outputs.
    command=(
        "python move-data.py --input_data ${{inputs.local_data}} "
        "--output_datastore ${{outputs.datastore_data}}"
    ),
    inputs=my_job_inputs,
    outputs=my_job_outputs,
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="aml-cluster",
    display_name="move-diabetes-data",
    experiment_name="move-diabetes-data",
)

# submit job
# NOTE(review): `ml_client` is not created in any cell visible here —
# presumably an authenticated MLClient from an earlier cell; confirm it
# exists on a fresh kernel.
returned_job = ml_client.create_or_update(job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)