In [1]:
import os

# Directory that will hold the training script written by the next cell
script_folder = 'src'

# exist_ok=True keeps this cell re-runnable when the folder is already there
os.makedirs(name=script_folder, exist_ok=True)
print(script_folder, 'folder created')

src folder created


In [2]:
%%writefile src/insurance-training.py
import argparse
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


def main(args):
    """Run the training pipeline end to end: load, clean, split, train.

    Args:
        args: Parsed command-line arguments. ``args.input_data`` is read here
            to locate the CSV file; the whole namespace is then passed on to
            ``data_model``.
    """
    # Load the raw CSV and clean it (duplicates, outliers, encoding)
    prepared = clean_data(get_data(args.input_data))

    # Hold out a test partition
    train_features, test_features, train_target, test_target = split_data(prepared)

    # Fit, evaluate and log the model
    data_model(train_features, test_features, train_target, test_target, args)


# Function that reads the data
def get_data(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    print("Reading data ...")
    return pd.read_csv(path)


def clean_data(data):
    """Return a de-duplicated, outlier-trimmed, fully encoded version of ``data``.

    Steps, in order:
      1. Print the number of exact duplicate rows and drop them.
      2. Drop every row whose ``charges`` value is 50000 or more.
      3. Integer-encode ``sex`` and ``smoker`` with a ``LabelEncoder`` each.
      4. One-hot encode ``region`` into float indicator columns.

    Args:
        data: DataFrame containing at least the ``charges``, ``sex``,
            ``smoker`` and ``region`` columns. It is not modified.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    # Report how many exact duplicate rows the input holds, then remove them
    n_duplicates = data.duplicated().sum()
    print(f'Number of duplicate rows: {n_duplicates}')
    deduped = data.drop_duplicates()

    # Rows charged 50k or more are dropped by index label
    outlier_index = deduped[deduped['charges'] >= 50000].index
    trimmed = deduped.drop(index=outlier_index)

    # Each categorical column gets its own freshly fitted label encoder
    labelled = trimmed.assign(
        sex=LabelEncoder().fit_transform(trimmed['sex']),
        smoker=LabelEncoder().fit_transform(trimmed['smoker']),
    )

    # Expand 'region' into one float indicator column per distinct value
    return pd.get_dummies(labelled, columns=['region'], dtype=float)


def split_data(cleaned_data, test_size=0.35, random_state=0):
    """Split the cleaned frame into min-max scaled train/test features and targets.

    The ``charges`` column is used as the target and every other column as a
    feature. The input DataFrame is left untouched, so the function can be
    called repeatedly on the same frame.

    Args:
        cleaned_data: DataFrame that contains a ``charges`` column; all
            remaining columns are converted to a feature array via ``.values``.
        test_size: Share of rows held out for testing, forwarded to
            ``train_test_split``. Defaults to 0.35.
        random_state: Seed forwarded to ``train_test_split``. Defaults to 0.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the feature arrays
        have been scaled with a ``MinMaxScaler`` fitted on the training rows.

    Raises:
        KeyError: If ``cleaned_data`` has no ``charges`` column.
    """
    y = cleaned_data['charges'].values

    # Non-mutating drop: the caller's frame keeps its 'charges' column
    X = cleaned_data.drop(columns='charges').values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only and reuse it for the test rows.
    # NOTE(review): the fitted scaler is not returned, so callers cannot apply
    # the same scaling to new data later.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test


def data_model(X_train, X_test, y_train, y_test, args):
    """Train a GradientBoostingRegressor, evaluate it and log the run to MLflow.

    Inside a single MLflow run this fits the model, logs it under the artifact
    path ``"model"``, computes MAE / MSE / RMSE / R2 on the test split, saves a
    predicted-vs-actual scatter plot to ``Regression-line.png`` in the current
    working directory, and logs the hyperparameters, metrics and plot.

    Args:
        X_train: Training feature array passed to ``fit``.
        X_test: Test feature array passed to ``predict``.
        y_train: Training target array.
        y_test: Test target array; must support ``.min()`` and ``.max()``.
        args: Namespace providing ``n_estimators``, ``learning_rate``,
            ``max_depth`` and ``random_state``.
    """
    # Start a new MLflow run
    with mlflow.start_run():
        # Hyperparameters: used both to build the model and for logging
        params = {
            "n_estimators": args.n_estimators,
            "learning_rate": args.learning_rate,
            "max_depth": args.max_depth,
            "random_state": args.random_state
        }

        # Initialize and train the GradientBoostingRegressor
        model = GradientBoostingRegressor(**params).fit(X_train, y_train)

        # Log the fitted model with MLflow
        mlflow.sklearn.log_model(model, "model")

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model; RMSE is derived from the MSE already computed
        mean_sq_error = mean_squared_error(y_test, y_pred)
        metrics = {
            "mean_absolute_error": mean_absolute_error(y_test, y_pred),
            "mean_squared_error": mean_sq_error,
            "root_mean_squared_error": np.sqrt(mean_sq_error),
            "R2": r2_score(y_test, y_pred)
        }

        # Plot predicted vs actual values; the dashed red diagonal marks a
        # perfect prediction
        lowest, highest = y_test.min(), y_test.max()
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.scatter(y_test, y_pred, color='blue', edgecolor='k', alpha=0.7, s=100)
        ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        ax.set_xlabel('Actual Values')
        ax.set_ylabel('Predicted Values')
        ax.set_title('Predicted vs Actual Values')
        fig.savefig('Regression-line.png')
        # Release the figure so it does not stay registered with pyplot
        plt.close(fig)

        # Log parameters, metrics and the plot
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.log_artifact("Regression-line.png")


def parse_args():
    """Parse the training script's command-line options.

    Options:
        --input_data     (str, required)  location of the input CSV
        --n_estimators   (int, default 100)
        --learning_rate  (float, default 0.1)
        --max_depth      (int, default 3)
        --random_state   (int, default 123)

    Returns:
        The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser()

    # The dataset location is the only mandatory option
    cli.add_argument("--input_data", dest='input_data', type=str, required=True)

    # Optional hyperparameters: (flag, destination, type, default)
    tunables = (
        ("--n_estimators", 'n_estimators', int, 100),
        ("--learning_rate", 'learning_rate', float, 0.1),
        ("--max_depth", 'max_depth', int, 3),
        ("--random_state", 'random_state', int, 123),
    )
    for flag, target, kind, fallback in tunables:
        cli.add_argument(flag, dest=target, type=kind, default=fallback)

    return cli.parse_args()


# Run script
if __name__ == "__main__":
    separator = "*" * 60

    # Opening banner so the run stands out in the logs
    print("\n\n")
    print(separator)

    # Read the command-line options, then run the pipeline with them
    args = parse_args()
    main(args)

    # Closing banner
    print(separator)
    print("\n\n")



Overwriting src/insurance-training.py


In [3]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Request a token up front to verify that the credential actually works.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # The broad catch is a deliberate best-effort fallback, but report why the
    # default credential failed instead of hiding it.
    print(
        f"DefaultAzureCredential failed ({type(ex).__name__}: {ex}); "
        "falling back to InteractiveBrowserCredential"
    )
    credential = InteractiveBrowserCredential()

In [4]:
# Get a handle to the workspace, authenticating with the credential obtained in
# the previous cell. from_config takes the workspace details from a config file
# (the recorded output of this cell shows /config.json being picked up).
ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [5]:
from azure.ai.ml import Input, Output
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import command

# Configure the job input: a single file referenced as azureml:insurance-price:1
training_data = Input(type=AssetTypes.URI_FILE, path="azureml:insurance-price:1")
my_job_inputs = {"local_data": training_data}


# Configure the command job: upload ./src and run the training script on the
# named compute, handing it the input above via ${{inputs.local_data}}
job = command(
    display_name="insurance_price",
    experiment_name="insurance_price_charge",
    code="./src",
    command="python insurance-training.py --input_data ${{inputs.local_data}}",
    inputs=my_job_inputs,
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu@latest",
    compute="captgt0071",
)

# Submit the job and print the Studio link for monitoring it
price_job = ml_client.create_or_update(job)
view_job = price_job.studio_url
print("Monitor your job at", view_job)


Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading src (0.01 MBs): 100%|██

Monitor your job at https://ml.azure.com/runs/busy_glass_7sfcpx5b0m?wsid=/subscriptions/cda9116f-5326-4a9b-9407-bc3a4391c27c/resourcegroups/data_udemy/workspaces/dala_project&tid=aef6e45c-850f-4f38-a10b-1df3ad33cdb0
