In [1]:
import os
# Folder that collects the scripts uploaded with the job; make sure it exists
script_folder = 'src'
os.makedirs(script_folder, exist_ok=True)
print(f"{script_folder} folder created")


src folder created


In [5]:
%%writefile $script_folder/data_preprocess.py
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import numpy as np
import argparse
import os

def parse_args():
    """Parse the command-line options of the preprocessing script.

    Both options are required. Leaving one out would otherwise hand ``None``
    to the pandas read/write calls in ``preprocess_data``; for ``--output``
    that means the processed data is silently never written to disk.

    Returns:
        argparse.Namespace: namespace with ``data`` (path to the input
        dataset) and ``output`` (path to save the processed dataset).

    Raises:
        SystemExit: raised by argparse (exit code 2) when an option is
        missing or malformed.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--data", type=str, required=True, help="Path to the input dataset"
    )
    parser.add_argument(
        "--output", type=str, required=True, help="Path to save the processed dataset"
    )

    return parser.parse_args()

def preprocess_data(input_path, output_path):
    """Clean, encode and scale the dataset, then save it as CSV.

    Steps, in order:
      1. Fill missing numerical values with the column median.
      2. One-hot encode the categorical columns (first level dropped).
      3. Standardize the numerical columns with ``StandardScaler``.
      4. Clip the standardized numerical columns to the 1.5 * IQR fences.
      5. Cast boolean columns to 0/1 integers.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write. A missing parent
            directory is created before writing.
    """
    # Load the dataset
    df = pd.read_csv(input_path)

    cols_num = ['age', 'HbA1c_level', 'bmi', 'blood_glucose_level']
    cols_cat = ['gender', 'smoking_history']

    # Replace numerical null values with the column median
    df[cols_num] = df[cols_num].fillna(df[cols_num].median())

    # One-hot encode every categorical column ('gender' and 'smoking_history');
    # drop_first removes one dummy column per encoded feature
    df = pd.get_dummies(df, columns=cols_cat, drop_first=True)

    # Standardization of numerical features
    scaler = StandardScaler()
    df[cols_num] = scaler.fit_transform(df[cols_num])

    # Handle outliers by clipping to the fences Q1 - 1.5*IQR and Q3 + 1.5*IQR.
    # NOTE(review): clipping runs after scaling, so any column that actually
    # gets clipped is no longer exactly standardized -- confirm this order is
    # intended before relying on it.
    Q1 = df[cols_num].quantile(0.25)
    Q3 = df[cols_num].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    for column in cols_num:
        df[column] = df[column].clip(
            lower=lower_bound[column], upper=upper_bound[column]
        )

    # Convert boolean columns to 1 and 0
    bool_columns = df.select_dtypes(include=['bool']).columns
    df[bool_columns] = df[bool_columns].astype(int)

    # Make sure the destination directory exists so the write cannot fail on
    # a missing folder; non-path targets are passed through untouched.
    if isinstance(output_path, (str, os.PathLike)):
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    # Save the processed dataset
    df.to_csv(output_path, index=False)

if __name__ == "__main__":
    # Script entry point: read the CLI options, then run the preprocessing
    cli_args = parse_args()
    preprocess_data(input_path=cli_args.data, output_path=cli_args.output)


Overwriting src/data_preprocess.py


In [6]:
from azure.ai.ml import MLClient, command, Input, Output
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.identity import DefaultAzureCredential

# Build the workspace client from the local config file and default credentials
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Look up the registered dataset asset that feeds the preprocessing script
data_asset = ml_client.data.get("Diabetes_Dataset", version="1")

# Job input: the dataset file, using the read-only mount mode
raw_data_input = Input(
    path=data_asset.id,
    type=AssetTypes.URI_FILE,
    mode=InputOutputModes.RO_MOUNT,
)

# Job output: a single file, using the read-write mount mode
processed_data_output = Output(
    type=AssetTypes.URI_FILE,
    mode=InputOutputModes.RW_MOUNT,
)

# Command job that runs data_preprocess.py from the uploaded ./src folder
job = command(
    code="./src",
    command='python data_preprocess.py --data "${{inputs.data}}" --output "${{outputs.processed_data}}"',
    inputs={"data": raw_data_input},
    outputs={"processed_data": processed_data_output},
    environment="test-env-azureml:1",  # must provide every dependency the script imports
    compute="test-compute-1-mlstudio",
)

# Submit the job to the workspace
returned_job = ml_client.jobs.create_or_update(job)

Found the config file in: /config.json
[32mUploading src (0.0 MBs): 100%|██████████| 4896/4896 [00:00<00:00, 42826.78it/s]
[39m



In [7]:
# Studio link for following the submitted job
aml_url = returned_job.studio_url
print(f"Monitor your job at {aml_url}")

Monitor your job at https://ml.azure.com/runs/gentle_island_3055db9n2m?wsid=/subscriptions/3b7a65ed-df6d-4020-9010-5585f2149752/resourcegroups/rg-test-1/workspaces/mlstudio-test-1&tid=dc0b52a3-68c5-44f7-881d-9383d8850b96
