In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the workspace described by the saved config file, then report
# the SDK version in use and the name of the workspace that was loaded.
ws = Workspace.from_config()
banner = 'Ready to use Azure ML {} to work with {}'
print(banner.format(azureml.core.VERSION, ws.name))

  "class": algorithms.Blowfish,


Ready to use Azure ML 1.43.0 to work with akashbws


In [2]:
from azureml.core import Workspace, Dataset
# Look up the registered dataset by name and pull it into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace=ws, name='salary_classification')
df = dataset.to_pandas_dataframe()

## Creating Environment

In [13]:
from azureml.core import Experiment, ScriptRunConfig, Environment
# Build an Azure ML environment from the environment_droplet.yml conda specification.
env = Environment.from_conda_specification(
    name="adult_classification_env",
    file_path="D:/Adult Classification Project/Adult-Income-Classification/environment_droplet.yml",
)

In [14]:
## if using local
from azureml.core import Environment

# Environment whose Python dependencies are flagged as user-managed.
myenv = Environment(name="user-managed-env")
myenv.python.user_managed_dependencies = True

In [40]:
%%writefile dataprep_experiment.py
import azureml.core
from azureml.core import Workspace, Dataset, Run
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib_inline

# Load the workspace from the saved config file
ws = Workspace.from_config()
print(ws)

# Get the experiment run context
run = Run.get_context()

# Load the salary classification dataset registered in the workspace
dataset = Dataset.get_by_name(ws, name='salary_classification')
df = dataset.to_pandas_dataframe()


# Count the rows and log the result
row_count = len(df)
run.log('observations', row_count)
print('Analyzing {} rows of data'.format(row_count))

# Count and log the label counts
salary_counts = df['salary'].value_counts()
print(salary_counts)
for k, v in salary_counts.items():
    run.log('Label:' + str(k), v)

# Imbalanced dataset figure.
# Log before showing: some backends release the figure once it has been shown.
fig = plt.figure(figsize=(6,6))
sns.countplot(x = 'salary',data = df)
run.log_image(name='label distribution', plot=fig)
plt.show()


def class_percentage(counts, prefix):
    """Return the percentage (rounded to 2 decimals) of rows whose label starts with ``prefix``.

    ``counts`` is the value_counts() Series of the label column. Labels are
    stripped before matching, so padded values are handled as well.
    Returns 0.0 when there are no rows at all.
    """
    total = counts.sum()
    if total == 0:
        return 0.0
    matched = sum(count for label, count in counts.items()
                  if str(label).strip().startswith(prefix))
    return round(float(matched) / float(total) * 100, 2)


# Percentage of imbalance, computed from the data that was actually loaded
# instead of hard-coded row counts.
# NOTE(review): assumes the labels look like '<=50K' / '>50K' - confirm against the dataset.
pct_low_income = class_percentage(salary_counts, '<=')
pct_high_income = class_percentage(salary_counts, '>')
print(f"<= 50k : {pct_low_income}")
print(f"> 50k : {pct_high_income}")

run.log('% of <= 50k', pct_low_income)
run.log('% of > 50k', pct_high_income)


def log_summary_stats(frame, columns):
    """Log each describe() statistic of ``columns`` as one row of a per-column table metric."""
    for col, stats in frame[columns].describe().to_dict().items():
        for stat_name, stat_value in stats.items():
            run.log_row(col, stat=stat_name, value=stat_value)


# Log summary statistics for numeric columns
med_columns = ['age','fnlwgt', 'education-num','capital-gain', 'capital-loss', 'hours-per-week']
log_summary_stats(df, med_columns)

# Log summary statistics for cat columns
cat_columns = ['workclass','education','marital-status', 'occupation', 'relationship','race', 'sex','country','salary']
log_summary_stats(df, cat_columns)

# Complete the run
run.complete()

Overwriting dataprep_experiment.py


In [19]:
import azureml.core
from azureml.core import Experiment, ScriptRunConfig, Environment,Workspace
## from azureml.core.runconfig import DockerConfiguration
from azureml.widgets import RunDetails
import os, shutil


# Connect to the workspace described by the saved config file
ws = Workspace.from_config()

# Names/locations used for the experiment files; the current directory is the source folder
folder_name = 'research_env'
experiment_folder = './'

# Python environment for the experiment, built from a conda .yml specification
env = Environment.from_conda_specification(
    name="adult_income_environment",
    file_path="D:/Adult Classification Project/Adult-Income-Classification/environment.yml",
)

# Script configuration: which script to run, from which folder, in which environment
script_config = ScriptRunConfig(
    source_directory=experiment_folder,
    script='dataprep_experiment.py',
    environment=env,
)

# Submit the experiment, display the run widget and block until the run is done
experiment = Experiment(ws, 'adult_income_classification')
run = experiment.submit(script_config)
RunDetails(run).show()
run.wait_for_completion()

KeyboardInterrupt: 

In [29]:
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# from_config() automatically looks for a .azureml directory holding the config
ws = Workspace.from_config()

# Name of the CPU cluster to reuse or create
cpu_cluster_name = "classification-cpu-cluster"

try:
    # Reuse the cluster when one with this name already exists
    cpu_cluster = ComputeTarget(ws, cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # No such cluster yet: provision a new one
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        max_nodes=4,
        idle_seconds_before_scaledown=2400,
    )
    cpu_cluster = ComputeTarget.create(
        workspace=ws,
        name=cpu_cluster_name,
        provisioning_configuration=compute_config,
    )

# Block until the (existing or newly created) cluster reports completion
cpu_cluster.wait_for_completion(show_output=True)

SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Registering the environment

In [30]:
from azureml.core import Experiment, ScriptRunConfig, Environment
# Build the environment from the conda specification and register it in the workspace
experiment_env = Environment.from_conda_specification(
    name="adult_classification_env",
    file_path="D:/Adult Classification Project/Adult-Income-Classification/environment.yml",
)
experiment_env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20220616.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "adult_classification_env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "dependencies": [
                "python=3.6.2",
           

## Creating the pipeline

In [39]:
%%writefile prep_adult_income.py
# Import libraries
import os
import argparse
import pandas as pd
import seaborn as sns
import numpy as np
import azureml.core
from azureml.core import Workspace, Dataset, Run
from imblearn.combine import SMOTEENN
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib_inline


# Seed for the resampler so repeated runs produce the same prepped data
RANDOM_SEED = 101

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data
print('created folder name for storing prepped data as', save_folder)

# Load the workspace from the saved config file
ws = Workspace.from_config()

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
df = run.input_datasets['raw_data'].to_pandas_dataframe()

#wks = run.experiment.workspace
#raw_dataset = Dataset.get_by_id(wks, id=args.raw_dataset_id)
#df = raw_dataset.to_pandas_dataframe()

##dataset = Dataset.get_by_name(ws, name='salary_classification')
##df = dataset.to_pandas_dataframe()


# Log raw row count
row_count = len(df)
run.log('raw_rows', row_count)
print("Logging row count:",row_count)

# Log-transform age and fnlwgt only; capital-loss and capital-gain are left
# untouched because log(0) is not defined.
print('Applying Log Transformation to age and fnlwgt columns')
df["age"] = np.log(df["age"])
df["fnlwgt"] = np.log(df["fnlwgt"])


# Share of '?' placeholders per column, computed from the loaded data
# (must run before the placeholders are filled in below).
print('missing percentage values for following columns are:')
for shown_name, column in (('workclass', 'workclass'),
                           ('occupation', 'occupation'),
                           ('native-country', 'country')):
    missing_share = (df[column].astype(str).str.strip() == '?').mean()
    print(f"{shown_name} : {round(missing_share * 100, 2)}%")

## filling with modes
# regex=False: '?' is a literal placeholder, not a regular expression
print('filling missing values by mode on workclass, occupation and country columns')
df["workclass"] = df['workclass'].str.replace('?', 'Private', regex=False)
df['occupation'] = df['occupation'].str.replace('?', 'Prof-specialty', regex=False)
df['country'] = df['country'].str.replace('?', 'United-States', regex=False)

# reduced education unique categories
# The result is assigned back instead of using inplace=True on a column
# selection, which is not guaranteed to update the parent frame.
print('Reducing the unique counts from educationan and marital Status')
df["education"] = df["education"].replace(
    ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th','10th', '11th', '12th'], 'school', regex = True)
df["education"] = df["education"].replace(
    ['Assoc-voc', 'Assoc-acdm', 'Prof-school', 'Some-college'], 'higher', regex = True)

# reduced marital-status unique categories
df['marital-status'] = df['marital-status'].replace(
    ['Married-civ-spouse', 'Married-AF-spouse'], 'married', regex = True)
df['marital-status'] = df['marital-status'].replace(
    ['Divorced', 'Separated','Widowed', 'Married-spouse-absent'], 'other', regex = True)

# converting salary columns to binary
print('Converting salary feature to binary')
df["salary"] = df["salary"].replace({'<=50K' : 0 , ">50K" : 1 } , regex = True)

## checking the reduced unique count now
cat_columns = df.select_dtypes(include='object')
print('Checking the reduced unique count now')
for feature in cat_columns.columns:
    print(f" {feature}  :  {len(df[feature].unique())}")


# plotting heatmap
# Correlation is only defined for numeric columns, so they are selected
# explicitly instead of relying on corr() to drop the object columns.
print('plotting heat map')
heatmap_fig = plt.figure(figsize=(20,10), dpi = 150)
sns.heatmap(df.select_dtypes(include='number').corr(), annot = True, cmap = 'viridis')
run.log_image(name='correlation plot', plot=heatmap_fig)
plt.show()



## applied label encoding for entire dataset
# NOTE: fit_transform is applied to every column of the frame, numeric ones included.
print('applying label encoding for entire dataset')
from sklearn.preprocessing import  LabelEncoder
df = df.apply(LabelEncoder().fit_transform)


## splitting the data
print('Splitting the data in X and y')
X = df.drop(['salary'], axis =1)
y = df['salary']

# handling imbalanced dataset
# The instance gets its own name so the imported SMOTEENN class is not shadowed.
print('Handling imbalanced dataset')
resampler = SMOTEENN(n_jobs=-1, random_state=RANDOM_SEED)
print('Original dataset shape %s' % Counter(y))
X_res, y_res = resampler.fit_resample(X, y)
print('After undersample dataset shape %s' % Counter(y_res))
labeldf = pd.DataFrame(y_res,columns=['salary'])

##label count after undersampling
label_counts = Counter(y_res)
print('label count of <=50k ', label_counts[0])
print('label count of >50k ', label_counts[1])

# plotting balanced data graph
print('plotting balanced data graph')
balanced_data_fig = plt.figure(figsize=(6,6))
sns.countplot(x='salary', data=labeldf)
run.log_image(name='Balanced salary distribution', plot=balanced_data_fig)

#clubbing the data
print('clubbing the data to be used for training')
processed_df = pd.concat([X_res, y_res], axis=1)

# Log processed rows
processed_row_count = len(processed_df)
run.log('processed_rows', processed_row_count)
print('Transformed {} rows of data'.format(processed_row_count))

#percentage of balanceness
print('percentages of balanced data..')
print('---------------------------------------------------------------------------------------------------------------------')
pct_low_income = round(label_counts[0] /processed_row_count * 100 , 2)
pct_high_income = round(label_counts[1] /processed_row_count * 100 , 2)
print(f"<= 50k : {pct_low_income}")
print(f"> 50k : {pct_high_income}")

run.log('% of <= 50k', pct_low_income)
run.log('% of > 50k', pct_high_income)


# Save the prepped data
# The resampled frame is what gets written; saving `df` here would silently
# drop the balancing performed above.
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
processed_df.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Overwriting prep_adult_income.py


In [43]:
%%writefile train_adult_salary.py
# Import libraries
from azureml.core import Run, Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,f1_score,accuracy_score,precision_score,recall_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer

labels = ['<=50K', '>50K']
features = ['age', 'workclass', 'fnlwgt', 'education', 'education-num','marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'country']
# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data
print('training data', training_data)



# Get the experiment run context
run = Run.get_context()

# load the prepared data file in the training folder
print("Loading Data for Training...")
file_path = os.path.join(training_data,'data.csv')
df = pd.read_csv(file_path)

# Separate features and labels
print("Splitting data X and Y...")
X = df.drop(['salary'], axis =1)
y = df['salary']

# Split data into training set and test set
print("Splitting data into X_train and y_train...")
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 101)

# scaling the data (the scaler is fitted on the training split only)
print("Performing Standard Scalar...")
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Training a lightGBM model
print('Training a LightGBM Classifier model...')
clf = lgb.LGBMClassifier(boosting_type='goss',objective='binary',n_jobs=-1,n_estimators=200)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_scores = clf.predict_proba(X_test)

# ROC-AUC is computed from the positive-class probabilities; feeding the hard
# 0/1 predictions into roc_auc_score would score a single threshold only.
auc = roc_auc_score(y_test, y_scores[:,1])
print("The ROC-AUC Score obtained on is : " , auc)
print("The Macro F1-Score obtained on is : " , f1_score(y_test,  y_pred,average = 'macro'))
print("The F1 scores of each class on are : ",f1_score(y_test,  y_pred,average = None))

# calculate accuracy
# float() replaces the np.float alias, which newer NumPy releases no longer provide
acc = np.average(y_pred == y_test)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# log AUC
print('AUC: ' + str(auc))
run.log('AUC', float(auc))


# calculate precision
precisionscore = precision_score(y_test, y_pred)
print('Precision: ' + str(precisionscore))
run.log('Precision', float(precisionscore))

# plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
fig = plt.figure(figsize=(6, 4))
# Plot the diagonal 50% line
plt.plot([0, 1], [0, 1], 'k--')
# Plot the FPR and TPR achieved by our model
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
run.log_image(name = "ROC", plot = fig)
plt.show()

#plot confusion matrix
fig = plt.figure(figsize=(6,6))
cm = confusion_matrix(y_test, y_pred )
plt.title('Heatmap of Confusion Matrix', fontsize = 12)
sns.heatmap(cm, annot = True ,  fmt = "d")
##run.log_confusion_matrix(name = "Heatmap of Confusion Matrix", value = cm)
run.log_image(name='Confusion Matrix LGBM', plot=fig)


# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'adult_income_model.pkl')
joblib.dump(value=clf, filename=model_file)

# The classifier was fitted on standardised features, so the fitted scaler is
# stored next to it; anything scoring with the model needs the same transform.
scaler_file = os.path.join('outputs', 'adult_income_scaler.pkl')
joblib.dump(value=sc, filename=scaler_file)

# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'adult_income_model',
               tags={'Training context':'Pipeline'},
               properties={'AUC': float(auc), 'Accuracy': float(acc), 'Precision': float(precisionscore)})

# Get explanation
explainer = TabularExplainer(clf, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)

# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation, comment='Tabular Explanation')

print('Completed the training')
run.complete()

Overwriting train_adult_salary.py


## Creating the pipeline steps

In [41]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Dataset
from azureml.data.datapath import DataPath
# Folder that holds the pipeline step scripts (the current directory)
experiment_folder = './'
# creating env
from azureml.core import Experiment, ScriptRunConfig, Environment
experiment_env = Environment.from_conda_specification("adult_classification_env", "D:/Adult Classification Project/Adult-Income-Classification/environment.yml")

# Register the environment, then read the registered definition back
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'adult_classification_env')

# get compute cluster
pipeline_cluster = ComputeTarget(workspace=ws, name='classification-cpu-cluster')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Run the steps on the compute cluster retrieved above
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

# Get the training dataset.
# Dataset.get_by_name is used (as in the data-loading cells) so a missing
# dataset is reported here; ws.datasets.get(...) would return None and only
# fail later at as_named_input().
adult_income_ds = Dataset.get_by_name(ws, name='salary_classification')

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed
# from the data prep step to the training step
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, log descriptive statistics of the raw data
analyse_step = PythonScriptStep(name = "Data Stats",
                                source_directory = experiment_folder,
                                script_name = "dataprep_experiment.py",
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the data prep script
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_adult_income.py",
                                arguments = ['--input-data', adult_income_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 3, run the training script on the output of the data prep step
train_step = PythonScriptStep(name = "Normalize,Split,Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "train_adult_salary.py",
                                arguments = ['--training-data', prepped_data.as_input()],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [42]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the previously defined steps into a pipeline
pipeline_steps = [analyse_step, prep_step, train_step]
pipeline = Pipeline(ws, pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under its own experiment, forcing every step to regenerate its outputs
experiment = Experiment(ws, 'adult-income-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")

# Show the run widget, then block (streaming output) until the pipeline finishes
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Data Stats [c23a027f][11d5f528-0c92-4641-ab12-29533324d6d5], (This step will run and generate new outputs)
Created step Prepare Data [9ed5f5c0][81a82bde-e304-4225-8be8-65a4bb6bb1f3], (This step will run and generate new outputs)
Created step Normalize,Split,Train and Register Model [60dd12c3][ab6781c7-23c2-46f7-ab08-19608c0a48c4], (This step will run and generate new outputs)
Submitted PipelineRun f2448ead-11c0-4888-9e33-f38402166c5c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f2448ead-11c0-4888-9e33-f38402166c5c?wsid=/subscriptions/2ff60c86-4824-4a6a-bb33-beb7c5b60b23/resourcegroups/machinelearning/workspaces/akashbws&tid=191c77d0-3f43-488a-819a-6a562723ccde
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: f2448ead-11c0-4888-9e33-f38402166c5c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/f2448ead-11c0-4888-9e33-f38402166c5c?wsid=/subscriptions/2ff60c86-4824-4a6a-bb33-beb7c5b60b23/resourcegroups/machinelearning/workspaces/akashbws&tid=191c77d0-3f43-488a-819a-6a562723ccde
PipelineRun Status: Running


StepRunId: 910190b9-d1fc-4068-849d-5d8254e14a6c
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/910190b9-d1fc-4068-849d-5d8254e14a6c?wsid=/subscriptions/2ff60c86-4824-4a6a-bb33-beb7c5b60b23/resourcegroups/machinelearning/workspaces/akashbws&tid=191c77d0-3f43-488a-819a-6a562723ccde
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': '910190b9-d1fc-4068-849d-5d8254e14a6c', 'target': 'classification-cpu-cluster', 'status': 'Completed', 'startTimeUtc': '2022-07-21T15:57:00.922374Z', 'endTimeUtc': '2022-07-21T16:00:29.064109Z', 'services': {}, 'properties': {'ContentSn

'Finished'