<a href="https://colab.research.google.com/github/FranciscoOcampoPredictiva/azureml_course/blob/main/Model_Training_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the class
from azureml.core import Run

In [None]:
# Get the run context: the handle for the current Azure ML run.
# It is used further down to log metrics and to mark the run as complete.
new_run = Run.get_context()

In [None]:
# Access the workspace, reached through the experiment this run belongs to.
# NOTE(review): `ws` is not referenced again in the visible part of the
# script — confirm it is still needed.
ws = new_run.experiment.workspace

In [None]:
# Get the arguments from the pipeline job.
# `--datafolder` is the directory that holds the output of the previous step;
# it is joined with 'churn_prep.csv' when the data is read.
from argparse import ArgumentParser as AP
parser = AP()
# The argument is mandatory: without it args.datafolder would be None and the
# script would only fail later, with an unrelated-looking TypeError raised by
# os.path.join. Failing here gives a clear usage message instead.
parser.add_argument(
    '--datafolder',
    type=str,
    required=True,
    help='Folder that contains churn_prep.csv',
)
args = parser.parse_args()  # Parsed command-line arguments

In [None]:
# Load the prepared churn data produced by the previous step
import os
import pandas as pd

# The CSV sits inside the folder received through --datafolder
prep_filename = 'churn_prep.csv'
path = os.path.join(args.datafolder, prep_filename)
dataPrep = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Separate the features (X) from the target column (Y)
target_columns = ['Exited']
X = dataPrep.drop(columns=target_columns)
Y = dataPrep[target_columns]  # list selection keeps Y as a one-column DataFrame

In [None]:
# Hold out 30% of the rows as the test set.
# The split is stratified on Y and uses a fixed random_state so it is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=0,
)

In [None]:
# Build and train the Logistic Regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# Y_train is a one-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning otherwise, so flatten it before fitting.
classifier.fit(X_train, Y_train.values.ravel())

In [None]:
# Predict the output - Scored Label
# Predictions are made for the held-out X_test rows only.
Y_predict = classifier.predict(X_test)

In [None]:
# Scored Probabilities
# Keeps only the second column (index 1) of the predict_proba output.
# NOTE(review): presumably that column is the probability of the positive
# ('Exited') class — confirm against classifier.classes_.
Y_prob = classifier.predict_proba(X_test)[:, 1]

In [None]:
# Evaluate on the test set: confusion matrix and accuracy score
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)
score = classifier.score(X=X_test, y=Y_test)

In [None]:
# Log metrics to the run

# Confusion matrix payload: class labels plus the matrix as nested lists
cm_data = {"class_labels": ["N", "Y"],
           "matrix": cm.tolist()}

# Wrap the payload in the dictionary handed to log_confusion_matrix
cm_dict = {
    "schema_type": "confusion_matrix",
    "schema_version": "v1",
    "data": cm_data,
}

total_observations = len(dataPrep)  # number of rows in the prepared data

new_run.log('TotalObservations', total_observations)
new_run.log_confusion_matrix('ConfusionMatrix', cm_dict)
new_run.log('Score', score)

In [None]:
# Assemble the scored dataset: test features, true label, predicted label
# and predicted probability side by side

# Reset the row labels of both test frames so they match the default index
# of the two prediction frames built below
X_test, Y_test = (frame.reset_index(drop=True) for frame in (X_test, Y_test))

Y_predict_df = pd.DataFrame(data=Y_predict, columns=['Scored Label'])
Y_prob_df = pd.DataFrame(data=Y_prob, columns=['Scored Probabilities'])

scored_pieces = [X_test, Y_test, Y_predict_df, Y_prob_df]
scored_dataset = pd.concat(scored_pieces, axis=1)

In [None]:
# Upload the scored dataset by writing it into the ./outputs folder
import os

# Make sure the folder exists first: to_csv does not create missing
# directories and would otherwise fail with an OSError.
os.makedirs('./outputs', exist_ok=True)
scored_dataset.to_csv('./outputs/churn_scored.csv', index=False)

In [None]:
# Complete the run: called last, after the metrics have been logged and the
# scored dataset has been written.
new_run.complete()