# Model Build Notebook
This notebook walks through all the steps needed to build a model.  It should be scheduled to run on a regular basis, aligned with the Validation notebook.

You will need to update the cells in this notebook that relate to your model build, metrics, etc.  You will also need to update any imports related to the model building process.  The sample provided demonstrates XGBoost with the iris dataset.



-----------------------------------
## Step 1
#### Import the packages required for the model build process.


In [None]:
import time
import pandas as pd
import xgboost as xgb
from pyspark.ml.feature import StringIndexer
import sklearn
import mlflow
from mlflow.entities import ViewType
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score


-----------------------------------
## Step 2
### Start an Experiment and Register with MLFlow

In [None]:
# Look up this notebook's configuration row in MLOpsConfig, keyed by the
# notebook's own name, and use it to select the MLflow experiment.
NOTEBOOK_NAME = mssparkutils.runtime.context['currentNotebookName']

# Filter through the DataFrame API so the notebook name is compared as a value
# instead of being spliced into SQL text; a name containing a quote character
# would otherwise produce a malformed query.
config_table = spark.table("MLOpsConfig")
df = config_table.filter(config_table["notebook"] == NOTEBOOK_NAME)
df2 = df.toPandas()

# Fail with an actionable message rather than an IndexError from `.values[0]`
# when no configuration row matches this notebook.
if df2.empty:
    raise ValueError(
        "No MLOpsConfig row found for notebook '{}'".format(NOTEBOOK_NAME)
    )

# If several rows match, the first one is used.
EXPERIMENT_NAME = df2.loc[:, "experiment"].values[0]

mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.autolog()


-----------------------------------
## Step 3
### Read the dataset to be used for the model build.

In [None]:
# Load the transformed iris dataset from parquet into a Spark DataFrame.
# NOTE(review): "header" is normally a CSV reader option; it is kept here
# unchanged - confirm whether the parquet reader actually needs it.
iris_path = "Files/iris/transformed_iris.parquet"
parquet_reader = spark.read.format("parquet").option("header", "true")
df = parquet_reader.load(iris_path)

-----------------------------------
## Step 4 
### Data splitting/Create xgb DMatrix

In [None]:
# Columns fed to the model as features.
feature_columns = ["sepal length", "sepal width", "petal length", "petal width"]

# Collect the Spark DataFrame into pandas, then split it into training and
# test sets (train_test_split is called with its default settings).
df = df.toPandas()
training_df, test_df = train_test_split(df)

# Wrap the training features and their labels in an XGBoost DMatrix.
train_features = training_df[feature_columns]
train_labels = training_df["variety_index"]
dtrain = xgb.DMatrix(train_features, label=train_labels)


-----------------------------------
## Step 5
### Train Model

In [None]:
mlflow.autolog(exclusive=False)

# Name the run after the current local date/time.
run_name = time.strftime("%Y%m%d-%H%M%S")

# Feature and label columns used for training and scoring.
feature_cols = ["sepal length", "sepal width", "petal length", "petal width"]
label_col = "variety_index"

# Train, score and log everything inside a single MLflow run.
with mlflow.start_run(run_name=run_name):

    # Derive the class count from the labels instead of hard-coding it.
    # Both splits are consulted so a class that only landed in the test split
    # is still counted.
    # NOTE(review): assumes variety_index holds zero-based class ids - confirm
    # against the transformation step that produces the parquet file.
    highest_label = max(training_df[label_col].max(), test_df[label_col].max())

    # Booster parameters. These are logged explicitly as run parameters.
    param = {
        'max_depth': 3,
        'eta': 1,
        # `silent` was removed from XGBoost; `verbosity` 0 is the silent level.
        'verbosity': 0,
        'objective': 'multi:softmax',
        'nthread': 4,
        # multi:softmax outputs class labels rather than probabilities, so a
        # probability-based metric such as auc does not apply to it.
        'eval_metric': 'mlogloss',
        'num_class': int(highest_label) + 1,
    }
    mlflow.log_params(param)

    # Train
    num_round = 10
    bst = xgb.train(param, dtrain, num_round)

    # Predict class labels for the held-out test rows.
    dtest = xgb.DMatrix(test_df[feature_cols])
    ypred = bst.predict(dtest)

    # Score and write metrics to MLFlow.  We will use these to consider whether or not to promote this to the next environment.
    pre_score = precision_score(test_df[label_col], ypred, average='micro')
    print("xgb_pre_score:",pre_score)
    mlflow.log_metric('xgb_pre_score', pre_score)
