In [None]:
import mlflow

mlflow.__version__

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### TODO Recording

- Open up https://openml.org/ and scroll and show the page
(Open ML can help you train models and run pipelines but we will only use the dataset)
- Search for housing and see that there are many datasets available

### OpenML 

https://openml.org/

https://scikit-learn.org/stable/datasets/loading_other_datasets.html#openml

In [None]:
from sklearn.datasets import fetch_openml

# Download version 1 of the "house_sales" dataset from OpenML.
# return_X_y = False gives back a single container object; as_frame = True
# exposes its `.data` attribute as a pandas DataFrame (used throughout below).
house_price_df = fetch_openml(
    name = 'house_sales',
    version = 1,
    parser = 'auto',
    as_frame = True,
    return_X_y = False,
)

house_price_df.data.head()

In [None]:
# Remove the 'date' and 'zipcode' columns, which are not used in this analysis.
# The result is reassigned (instead of dropping in place) and errors = 'ignore'
# is passed so that re-running this cell after the columns are already gone is
# a no-op rather than a KeyError.
house_price_df.data = house_price_df.data.drop(
    columns = ['date', 'zipcode'], errors = 'ignore')

house_price_df.data.sample(5)

In [None]:
# (rows, columns) before duplicate rows are removed in the next cell
house_price_df.data.shape

In [None]:
# Keep only the first occurrence of each fully duplicated row, then show the
# resulting (rows, columns) so it can be compared with the previous cell.
house_price_df.data = house_price_df.data.drop_duplicates(keep = 'first')

house_price_df.data.shape

In [None]:
# Derive two age features relative to the year 2015.
# 'renovation_age' will be 2015 (a large number) for homes that have never
# been renovated.
house_price_df.data = house_price_df.data.assign(
    age = lambda frame: 2015 - frame['yr_built'],
    renovation_age = lambda frame: 2015 - frame['yr_renovated'],
)

house_price_df.data.sample(5)

In [None]:
# Histogram of sale prices. The figure handle `fig1` is kept because it is
# logged to MLflow as an artifact further down.
fig1, ax1 = plt.subplots(figsize = (8, 6))

sns.histplot(data = house_price_df.data, x = 'price', ax = ax1)
plt.show()

In [None]:
# Box plot of sale prices. `fig2` is logged to MLflow as an artifact further
# down.
fig2, ax2 = plt.subplots(figsize = (8, 6))

sns.boxplot(data = house_price_df.data, y = 'price', ax = ax2)
plt.show()

In [None]:
# Bar plot of price grouped by the 'waterfront' column. `fig3` is logged to
# MLflow as an artifact further down.
fig3, ax3 = plt.subplots(figsize = (8, 6))

sns.barplot(data = house_price_df.data, x = 'waterfront', y = 'price', ax = ax3)
plt.show()

In [None]:
# Scatter plot with a fitted regression line of price against living area.
# `fig4` is logged to MLflow as an artifact further down.
fig4, ax4 = plt.subplots(figsize = (8, 6))

sns.regplot(x = 'sqft_living', y = 'price', data = house_price_df.data, ax = ax4)
plt.show()

#### Selecting a few features to keep heatmap easy to read

In [None]:
# Subset of columns shown in the correlation heatmap
selected_features = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_above',
    'age', 'renovation_age',
]

# Pairwise correlations between the selected columns
cormap = house_price_df.data.loc[:, selected_features].corr()

# `fig5` is logged to MLflow as an artifact further down
fig5, ax5 = plt.subplots(figsize = (12, 8))

sns.heatmap(cormap, annot = True, ax = ax5)
plt.show()

Summarisation of data can be done using Profiling

In [None]:
pip install pandas_profiling

In [None]:
pip install ipywidgets

### TODO Recording

- Show the Overview
- Scroll and show that we have details for every variable


- Click on Alerts
- Show the text for highly correlated variables
- Scroll below
- In the "Selected Columns" select "price"
- Under Interactions - select "price" on the left and choose "sqft_living", "grade", and "floors" on the right


- Keep scrolling and show "Correlations", "Missing Values" and "Sample"


In [None]:
import ydata_profiling as pp

# Build a profiling report for the whole (cleaned) DataFrame and display it
# as the cell output.
profile = pp.ProfileReport(df = house_price_df.data)

profile

### TODO Recording

- After saving the file - switch to Jupyter Home (http://localhost:8888)
- Show the file has been saved
- Click on the HTML file and show that you have the entire analysis of your data

In [None]:
# Save the profiling report as an HTML file (relative path, so it lands in the
# current working directory)
profile.to_file("house_price_profile_report.html")

### TODO Recording

- Go to the MLflow UI
- Show that it is still empty
- Come back to the notebook

### MLflow experiment and runs

Creating MLflow experiment

In [24]:
# Create the experiment, or reuse it when it already exists.
# mlflow.create_experiment() raises if an experiment with this name is already
# present, which would make this cell fail on every run after the first one.
_existing_experiment = mlflow.get_experiment_by_name('kc_house_price_prediction')

if _existing_experiment is None:
    experiment_id = mlflow.create_experiment(name = 'kc_house_price_prediction')
else:
    experiment_id = _existing_experiment.experiment_id

### TODO Recording

- Go to the UI - show that a new experiment can be seen there
- Go to the terminal window in the current working directory (where the notebook is present)
- Run

`ls -l`

- Show that we have a new mlruns/ folder
- Run

`cd mlruns`

`ls -l`

- Show that there is a directory with the experiment ID

We are removing 'sqft_above' as it has a very high correlation (0.88) with 'sqft_living'
Remove bathrooms (highly correlated with bedrooms)
Remove sqft_living15 (highly correlated with sqft_living)

Can remove others if you want


In [19]:
# Drop the columns that are redundant with features we keep ('age' and
# 'renovation_age' were derived from the two yr_* columns; the others are
# highly correlated with retained features).
# Reassigning with errors = 'ignore' keeps the cell idempotent: re-running it
# once the columns are gone does nothing instead of raising a KeyError.
house_price_df.data = house_price_df.data.drop(
    columns = ['yr_built', 'yr_renovated', 'sqft_above', 'bathrooms', 'sqft_living15'],
    errors = 'ignore')

In [20]:
# (rows, columns) after dropping the highly correlated columns
house_price_df.data.shape

(21608, 15)

In [21]:
# Columns that remain after the drop above
house_price_df.data.columns

Index(['price', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
       'view', 'condition', 'grade', 'sqft_basement', 'lat', 'long',
       'sqft_lot15', 'age', 'renovation_age'],
      dtype='object')

Creating a features.txt as artifact file

In [22]:
# Build the feature list from the DataFrame itself (every column except the
# target 'price') instead of a hand-typed string, so features.txt cannot get
# out of sync if more columns are dropped above.
features = ", ".join(
    "'{}'".format(column)
    for column in house_price_df.data.columns
    if column != 'price'
)

# Written to the working directory; logged later with mlflow.log_artifact()
with open("features.txt", "w") as f:
    f.write(features)

If you have checked the MLflow UI, there is one Default experiment with id=0. Get the details of this experiment

In [25]:
# Look up the Default experiment (id "0") and print its main attributes,
# one per line.
experiment = mlflow.get_experiment("0")

print(
    "Name: {}".format(experiment.name),
    "Artifact Location: {}".format(experiment.artifact_location),
    "Tags: {}".format(experiment.tags),
    "Lifecycle_stage: {}".format(experiment.lifecycle_stage),
    "Creation timestamp: {}".format(experiment.creation_time),
    sep = "\n",
)

Name: Default
Artifact Location: file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/0
Tags: {}
Lifecycle_stage: active
Creation timestamp: 1687239878218


Note that we can also create an experiment through the MLflow UI. Here, we are creating an MLflow experiment programmatically with the name 'kc_house_price_prediction'.

An experiment can also be created using the MlflowClient API

In [26]:
# Alternative way to create the same experiment through the client API.
# Left commented out: the experiment 'kc_house_price_prediction' has already
# been created above with mlflow.create_experiment().
# from mlflow import MlflowClient
# client = MlflowClient()
# client.create_experiment(name = 'kc_house_price_prediction')

As given in docs, we can obtain experiment related details

In [27]:
# Look up the experiment created earlier by its id and print its main
# attributes, one per line. `experiment` is reused by the search_runs cells
# further down.
experiment = mlflow.get_experiment(experiment_id)

print(
    "Name: {}".format(experiment.name),
    "Experiment_id: {}".format(experiment.experiment_id),
    "Artifact Location: {}".format(experiment.artifact_location),
    "Tags: {}".format(experiment.tags),
    "Lifecycle_stage: {}".format(experiment.lifecycle_stage),
    "Creation timestamp: {}".format(experiment.creation_time),
    sep = "\n",
)

Name: kc_house_price_prediction
Experiment_id: 130781408257073055
Artifact Location: file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/130781408257073055
Tags: {}
Lifecycle_stage: active
Creation timestamp: 1687240534045


mlflow.get_tracking_uri() returns the current tracking URI.

In [28]:
# Show the tracking URI MLflow is currently using
mlflow.get_tracking_uri()

'file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns'

mlflow.set_experiment() sets an experiment as active. If the experiment does not exist, creates a new experiment. If you do not specify an experiment in mlflow.start_run(), new runs are launched under this experiment.

In [29]:
# Make 'kc_house_price_prediction' the active experiment, so runs started
# below without an explicit experiment are recorded under it
mlflow.set_experiment(experiment_name = 'kc_house_price_prediction')

<Experiment: artifact_location='file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/130781408257073055', creation_time=1687240534045, experiment_id='130781408257073055', last_update_time=1687240534045, lifecycle_stage='active', name='kc_house_price_prediction', tags={}>

mlflow.start_run() returns the currently active run (if one exists), or starts a new run and returns a mlflow.ActiveRun object usable as a context manager for the current run. You do not need to call start_run explicitly: calling one of the logging functions with no active run automatically starts a new one.

If the argument run_name is not set within mlflow.start_run(), a unique run name will be generated for each run.

In [30]:
# Start a run without a run_name (MLflow generates one). The run is left open
# at the end of this cell; mlflow.end_run() is called in a later cell.
mlflow.start_run()

# Log two of the EDA figures as image artifacts of the active run
mlflow.log_figure(fig1, 'histogram_price.png')
mlflow.log_figure(fig2, 'boxplot_price.png')

### TODO Recording

- Go to the UI and click on the kc_house_price_prediction experiment
- Note there is a new run
- Click on the run and under artifacts you should see the two images that we have logged
- Click on those images and show

If we try to start a new run without ending the previous run, an exception is raised.

In [31]:
# Close the run that was left open by the earlier mlflow.start_run() call
mlflow.end_run()

Setting some reasonable name for run and logging additional figures

In [32]:
# Start an explicitly named run and attach all five EDA figures to it.
mlflow.start_run(run_name = 'exploratory_data_analysis')

mlflow.log_figure(fig1, 'histogram_price.png')
mlflow.log_figure(fig2, 'boxplot_price.png')
# fig3 was drawn with sns.barplot, so the artifact is named as a bar plot
# (it was previously mislabelled as a box plot).
mlflow.log_figure(fig3, 'barplot_waterfront_vs_price.png')
mlflow.log_figure(fig4, 'regplot_sqft_living_vs_price.png')
mlflow.log_figure(fig5, 'correlation_heatmap.png')

# Close the run so that the next mlflow.start_run() can open a new one
mlflow.end_run()

### TODO Recording:

- Go to the experiment, there should be a new run called "exploratory_data_analysis"
- Show that these artifacts are logged under that run

Logging feature names as an artifact

In [33]:
# Start a named run and attach the features.txt file written earlier.
# The run is left open here; it is inspected and ended in the next cell.
mlflow.start_run(run_name = 'features_as_artifacts')

mlflow.log_artifact("features.txt")

Getting the active run id and also ending the active run

In [34]:
# Fetch the run that is currently active (started in the previous cell)
run = mlflow.active_run()

print('Active run_id: {}'.format(run.info.run_id))

# Close the active run
mlflow.end_run()

Active run_id: 4ae512191c4f4b6f8dadc32bf2103bdd


### TODO Recording:

- Go to the UI
- Show yet another run called "features_as_artifacts"
- Click on the run and show the text file is logged as part of that run
- Note the run ID

The run remains open throughout the with statement, and is automatically closed when the statement exits, even if it exits due to an exception.


In [35]:
# Using start_run() as a context manager: the run is closed automatically when
# the block exits. `current_run` is used in a later cell to look the run up.
with mlflow.start_run(run_name = 'eda_plots_and_features_as_artifacts') as current_run:

    mlflow.log_figure(fig1, 'histogram_price.png')
    mlflow.log_figure(fig2, 'boxplot_price.png')
    # fig3 was drawn with sns.barplot, so the artifact is named as a bar plot
    # (it was previously mislabelled as a box plot).
    mlflow.log_figure(fig3, 'barplot_waterfront_vs_price.png')
    mlflow.log_figure(fig4, 'regplot_sqft_living_vs_price.png')
    mlflow.log_figure(fig5, 'correlation_heatmap.png')

    mlflow.log_artifact("features.txt")

In [36]:
# Show the most recent run; with no run currently open this is the run that
# was just closed by the `with` block above
mlflow.last_active_run()

<Run: data=<RunData: metrics={}, params={}, tags={'mlflow.runName': 'eda_plots_and_features_as_artifacts',
 'mlflow.source.name': '/Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/mlflow_venv/lib/python3.10/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'vitthalsrinivasan'}>, info=<RunInfo: artifact_uri='file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/130781408257073055/1f68d4163e814a478314c22deb7cf70a/artifacts', end_time=1687241198077, experiment_id='130781408257073055', lifecycle_stage='active', run_id='1f68d4163e814a478314c22deb7cf70a', run_name='eda_plots_and_features_as_artifacts', run_uuid='1f68d4163e814a478314c22deb7cf70a', start_time=1687241197799, status='FINISHED', user_id='vitthalsrinivasan'>, inputs=<RunInputs: dataset_inputs=[]>>

In [38]:
# Look up the run from the `with` block through the client API and show its
# data (metrics, params, tags). `client` is reused by later cells.
client = mlflow.MlflowClient()
data = client.get_run(run_id = current_run.info.run_id).data

data

<RunData: metrics={}, params={}, tags={'mlflow.runName': 'eda_plots_and_features_as_artifacts',
 'mlflow.source.name': '/Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/mlflow_venv/lib/python3.10/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'vitthalsrinivasan'}>

### TODO Recording

- Go to the UI
- We now have 4 runs within the experiment
- Click on the latest one "eda_plots_and_features_as_artifacts"
- Show this has plots and the text file

List of experiments are obtained

In [39]:
# List the experiments available in the current tracking location
experiments = mlflow.search_experiments()

experiments

[<Experiment: artifact_location='file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/130781408257073055', creation_time=1687240534045, experiment_id='130781408257073055', last_update_time=1687240534045, lifecycle_stage='active', name='kc_house_price_prediction', tags={}>,
 <Experiment: artifact_location='file:///Users/vitthalsrinivasan/Desktop/iMovieLibrary/Skillsoft/MLOps/MLFlow/final_code/demo_01_TrackingExperimentRuns/mlruns/0', creation_time=1687239878218, experiment_id='0', last_update_time=1687239878218, lifecycle_stage='active', name='Default', tags={}>]

### Logging metrics during model training and evaluation

We can log each metric individually. After completion of the 'Simple Linear Regression' run, we can see that only metrics (plus one tag) are logged

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Fit a one-feature linear regression (price ~ sqft_living) and log each
# metric individually to a run named 'Simple Linear Regression'.
with mlflow.start_run(run_name = 'Simple Linear Regression') as lr_run:

    X = house_price_df.data.drop(columns = ['price'])
    y = house_price_df.data['price']

    # Single predictor used for the simple regression
    X_sqft_living = X['sqft_living']

    X_train, X_test, y_train, y_test = train_test_split(
        X_sqft_living, y, test_size = 0.3, random_state = 123)

    # The split yields 1-D Series; reshape them into single-column 2-D arrays
    # once and reuse them for fit / predict / score.
    X_train_2d = X_train.to_numpy().reshape(-1, 1)
    X_test_2d = X_test.to_numpy().reshape(-1, 1)

    lr_model = LinearRegression()
    lr_model.fit(X_train_2d, y_train)

    y_pred = lr_model.predict(X_test_2d)

    # Training and test metrics
    training_score = lr_model.score(X_train_2d, y_train)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    # Log model metrics one at a time
    mlflow.log_metric('Train_R2_score', training_score)
    mlflow.log_metric('Test_R2_score', test_score)
    mlflow.log_metric('Test_MAE', mean_abs_error)
    mlflow.log_metric('Test_RMSE', root_mean_sq_error)

    mlflow.set_tag('Regressor', 'Simple Linear Regression Model')

### TODO Recording:

- Go to the UI, new run called "Simple Linear Regression"
- Click through and show that the run has metrics and a tag (no artifacts, nothing else)

We can also view Model parameters and metrics using client api

In [42]:
# Read back what was recorded for the simple linear regression run.
client = mlflow.tracking.MlflowClient()
lr_run_data = client.get_run(lr_run.info.run_id).data

print('Model Parameters', lr_run_data.params)

print('Metrics', lr_run_data.metrics)

Model Parameters {}
Metrics {'Train_R2_score': 0.49166868058638125, 'Test_R2_score': 0.4950098469357982, 'Test_RMSE': 269918.61814241024, 'Test_MAE': 174998.84435471724}


We can also log parameters of model individually but we are logging both parameters and metrics together. We can also set tags 

In [45]:
# Fit a linear regression on all remaining features and log artifacts,
# parameters, metrics and a tag to a run named 'Multiple Linear Regression'.
with mlflow.start_run(run_name = 'Multiple Linear Regression') as mlr_run:

    # Log artifacts for EDA and features
    mlflow.log_figure(fig5, 'correlation_heatmap.png')
    mlflow.log_artifact("features.txt")

    X = house_price_df.data.drop(columns = ['price'])
    y = house_price_df.data['price']

    # X_train / X_test / y_train / y_test are reused by the model cells below
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size = 0.3, random_state = 123)

    lr_model = LinearRegression()

    lr_model.fit(X_train, y_train)

    y_pred = lr_model.predict(X_test)

    # Metrics
    training_score = lr_model.score(X_train, y_train)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    # Log all estimator parameters in one call
    params_reg = lr_model.get_params()
    mlflow.log_params(params_reg)

    # Log metrics as dictionary
    metrics = {
        'Train_R2_score': training_score,
        'Test_R2_score': test_score,
        'Test_MAE': mean_abs_error,
        'Test_RMSE': root_mean_sq_error
    }
    mlflow.log_metrics(metrics)

    mlflow.set_tag('Regressor', 'Multiple Linear Regression Model')

    # Read back what was just recorded (uses the `client` created earlier)
    print('Model Parameters', client.get_run(mlr_run.info.run_id).data.params)
    print('Metrics', client.get_run(mlr_run.info.run_id).data.metrics)

Model Parameters {'positive': 'False', 'copy_X': 'True', 'fit_intercept': 'True', 'n_jobs': 'None'}
Metrics {'Train_R2_score': 0.7014489032098928, 'Test_R2_score': 0.6706317206942887, 'Test_RMSE': 217987.84220380947, 'Test_MAE': 129193.79215236727}


### TODO Recording:

- Go to the UI and see the run "Multiple Linear Regression"
- Click through and show that the run has logged artifacts, parameters, metrics, and tags

Here we are applying polynomial regression model which includes interactions between features. Here we are logging only degree of polynomial as parameter

In [46]:
from sklearn.preprocessing import PolynomialFeatures

# Fit a linear regression on degree-2 polynomial features. Uses the
# X_train / X_test / y_train / y_test split created in the multiple linear
# regression cell above.
with mlflow.start_run(run_name = 'Polynomial Regression') as pr_run:

    degree = 2

    # Create polynomial features (fitted on the training split only, then
    # applied to the test split)
    poly = PolynomialFeatures(degree = degree)

    poly_features_train = poly.fit_transform(X_train)
    poly_features_test = poly.transform(X_test)

    polylr_model = LinearRegression()

    polylr_model.fit(poly_features_train, y_train)
    y_pred = polylr_model.predict(poly_features_test)

    # Metrics
    training_score = polylr_model.score(poly_features_train, y_train)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    # Log params (only the polynomial degree)
    mlflow.log_param('Degree of polynomial', degree)

    # Log metrics
    metrics = {
        'Train_R2_score': training_score,
        'Test_R2_score': test_score,
        'Test_MAE': mean_abs_error,
        'Test_RMSE': root_mean_sq_error
    }
    mlflow.log_metrics(metrics)

    mlflow.set_tag('Regressor', 'Polynomial Regression Model with degree 2')

    # Read back what was just recorded (uses the `client` created earlier)
    print('Model Parameters', client.get_run(pr_run.info.run_id).data.params)
    print('Metrics', client.get_run(pr_run.info.run_id).data.metrics)

Model Parameters {'Degree of polynomial': '2'}
Metrics {'Train_R2_score': 0.8053902037779248, 'Test_R2_score': 0.7761844602437731, 'Test_RMSE': 179695.21197049692, 'Test_MAE': 109155.83489548085}


### TODO Recording:

- Go to the UI
- Click on the new polynomial regression run
- Show everything that we have logged

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest with default hyperparameters on the split created in
# the multiple linear regression cell, and log all estimator parameters.
with mlflow.start_run(run_name = 'Random Forest Regression default params') as rf_run:

    # random_state is fixed so the metrics are reproducible across re-runs;
    # all other hyperparameters keep their defaults.
    rf_model = RandomForestRegressor(random_state = 123)

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    training_score = rf_model.score(X_train, y_train)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    # Log params
    params_reg = rf_model.get_params()
    mlflow.log_params(params_reg)

    # Log metrics
    metrics = {
        'Train_R2_score': training_score,
        'Test_R2_score': test_score,
        'Test_MAE': mean_abs_error,
        'Test_RMSE': root_mean_sq_error
    }
    mlflow.log_metrics(metrics)

    mlflow.set_tag('Regressor', 'RF default parameters')

    # Read back what was just recorded (uses the `client` created earlier)
    print('Model Parameters', client.get_run(rf_run.info.run_id).data.params)
    print('Metrics', client.get_run(rf_run.info.run_id).data.metrics)

### TODO Recording:

- Go to the UI
- Click on the new random forest with default params run
- Show everything that we have logged

We can also manually tune parameters and log only the tuned parameters

In [48]:
# Fit a random forest with manually chosen hyperparameters and log only
# those tuned values as parameters.
with mlflow.start_run(run_name = 'Random Forest Regression tuned params') as rft_run:

    n_estimators = 200
    max_depth = 10
    min_samples_split = 5
    min_samples_leaf = 2

    # random_state is fixed so the metrics are reproducible across re-runs;
    # it is not one of the tuned values and is therefore not logged below.
    rf_model = RandomForestRegressor(
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        random_state = 123)

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    training_score = rf_model.score(X_train, y_train)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    # Log params
    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf
    }
    mlflow.log_params(params)

    # Log metrics
    metrics = {
        'Train_R2_score': training_score,
        'Test_R2_score': test_score,
        'Test_MAE': mean_abs_error,
        'Test_RMSE': root_mean_sq_error
    }
    mlflow.log_metrics(metrics)

    mlflow.set_tag('Regressor', 'RF tuned parameters')

### TODO Recording:

- Go to the UI
- Click on the new random forest with tuned params run
- Show everything that we have logged

##### All runs in the experiment

- Now click on the experiment
- On the page click on Columns
- Show there are many columns that you can now include
- Select Test R2 Score and Regressor (under tags)

##### Sorting

- Sort based on Train_R2_score and show
- Sort based on Test_R2_score and show

We can also search for runs using 'search_runs' and get the results as a DataFrame sorted by a metric (here, Test_R2_score). The RF model with default parameters seems to have the highest R2 score

In [49]:
# All runs of the experiment as a DataFrame, best test R2 score first
df_run_metrics = mlflow.search_runs(
    experiment_ids = [experiment.experiment_id],
    order_by = ['metrics.Test_R2_score DESC'],
)

df_run_metrics

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Test_RMSE,metrics.Test_R2_score,metrics.Test_MAE,metrics.Train_R2_score,...,params.ccp_alpha,params.Degree of polynomial,params.copy_X,params.positive,params.fit_intercept,tags.mlflow.runName,tags.Regressor,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name
0,fcb5b5c40ff9452f8504c26d1f6a1570,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:47:34.130000+00:00,2023-06-20 06:47:40.046000+00:00,136575.529787,0.870711,71227.291341,0.982888,...,0.0,,,,,Random Forest Regression default params,RF default parameters,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
1,1c336d2db796447194d2c0fa3fbd2ac3,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:50:41.537000+00:00,2023-06-20 06:50:48.599000+00:00,145550.093229,0.853161,77809.56087,0.928586,...,,,,,,Random Forest Regression tuned params,RF-tuned_parameters,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
2,e7a64210f5ec4f52ba687e616cb65701,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:39:37.478000+00:00,2023-06-20 06:39:37.659000+00:00,179695.21197,0.776184,109155.834895,0.80539,...,,2.0,,,,Polynomial Regression,Polynomial Regression Model with degree 2,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
3,36b6952c5cef48a8841a1c1c65f50d04,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:33:59.802000+00:00,2023-06-20 06:33:59.959000+00:00,217987.842204,0.670632,129193.792152,0.701449,...,,,True,False,True,Multiple Linear Regression,Multiple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
4,47b6d2adc0ce4580baf9f02249302fa2,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:29:35.427000+00:00,2023-06-20 06:29:35.478000+00:00,217987.842204,0.670632,129193.792152,0.701449,...,,,True,False,True,Multiple Linear Regression,Multiple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
5,c8351c9cc69c4535839ff82d834bcbd6,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:26:22.284000+00:00,2023-06-20 06:26:22.303000+00:00,269918.618142,0.49501,174998.844355,0.491669,...,,,,,,Simple Linear Regression,Simple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
6,51b34e07d7b743b8af7469c523e581ac,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:13:35.893000+00:00,2023-06-20 06:13:35.930000+00:00,269918.618142,0.49501,174998.844355,0.491669,...,,,,,,Simple Linear Regression,Simple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
7,1f68d4163e814a478314c22deb7cf70a,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:06:37.799000+00:00,2023-06-20 06:06:38.077000+00:00,,,,,...,,,,,,eda_plots_and_features_as_artifacts,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
8,4ae512191c4f4b6f8dadc32bf2103bdd,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:04:50.060000+00:00,2023-06-20 06:04:57.067000+00:00,,,,,...,,,,,,features_as_artifacts,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
9,314af1d50e2f4d68a597c619ace6af5b,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:03:20.588000+00:00,2023-06-20 06:03:20.878000+00:00,,,,,...,,,,,,exploratory_data_analysis,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...


Deleting a specific run

In [50]:
# Pick the run to delete by its run name rather than by a fixed row label:
# which run sits at row 5 depends on how many runs exist and how they sort,
# so a hard-coded position can silently select an unrelated run.
# .iloc[0] takes the first matching row in the current sort order.
simple_lr_run_ids = df_run_metrics.loc[
    df_run_metrics['tags.mlflow.runName'] == 'Simple Linear Regression', 'run_id']

run_id_for_delete = simple_lr_run_ids.iloc[0]

run_id_for_delete

'c8351c9cc69c4535839ff82d834bcbd6'

We can easily delete any run using the UI. Here we show how to delete a run programmatically

In [51]:
# Delete the run whose id was selected in the previous cell
mlflow.delete_run(run_id_for_delete)

Checking the deletion

In [52]:
# Query the runs again (same ordering as before) to confirm that the deleted
# run no longer appears in the results
df_run_metrics = mlflow.search_runs(
    experiment_ids = [experiment.experiment_id],
    order_by = ['metrics.Test_R2_score DESC'],
)

df_run_metrics

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Test_RMSE,metrics.Test_R2_score,metrics.Test_MAE,metrics.Train_R2_score,...,params.ccp_alpha,params.Degree of polynomial,params.copy_X,params.positive,params.fit_intercept,tags.mlflow.runName,tags.Regressor,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.source.name
0,fcb5b5c40ff9452f8504c26d1f6a1570,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:47:34.130000+00:00,2023-06-20 06:47:40.046000+00:00,136575.529787,0.870711,71227.291341,0.982888,...,0.0,,,,,Random Forest Regression default params,RF default parameters,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
1,1c336d2db796447194d2c0fa3fbd2ac3,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:50:41.537000+00:00,2023-06-20 06:50:48.599000+00:00,145550.093229,0.853161,77809.56087,0.928586,...,,,,,,Random Forest Regression tuned params,RF-tuned_parameters,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
2,e7a64210f5ec4f52ba687e616cb65701,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:39:37.478000+00:00,2023-06-20 06:39:37.659000+00:00,179695.21197,0.776184,109155.834895,0.80539,...,,2.0,,,,Polynomial Regression,Polynomial Regression Model with degree 2,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
3,36b6952c5cef48a8841a1c1c65f50d04,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:33:59.802000+00:00,2023-06-20 06:33:59.959000+00:00,217987.842204,0.670632,129193.792152,0.701449,...,,,True,False,True,Multiple Linear Regression,Multiple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
4,47b6d2adc0ce4580baf9f02249302fa2,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:29:35.427000+00:00,2023-06-20 06:29:35.478000+00:00,217987.842204,0.670632,129193.792152,0.701449,...,,,True,False,True,Multiple Linear Regression,Multiple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
5,51b34e07d7b743b8af7469c523e581ac,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:13:35.893000+00:00,2023-06-20 06:13:35.930000+00:00,269918.618142,0.49501,174998.844355,0.491669,...,,,,,,Simple Linear Regression,Simple Linear Regression Model,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
6,1f68d4163e814a478314c22deb7cf70a,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:06:37.799000+00:00,2023-06-20 06:06:38.077000+00:00,,,,,...,,,,,,eda_plots_and_features_as_artifacts,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
7,4ae512191c4f4b6f8dadc32bf2103bdd,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:04:50.060000+00:00,2023-06-20 06:04:57.067000+00:00,,,,,...,,,,,,features_as_artifacts,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
8,314af1d50e2f4d68a597c619ace6af5b,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:03:20.588000+00:00,2023-06-20 06:03:20.878000+00:00,,,,,...,,,,,,exploratory_data_analysis,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...
9,76cc524594c34a5d9eb469d27eba779e,130781408257073055,FINISHED,file:///Users/vitthalsrinivasan/Desktop/iMovie...,2023-06-20 06:01:36.987000+00:00,2023-06-20 06:03:12.029000+00:00,,,,,...,,,,,,omniscient-worm-773,,LOCAL,vitthalsrinivasan,/Users/vitthalsrinivasan/Desktop/iMovieLibrary...


Runs can be searched using a filter string. Only runs with a Test R2 score above 0.8 are returned

In [55]:
# Only runs whose test R2 score exceeds 0.8, best first
df_run_metrics = mlflow.search_runs(
    experiment_ids = [experiment.experiment_id],
    filter_string = 'metrics.Test_R2_score > 0.8',
    order_by = ['metrics.Test_R2_score DESC'],
)

# Narrow the display to the identifying and scoring columns
summary_columns = [
    'run_id', 'metrics.Test_R2_score', 'metrics.Train_R2_score', 'tags.Regressor']

df_run_metrics[summary_columns]

Unnamed: 0,run_id,metrics.Test_R2_score,metrics.Train_R2_score,tags.Regressor
0,fcb5b5c40ff9452f8504c26d1f6a1570,0.870711,0.982888,RF default parameters
1,1c336d2db796447194d2c0fa3fbd2ac3,0.853161,0.928586,RF-tuned_parameters


Autologging of parameters and metrics can also be enabled by calling mlflow.autolog or mlflow.sklearn.autolog

https://mlflow.org/docs/latest/tracking.html#automatic-logging. By going to the specific run page, it can be seen that all regression parameters, training set metrics, tags, and artifacts (model, estimator, metric info) are autologged

https://learn.microsoft.com/en-gb/azure/databricks/mlflow/databricks-autologging#customize-logging-behavior

### Warning when this code is run

- Does not matter since our data does not have missing values
- Can be fixed by converting our int columns to float so we can deal with missing values (we will do that below)
- Can also fix this by inferring the signature of the model with sample data that has missing values in the int column

In [56]:
# Enable scikit-learn autologging: nothing is logged explicitly in this cell
# apart from the tag at the end.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name = 'Random Forest Regression tuned params autologged') as rftal_run:

    n_estimators = 200
    max_depth = 10
    min_samples_split = 5
    min_samples_leaf = 2

    # random_state is fixed so the results are reproducible across re-runs
    rf_model = RandomForestRegressor(
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        random_state = 123
    )

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    training_score = rf_model.score(X_train, y_train)

    # Test metrics - these will NOT be autologged (only training metrics will be autologged)
    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    mlflow.set_tag('Regressor', 'RF tuned parameters with autolog')



### TODO Recording

- Go to the UI - make sure you sort by the "Created" column in the descending order so the latest run is on top
- Click on the run and show that everything has been logged by default
- Open up each section and show

##### Artifacts

- Under Artifacts click on the model/ folder
- Show that we now have a serialized version of the trained model as an artifact. This can now be registered with the model registry and deployed for predictions
- Click on each file under the model/ folder and show

Explanations for each file here: https://www.mlflow.org/docs/latest/models.html

In [58]:
# Inspect column dtypes and non-null counts before converting ints to floats
house_price_df.data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21608 entries, 0 to 21612
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           21608 non-null  float64
 1   bedrooms        21608 non-null  int64  
 2   sqft_living     21608 non-null  int64  
 3   sqft_lot        21608 non-null  int64  
 4   floors          21608 non-null  float64
 5   waterfront      21608 non-null  int64  
 6   view            21608 non-null  int64  
 7   condition       21608 non-null  int64  
 8   grade           21608 non-null  int64  
 9   sqft_basement   21608 non-null  int64  
 10  lat             21608 non-null  float64
 11  long            21608 non-null  float64
 12  sqft_lot15      21608 non-null  int64  
 13  age             21608 non-null  int64  
 14  renovation_age  21608 non-null  int64  
dtypes: float64(4), int64(11)
memory usage: 3.1 MB


In [59]:
# Column names, used to build the list of columns to cast in the next cell
house_price_df.data.columns

Index(['price', 'bedrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
       'view', 'condition', 'grade', 'sqft_basement', 'lat', 'long',
       'sqft_lot15', 'age', 'renovation_age'],
      dtype='object')

In [67]:
# Columns to cast to float. 'lat' and 'long' are already float64 according to
# the info() output above, so the cast leaves them unchanged.
int_columns = [
    'bedrooms', 'sqft_living', 'sqft_lot', 'waterfront',
    'view', 'condition', 'grade', 'sqft_basement', 'lat', 'long',
    'sqft_lot15', 'age', 'renovation_age'
]

# Cast every listed column in a single astype call
house_price_df.data = house_price_df.data.astype(dict.fromkeys(int_columns, float))

house_price_df.data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21608 entries, 0 to 21612
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           21608 non-null  float64
 1   bedrooms        21608 non-null  float64
 2   sqft_living     21608 non-null  float64
 3   sqft_lot        21608 non-null  float64
 4   floors          21608 non-null  float64
 5   waterfront      21608 non-null  float64
 6   view            21608 non-null  float64
 7   condition       21608 non-null  float64
 8   grade           21608 non-null  float64
 9   sqft_basement   21608 non-null  float64
 10  lat             21608 non-null  float64
 11  long            21608 non-null  float64
 12  sqft_lot15      21608 non-null  float64
 13  age             21608 non-null  float64
 14  renovation_age  21608 non-null  float64
dtypes: float64(15)
memory usage: 3.1 MB


To log post training metrics, Autologging must be enabled before scikit-learn metric APIs are imported from sklearn.metrics. Metric APIs imported before autologging is enabled do not log metrics to MLflow runs.

Note that the warning has disappeared since we treat all numeric columns as floats

In [69]:
mlflow.sklearn.autolog()

# Re-imported AFTER autolog is enabled: metric functions imported before
# autologging was switched on do not log post-training metrics.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

with mlflow.start_run(run_name = 'Random Forest Regression tuned params and metrics autologged'):

    # Training and test data with integers converted to floats
    X = house_price_df.data.drop(columns = ['price'])
    y = house_price_df.data['price']

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size = 0.3, random_state = 123)

    n_estimators = 200
    max_depth = 10
    min_samples_split = 5
    min_samples_leaf = 2

    # random_state is fixed so the results are reproducible across re-runs
    rf_model = RandomForestRegressor(
        n_estimators = n_estimators,
        max_depth = max_depth,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        random_state = 123
    )

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    mean_abs_error = mean_absolute_error(y_test, y_pred)
    # RMSE computed as sqrt(MSE): the `squared = False` keyword of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    # NOTE(review): the value autologged for the mean_squared_error call is
    # therefore presumably the plain MSE, not the RMSE - confirm in the UI.
    root_mean_sq_error = mean_squared_error(y_test, y_pred) ** 0.5
    test_score = r2_score(y_test, y_pred)

    mlflow.set_tag('Regressor', 'RF tuned parameters and autologged metrics')

### TODO Recording

- Go to the UI and show the run
- Click through to the run and open up the "Metrics" section and show that the test metrics that we computed were autologged

### TODO Recording comparing runs

#### Linear regression runs
- In the MLflow UI
- Click on the experiment and show all the runs
- Select Multiple Linear Regression and Polynomial Regression and click on Compare
- In the Parallel Coordinates chart choose 
- ---- Parameters - degree of polynomial, Metrics - Test R2 score
- ---- Add another metric - Train R2 score

- Scroll and show the other differences between the runs
- Click on "diff only"


#### Random forest runs

- Go to experiment and see all runs
- Select Default RF and Tuned RF and click on Compare
- In the Parallel Coordinates chart choose 
- ---- Parameters - n_estimators, max_depth
- ----- Metrics - Test R2 score, Train R2 score
- Click on Scatter Plot
- ----- Choose n_estimators on X-axis
- ----- Choose Test_MAE on Y-axis

- Scroll and show the other differences