In [41]:
# Step 1 : Check where your current working directory. This is very important
import os

# get current working directory
os.getcwd() 

# if by any chance, the current working directory is not the correct one.
# you can use os.chdir(r"the working directory you want") to change it


'c:\\Users\\suhaimi\\Desktop\\Machine Learning Week 2\\Boston House'

In [42]:
# Step 2: Set up ML Flow Experiment
# to import ML Flow
import mlflow

# to create experiment
# mlflow.create_experiment("BostonHousing_Project") # it can run only once, comment if already run once

# to use the experiment / activate the experiment
mlflow.set_experiment("BostonHousing_Project")

# set a tag for the experiment
mlflow.set_experiment_tag("release.version","2.16.2")

In [43]:
# Step 3: Machine Learning Workflow
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

# Data Loading 

column_name = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B 1000','LSTAT','MEDV']

df = pd.read_csv('Dataset/housing.csv', delim_whitespace=(True) ,names=column_name)
df.info()
df.describe()

# Data Preprocessing

X = df
y = X.pop('MEDV')

# train test split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

# Normalize our data (feature scaling)
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

y_train = y_train.values
y_test = y_test.values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B 1000   506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


  df = pd.read_csv('Dataset/housing.csv', delim_whitespace=(True) ,names=column_name)


In [47]:
# Step 4 Running Model Training with MLFlow

# import machine learning packages
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_percentage_error,root_mean_squared_error

# Start the MLFlow
mlflow.start_run(run_name='LinearRegression n-2')

# perform model training
n_jobs = 2
lr = LinearRegression(n_jobs=n_jobs)
lr.fit(X_train,y_train)

# calculate the score 
y_pred = lr.predict(X_test)
r2 = r2_score(y_test,y_pred)
MAE = mean_absolute_percentage_error(y_test,y_pred)
RMSE = root_mean_squared_error(y_test,y_pred)

# Log the score into MLFlow

mlflow.log_metrics({
    'R2 Score':r2,
    'Mean Absolute Percentage Error':MAE,
    'Root Mean Squared Error': RMSE
})

mlflow.log_param("n_jobs", n_jobs)

# End MLFlow run

mlflow.end_run()


In [50]:
# End MLFlow run

mlflow.end_run()

In [48]:
last_run = mlflow.last_active_run()
last_run_id = last_run.info.run_id

# save model
mlflow.sklearn.log_model(
    sk_model=lr,
    artifact_path="sklearn-model",
    registered_model_name = "sk-learn-lr-model",

)

Registered model 'sk-learn-lr-model' already exists. Creating a new version of this model...
Created version '2' of model 'sk-learn-lr-model'.


<mlflow.models.model.ModelInfo at 0x2469c30cd60>

In [51]:
# Step 4 Advance

# use mlflow to do autologging

with mlflow.start_run() as run:

    # to initiate autolog
    mlflow.sklearn.autolog()

    # perform model training
    n_jobs = 4

    lr = LinearRegression(n_jobs=n_jobs)
    lr.fit(X_train,y_train)




In [52]:
from sklearn.linear_model import Lasso,Ridge


In [None]:
# use ML Flow in your pipeline

# create a model dictionary

import mlflow.sklearn


model_dict = {"LinearRegression": LinearRegression(), "Lasso":Lasso(), "Ridge":Ridge()}

# create empty pipelines
pipelines = []

for model_name, mode_class in model_dict.items():
    pipeline = Pipeline([
        ('Scaler', StandardScaler()),
        (model_name,mode_class)
    ])
    # push 'pipeline to your empty pipelines list
    pipelines.append(pipeline)


def train_evaluate(pipeline,X_train,y_train,X_test,y_test):
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    r2 = pipeline.score(X_test,y_test)
    MAE = mean_absolute_percentage_error(y_test,y_pred)
    RMSE = root_mean_squared_error(y_test,y_pred)

r2_list = []
MAE_list = []
RMSE_list = []

for i,pipeline in enumerate(pipeline):
    with mlflow.start_run(run_name=" "):
        mlflow.sklearn.autolog()
        print("Training and evaluating pipeline #",i)
        print("Steps:", pipeline.steps)

        r2, MAE, RMSE = train_evaluate(pipeline, X_train, y_train, X_test, y_test)
        print("R2 Score:", r2)
        print("MAE: ", MAE)
        print("RMSE:", RMSE)

        r2_list.append(r2)
        MAE_list.append(MAE)
        RMSE_list.append(RMSE)

