Compare runs, choose a model, and deploy it to a REST API

In [75]:
import keras
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK,Trials,fmin,hp,tpe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import mlflow
from mlflow.models import infer_signature


In [76]:
# Load the white-wine quality dataset from the MLflow repository.
# The file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv(
    "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-white.csv",
    sep=";",
)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [77]:
# The goal is to predict the wine's `quality` column.
# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=42, test_size=0.25)
train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
2835,6.3,0.25,0.22,3.30,0.048,41.0,161.0,0.99256,3.16,0.50,10.5,6
1157,7.8,0.30,0.29,16.85,0.054,23.0,135.0,0.99980,3.16,0.38,9.0,6
744,7.4,0.38,0.27,7.50,0.041,24.0,160.0,0.99535,3.17,0.43,10.0,5
1448,7.4,0.16,0.49,1.20,0.055,18.0,150.0,0.99170,3.23,0.47,11.2,6
3338,7.2,0.27,0.28,15.20,0.046,6.0,41.0,0.99665,3.17,0.39,10.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4426,6.2,0.21,0.52,6.50,0.047,28.0,123.0,0.99418,3.22,0.49,9.9,6
466,7.0,0.14,0.32,9.00,0.039,54.0,141.0,0.99560,3.22,0.43,9.4,6
3092,7.6,0.27,0.52,3.20,0.043,28.0,152.0,0.99129,3.02,0.53,11.4,6
3772,6.3,0.24,0.29,13.70,0.035,53.0,134.0,0.99567,3.17,0.38,10.6,6


In [78]:
# Peek at the target values of the held-out test split.
test["quality"].values

array([7, 8, 8, ..., 6, 7, 6])

## We keep the test data hidden: only the training data is split again into training and validation sets, so the model never sees the test data.

In [79]:
# ===================== Feature / Target Arrays =====================

# Every column except 'quality' is a feature; 'quality' itself is the target.
# Features become a 2-D NumPy array, the target a 1-D NumPy array.
X_train = train.drop(columns=["quality"]).values
y_train = train["quality"].values

# Same separation for the held-out test frame.
X_test = test.drop(columns=["quality"]).values
y_test = test["quality"].values

# ===================== Validation Hold-out =====================

# Carve 20% of the training rows off as a validation set (seeded for
# reproducibility); the test arrays above are not involved in this split.
X_train, valid_x, y_train, valid_y = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.20,
)

# ===================== MLflow Signature =====================

# Let MLflow infer an input/output schema from the training features and labels.
signature = infer_signature(X_train, y_train)
# -> signature helps MLflow understand the expected input/output shapes & types


In [80]:
# Display the training feature matrix as it stands after the validation split.
X_train

array([[ 5.8 ,  0.28,  0.35, ...,  3.28,  0.5 , 10.2 ],
       [ 7.1 ,  0.21,  0.28, ...,  3.35,  0.64, 10.2 ],
       [ 5.9 ,  0.32,  0.26, ...,  3.24,  0.36, 10.7 ],
       ...,
       [ 7.4 ,  0.2 ,  0.37, ...,  3.14,  0.61, 11.8 ],
       [ 8.1 ,  0.3 ,  0.31, ...,  2.99,  0.45, 11.1 ],
       [ 6.6 ,  0.36,  0.21, ...,  3.18,  0.41,  9.9 ]])

## We keep the test data hidden: only the training data is split again into training and validation sets, so the model never sees the test data.

In [81]:
# (n_samples, n_features) of the training feature matrix.
X_train.shape

(2938, 11)

In [82]:
# Column-wise (per-feature) mean of the training features.
X_train.mean(axis=0)


array([6.86621852e+00, 2.80377808e-01, 3.32597005e-01, 6.42164738e+00,
       4.55513955e-02, 3.53556841e+01, 1.38792376e+02, 9.94074221e-01,
       3.18919333e+00, 4.88396869e-01, 1.05005673e+01])

In [83]:
## ANN model
def train_model(params, epoch, X_train, y_train, valid_x, valid_y, X_test, y_test):
    """Train a small ANN regressor for one hyperparameter setting and log it to MLflow.

    Args:
        params: Mapping providing the SGD settings ``'lr'`` (learning rate)
            and ``'momentum'``; it is also logged to MLflow as run parameters.
        epoch: Number of training epochs.
        X_train: Training features; also the source of the normalization
            statistics and of the example row used for the model signature.
        y_train: Training targets.
        valid_x: Validation features.
        valid_y: Validation targets.
        X_test: Not used by this function.
        y_test: Not used by this function.

    Returns:
        dict with ``"loss"`` (the validation RMSE), ``"status"``
        (``STATUS_OK``) and ``"model"`` (the fitted Keras model).
    """
    # Per-feature statistics of the training data feed the normalization layer.
    feature_mean = np.mean(X_train, axis=0)
    feature_variance = np.var(X_train, axis=0)

    # Architecture: input -> normalization -> one hidden ReLU layer -> scalar output.
    layers = [
        keras.Input([X_train.shape[1]]),
        keras.layers.Normalization(mean=feature_mean, variance=feature_variance),
        keras.layers.Dense(64, activation="relu"),
        keras.layers.Dense(1),
    ]
    network = keras.Sequential(layers)

    # SGD driven by the sampled hyperparameters; MSE loss with RMSE reported as a metric.
    sgd = keras.optimizers.SGD(
        learning_rate=params["lr"],
        momentum=params["momentum"],
    )
    network.compile(
        optimizer=sgd,
        loss="mean_squared_error",
        metrics=[keras.metrics.RootMeanSquaredError()],
    )

    # Each call is tracked in its own nested MLflow run.
    with mlflow.start_run(nested=True):
        network.fit(
            X_train,
            y_train,
            batch_size=64,
            epochs=epoch,
            validation_data=(valid_x, valid_y),
        )

        # Score the fitted model on the validation set; index 1 is taken as the
        # RMSE (the single compiled metric, listed after the loss).
        validation_scores = network.evaluate(valid_x, valid_y, batch_size=64)
        eval_rmse = validation_scores[1]

        # Record the hyperparameters and the resulting validation score.
        mlflow.log_params(params)
        mlflow.log_metric("eval_rmse", eval_rmse)

        # One randomly chosen training row (cast to float32) serves as both the
        # signature sample and the logged input example.
        row_index = np.random.choice(X_train.shape[0], 1)
        sample_input = X_train[row_index].astype(np.float32)
        sample_output = network.predict(sample_input)
        signature = mlflow.models.signature.infer_signature(sample_input, sample_output)

        # Store the trained model as an artifact of this run.
        mlflow.tensorflow.log_model(
            network,
            "model",
            signature=signature,
            input_example=sample_input,
        )

        return {"loss": eval_rmse, "status": STATUS_OK, "model": network}

In [84]:
def objective(params):
    """Hyperopt objective: train one model for ``params`` and return its result.

    Args:
        params: Hyperparameter mapping sampled from the search space; passed
            straight through to ``train_model``.

    Returns:
        Whatever ``train_model`` returns for this setting.
    """
    # Training runs for a fixed 3 epochs on the module-level data splits;
    # train_model handles the MLflow tracking of each run.
    return train_model(
        params, 3, X_train, y_train, valid_x, valid_y, X_test, y_test
    )

In [85]:
# Hyperopt search space:
#   lr       -> log-uniform between 1e-5 and 1e-1
#   momentum -> uniform between 0.0 and 1.0
space = dict(
    lr=hp.loguniform("lr", np.log(1e-5), np.log(1e-1)),
    momentum=hp.uniform("momentum", 0.0, 1.0),
)

In [86]:
with mlflow.start_run():
    # Hyperparameter search with Hyperopt's TPE algorithm; every evaluation of
    # `objective` trains one model via `train_model`.
    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=4,
        trials=trials,
    )

    # Use the result of the trial Hyperopt itself ranks best, so the metric
    # logged below belongs to the same trial as the parameters in `best`.
    # Sorting the raw results by loss is unreliable when any loss is NaN.
    best_run = trials.best_trial["result"]

    # Log the winning hyperparameters and their validation RMSE on this run.
    # No model is logged here; train_model logs each trial's model itself.
    mlflow.log_params(best)
    mlflow.log_metric("best_eval_rmse", best_run["loss"])

    # Report the outcome of the search.
    print(f"Best paramters:{best}")
    print(f"best eval rmse:{best_run['loss']}")

InvalidUrlException: Invalid url: http:127.0.0.1//:5000/api/2.0/mlflow/runs/create

In [None]:
#mlflow.set_tracking_uri("http://127.0.0.1:5000")  # Left commented out. The URI originally used here, "http:127.0.0.1//:5000", was malformed and caused the InvalidUrlException above; the well-formed form is shown.