In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes

## Data preperation 

In [2]:
diabetes=load_diabetes()
print(diabetes)

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]], shape=(442, 10)), 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59

In [3]:
df=pd.DataFrame(diabetes.data,columns=diabetes.feature_names)
df['bmi']=diabetes.target

In [4]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,151.0,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,75.0,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,141.0,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,206.0,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,135.0,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,178.0,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,104.0,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,132.0,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,220.0,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [5]:
## independent and dependent feautres

X=df.drop(labels=['bmi'],axis=1)
y=df['bmi']

In [6]:
def hyperparameter_tuning(X_train,y_train,param_grid):
    rf=RandomForestRegressor()
    grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=1,scoring="neg_mean_squared_error")
    grid_search.fit(X_train,y_train)
    return grid_search

In [7]:
## train test split 
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

# Infersignatue 
from mlflow.models import infer_signature
signature=infer_signature(X_train,y_train)


##params 
param_grid={
    "n_estimators":[100,200],"min_samples_split":[5,10,None],"min_samples_leaf":[2,5],"max_depth":[5,10,None]
}




In [8]:
#implementing mlflow 

with mlflow.start_run():

    grid_search=hyperparameter_tuning(X_train,y_train,param_grid)

    ##defining our best model
    best_model=grid_search.best_estimator_

    ##Evaluating our best model 
    y_pred=best_model.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)

    mlflow.log_param("best_n_estimators",grid_search.best_params_['n_estimators'])
    mlflow.log_param("bestmin_samples_split",grid_search.best_params_["min_samples_split"])
    mlflow.log_param("best_mean_sample_leaf",grid_search.best_params_["min_samples_leaf"])
    mlflow.log_param("best_max_depth",grid_search.best_params_["max_depth"])
    mlflow.log_metric("mse",mse)


    ##mlflow tracking uri
    from urllib.parse import urlparse

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    tracking_get_uri=urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_get_uri != "file":
        mlflow.sklearn.log_model(best_model,"model",registered_model_name=" best reression model")
    else:
        mlflow.sklearn.log_model(best_model,signature=signature)

        print(f"best hyperparameters:{grid_search.best_params_}")
        print(f"mean_squared_error:{mse}")
    

Fitting 3 folds for each of 36 candidates, totalling 108 fits


36 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "e:\MLFLOW STARTER\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "e:\MLFLOW STARTER\venv\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "e:\MLFLOW STARTER\venv\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "e:\MLFLOW STARTER\venv\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(


üèÉ View run glamorous-toad-590 at: http://127.0.0.1:5000/#/experiments/0/runs/ad8ab1ac54914a43adf321c5effc78f5
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/0
