In [60]:
import numpy as np
import pandas as pd
import mlflow
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,mean_squared_error,root_mean_squared_error
from sklearn.model_selection import train_test_split,GridSearchCV
from mlflow.models import infer_signature
from urllib.parse import urlunparse
import mlflow.sklearn
from xgboost import XGBRegressor
# Load the California housing dataset; later cells read its `.data`,
# `.feature_names` and `.target` attributes to build the working frame.
housing = fetch_california_housing()


In [61]:
# Assemble the working frame in one expression: feature columns first,
# then the regression target appended as the last column, 'Price'.
data = pd.DataFrame(
    housing.data,
    columns=housing.feature_names,
).assign(Price=housing.target)

In [62]:
# Display the assembled frame (feature columns plus the 'Price' target).
data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [63]:
# Separate the regression target from the feature matrix.
y = data.loc[:, 'Price']
X = data.drop(columns='Price')

In [64]:
## Split the data into train and test sets (80% / 20%).
# random_state is pinned so the split, and therefore the metrics logged to
# MLflow, are reproducible after a kernel restart; 42 matches the seed used
# for the split in the XGBoost section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model signature inferred from the training features and training target.
signature = infer_signature(X_train, y_train)

# Hyperparameter grid searched for the random forest
# (3 x 4 = 12 candidate combinations).
param_grid = {
    'n_estimators': [10, 20, 50],
    'max_depth': [5, 10, 15, None],
}

In [65]:
def hyperparamater_tuning(X,y,param_grid):
    """Run a grid search over random-forest hyperparameters.

    Parameters
    ----------
    X : features passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    param_grid : mapping of ``RandomForestRegressor`` parameter names to the
        candidate values to try.

    Returns
    -------
    The fitted ``GridSearchCV`` object (3-fold CV, ``n_jobs=-1``,
    ``verbose=2``) wrapping a ``RandomForestRegressor(random_state=42)``.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    # GridSearchCV.fit returns the fitted search object itself.
    return search.fit(X, y)

In [66]:
# Point MLflow at the tracking server *before* selecting the experiment:
# set_experiment() looks up (or creates) the experiment in whichever tracking
# store is active when it is called, so the URI has to be configured first.
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
mlflow.set_experiment('housing_rf')

with mlflow.start_run():
    # Perform hyperparameter tuning on the training split
    grid_search = hyperparamater_tuning(X_train, y_train, param_grid)

    # Get the best model found by the search
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the held-out test split
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    # Log the best parameters
    mlflow.log_param('best_n_estimator', best_model.get_params()['n_estimators'])
    mlflow.log_param('best_max_depth', best_model.get_params()['max_depth'])

    # Log metrics
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)

    # Log the fitted model together with its input/output signature
    mlflow.sklearn.log_model(sk_model=best_model, artifact_path='model', signature=signature)

    # Print the scores
    print(f'Best Hyperparameter: {grid_search.best_params_}')
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')

2025/04/19 12:33:22 INFO mlflow.tracking.fluent: Experiment with name 'housing_rf' does not exist. Creating a new experiment.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Hyperparameter: {'max_depth': None, 'n_estimators': 50}
Mean Squared Error: 0.2526256170737198
Root Mean Squared Error: 0.5026187591741078


In [67]:
# Candidate hyperparameters for the XGBoost search. max_depth uses 100 in
# place of None (effectively unlimited depth).
xgb_param_grid = dict(
    n_estimators=[10, 20, 50],
    max_depth=[5, 10, 15, 100],
)

# Rebuild the target and feature matrix from the full frame.
y = data['Price']
X = data.drop(columns=['Price'])

# Seeded 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Model input/output signature inferred from the training split.
signature = infer_signature(X_train, y_train)


In [68]:


def xgb_hyperparameter_tuning(X, y, param_grid):
    """Run a grid search over XGBoost regressor hyperparameters.

    Parameters
    ----------
    X : features passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.
    param_grid : mapping of ``XGBRegressor`` parameter names to the candidate
        values to try.

    Returns
    -------
    The fitted ``GridSearchCV`` object (3-fold CV, ``n_jobs=-1``,
    ``verbose=2``) wrapping an
    ``XGBRegressor(random_state=42, objective='reg:squarederror')``.
    """
    base_model = XGBRegressor(objective='reg:squarederror', random_state=42)
    search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=2,
    )
    # GridSearchCV.fit returns the fitted search object itself.
    return search.fit(X, y)

In [69]:
# MLflow setup: tracking server first, then the experiment to log into.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("housing_xgb")

with mlflow.start_run(run_name="xgboost_gridsearch"):
    # Tune on the training split and keep the best estimator found.
    grid_search = xgb_hyperparameter_tuning(X_train, y_train, xgb_param_grid)
    best_model = grid_search.best_estimator_

    # Score the best estimator on the held-out test split.
    y_pred = best_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # square root of the MSE computed just above

    # Record the winning hyperparameters and the test metrics.
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted model with its input/output signature.
    mlflow.xgboost.log_model(best_model, artifact_path="xgb_model", signature=signature)

    # Echo the results in the cell output.
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"MSE: {mse}")
    print(f"RMSE: {rmse}")

2025/04/19 12:33:50 INFO mlflow.tracking.fluent: Experiment with name 'housing_xgb' does not exist. Creating a new experiment.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Hyperparameters: {'max_depth': 5, 'n_estimators': 50}
MSE: 0.23214953137832134
RMSE: 0.4818189819614015
