In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn

# Notebook configuration: where MLflow runs are recorded and where the data lives.
TRACKING_URI = 'http://localhost:5000'
EXPERIMENT_NAME = 'MLflow BikeSharing'
DATA_PATH = 'bike+sharing+dataset/hour.csv'

# Point the MLflow client at the tracking server, then select the experiment
# that every subsequent run in this notebook is filed under.
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Read hour.csv into a DataFrame; later cells derive features and target from it.
hourData = pd.read_csv(DATA_PATH)




In [2]:
def create_pipeline(estimator, numerical_features=None, categorical_features=None,
                    remainder='drop'):
    """Build a preprocessing + regression pipeline around ``estimator``.

    Numerical columns are standardised with ``StandardScaler`` and categorical
    columns are one-hot encoded; categories not seen during ``fit`` are ignored
    at transform time instead of raising (``handle_unknown='ignore'``). The
    transformed columns feed the final ``'regressor'`` step, so the estimator's
    hyper-parameters are addressed as ``regressor__<param>`` in a search grid.

    Parameters
    ----------
    estimator : estimator object
        Unfitted regressor placed in the final ``'regressor'`` step.
    numerical_features : list of str, optional
        Columns to scale. Defaults to ``['temp', 'atemp', 'hum', 'windspeed']``.
    categorical_features : list of str, optional
        Columns to one-hot encode. Defaults to
        ``['season', 'mnth', 'hr', 'weekday', 'weathersit']``.
    remainder : {'drop', 'passthrough'} or estimator, default='drop'
        Forwarded to ``ColumnTransformer``; decides what happens to input
        columns that appear in neither feature list.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'regressor'``.

    Notes
    -----
    With the default ``remainder='drop'`` every column that is not listed in
    one of the two feature lists is discarded before the regressor sees it.
    NOTE(review): if the frame passed to ``fit`` carries other informative
    columns, add them to a feature list or pass ``remainder='passthrough'``
    -- confirm against the columns actually present in the training data.
    """
    if numerical_features is None:
        numerical_features = ['temp', 'atemp', 'hum', 'windspeed']
    if categorical_features is None:
        categorical_features = ['season', 'mnth', 'hr', 'weekday', 'weathersit']

    # Each transformer stays wrapped in its own single-step Pipeline so nested
    # parameter names (e.g. ``preprocessor__num__scaler__with_mean``) are stable.
    numeric_transformer = Pipeline([
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        [
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        remainder=remainder,
    )

    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', estimator)
    ])


In [3]:
# The target is the `cnt` column; the feature frame is everything except the
# columns listed here (`cnt` itself plus four others that must not be inputs).
excluded_columns = ['instant', 'dteday', 'casual', 'registered', 'cnt']
y = hourData['cnt']
X = hourData.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [4]:
def evaluate_model(estimator, name, params=None):
    """Train ``estimator`` inside the shared pipeline, score it, and log to MLflow.

    The estimator is wrapped by ``create_pipeline``. When ``params`` is given,
    a 3-fold ``GridSearchCV`` (scored by negative RMSE) selects the
    hyper-parameters and refits the winning pipeline on the full training set.
    Everything is recorded in a single MLflow run named ``name``.

    The function reads ``X_train``, ``X_test``, ``y_train`` and ``y_test`` from
    the notebook's global namespace, so the split cell must run first.

    Parameters
    ----------
    estimator : estimator object
        Unfitted regressor to evaluate.
    name : str
        MLflow run name; also logged as the ``model`` parameter.
    params : dict, optional
        Parameter grid for ``GridSearchCV`` (keys such as
        ``regressor__max_depth``). If omitted or empty, no search is run.

    Returns
    -------
    dict
        Hold-out metrics with the keys ``'rmse'``, ``'mae'`` and ``'r2'``.
    """
    model = create_pipeline(estimator)
    searched = bool(params)
    if searched:
        model = GridSearchCV(
            model,
            param_grid=params,
            cv=3,
            scoring='neg_root_mean_squared_error',
        )

    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
        mae = float(mean_absolute_error(y_test, preds))
        r2 = float(r2_score(y_test, preds))
        metrics = {"rmse": rmse, "mae": mae, "r2": r2}

        mlflow.log_param("model", name)
        if searched:
            mlflow.log_params(model.best_params_)
            # best_score_ is the mean *negative* RMSE across folds; flip the
            # sign so the logged value is directly comparable to "rmse".
            mlflow.log_metric("cv_rmse", float(-model.best_score_))
        mlflow.log_metrics(metrics)

        # Log the fitted pipeline itself rather than the search wrapper, so
        # every run stores the same artifact type. GridSearchCV.predict
        # delegates to best_estimator_, so predictions are unchanged.
        fitted_pipeline = model.best_estimator_ if searched else model
        mlflow.sklearn.log_model(fitted_pipeline, "model")

        print(f"{name} -- RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")

    return metrics


In [None]:
# Search grids. The `regressor__` prefix routes each entry to the pipeline's
# final step, which holds the estimator being tuned.
rf_params = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[10, 20],
)
xgb_params = dict(
    regressor__n_estimators=[100, 200],
    regressor__learning_rate=[0.05, 0.1],
    regressor__max_depth=[3, 5],
)

# (estimator, MLflow run name, grid) triples, evaluated in this order.
# Linear Regression has no grid, so it is fitted once without a search.
experiments = [
    (LinearRegression(), "LinearRegression", None),
    (RandomForestRegressor(random_state=42), "RandomForest", rf_params),
    (XGBRegressor(objective='reg:squarederror', random_state=42), "XGBoost", xgb_params),
]

for candidate, run_name, grid in experiments:
    evaluate_model(candidate, run_name, grid)
