In [None]:
import math
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as rfr
import mlflow
import pickle
import os
import dvc.api
import dagshub

# Point the MLflow client at the tracking server used by every run logged below.
mlflow.set_tracking_uri("http://127.0.0.1:5050")

def get_or_create_experiment(experiment_name):
    """Return the ID of the MLflow experiment called ``experiment_name``.

    The experiment is looked up by name first; if the lookup yields nothing,
    a new experiment with that name is created and its ID is returned instead.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)
    if not existing:
        # No experiment under this name yet: create it and hand back the new ID.
        return mlflow.create_experiment(experiment_name)
    return existing.experiment_id

# Resolve (or create) the MLflow experiment that the runs below are logged under.
experiment_id = get_or_create_experiment("Scooter Rental Users Prediction")

# DVC pointer files (.dvc metadata) for the tracked artifacts.
train_dvc_path = 'data/train_set.csv.dvc'
test_dvc_path = 'data/test_set.csv.dvc'
preprocess_dvc_path = 'data/preprocess.pkl.dvc'

# The pickled preprocessor itself lives next to its pointer file, without the
# ".dvc" suffix (same convention as the CSV paths read below).
preprocess_path = 'data/preprocess.pkl'

# Pull DVC data files
# NOTE(review): confirm that the installed DVC release actually exposes
# `dvc.api.pull`; if it does not, these calls need to go through the DVC CLI
# or repository API instead.
dvc.api.pull(train_dvc_path)
dvc.api.pull(test_dvc_path)
dvc.api.pull(preprocess_dvc_path)

# Load DVC data
train = pd.read_csv('data/train_set.csv')
test = pd.read_csv('data/test_set.csv')

# Load the fitted preprocessor from the pulled artifact, NOT from the .dvc
# pointer file (which is metadata, not a pickle).
# NOTE: pickle.load can execute arbitrary code; only load artifacts that come
# from a trusted DVC remote.
with open(preprocess_path, 'rb') as f:
    pre = pickle.load(f)

# Split features from the target column `log_total_users`.
train_x = train.drop('log_total_users', axis=1)
train_y = train.log_total_users
valid_x = test.drop('log_total_users', axis=1)
valid_y = test.log_total_users

# Apply the already-fitted preprocessor to both splits.
train_x_tf = pre.transform(train_x)
valid_x_tf = pre.transform(valid_x)

# XGBoost's native training API consumes DMatrix objects.
dtrain = xgb.DMatrix(train_x_tf, label=train_y)
dvalid = xgb.DMatrix(valid_x_tf, label=valid_y)

# Silence Optuna's per-trial logging; only errors are reported.
optuna.logging.set_verbosity(optuna.logging.ERROR)

def champion_callback(study, frozen_trial):
    """Print a message whenever the study's best value changes.

    The last reported best value is kept in the study's user attribute
    ``"winner"``. When ``study.best_value`` differs from it, the attribute is
    updated and either an "initial trial" message (no previous winner) or an
    improvement message with the relative change in percent is printed.

    Args:
        study: Object exposing ``user_attrs``, ``best_value`` and
            ``set_user_attr`` (an Optuna study).
        frozen_trial: The trial that just finished; its ``number`` and
            ``value`` are used in the printed message.
    """
    winner = study.user_attrs.get("winner")
    best_value = study.best_value

    # Compare against None explicitly: 0.0 is a legitimate (perfect) score and
    # must not be mistaken for "no value yet".
    if best_value is None or winner == best_value:
        return

    study.set_user_attr("winner", best_value)

    if winner is None:
        print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
        return

    # The improvement is expressed relative to the new best value; a best value
    # of exactly zero would divide by zero, so report it as infinite instead.
    if best_value:
        improvement_percent = (abs(winner - best_value) / best_value) * 100
    else:
        improvement_percent = float("inf")
    print(
        f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
        f"{improvement_percent: .4f}% improvement"
    )

def xg_objective(trial):
    """Optuna objective: train an XGBoost booster and return its validation MSE.

    Each call runs inside a nested MLflow run. Hyperparameters are sampled
    from ``trial``, a booster is trained on the module-level ``dtrain`` and
    evaluated on ``dvalid`` / ``valid_y``; the sampled parameters and the
    ``mse``, ``rmse`` and ``mape`` metrics are logged to MLflow.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean squared error on the validation set (the value Optuna
        minimizes).
    """
    with mlflow.start_run(nested=True):
        # Define hyperparameters
        # Only genuine XGBoost parameters are sampled: `max_features` is a
        # scikit-learn forest option that XGBoost does not use, so tuning it
        # only added a meaningless search dimension.
        params = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }

        # Tree-specific parameters are only sampled for tree-based boosters.
        if params["booster"] in ("gbtree", "dart"):
            params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            params["max_delta_step"] = trial.suggest_int("max_delta_step", 1, 10, log=True)

        # Train XGBoost model
        bst = xgb.train(params, dtrain)
        preds = bst.predict(dvalid)
        error = mean_squared_error(valid_y, preds)
        perror = mean_absolute_percentage_error(valid_y, preds)

        # Log to MLflow
        mlflow.log_params(params)
        mlflow.log_metric("mse", error)
        mlflow.log_metric("rmse", math.sqrt(error))
        mlflow.log_metric("mape", perror)

    return error

def rf_objective(trial):
    """Optuna objective: fit a random forest and return its validation MSE.

    Each call runs inside a nested MLflow run. Hyperparameters are sampled
    from ``trial``, a ``RandomForestRegressor`` is fitted on the module-level
    ``train_x_tf`` / ``train_y`` and evaluated on ``valid_x_tf`` / ``valid_y``;
    the sampled parameters and the ``mse``, ``rmse`` and ``mape`` metrics are
    logged to MLflow.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean squared error on the validation set (the value Optuna
        minimizes).
    """
    with mlflow.start_run(nested=True):
        # Define hyperparameters
        params1 = {
            # `step` is passed by keyword: Optuna's signature declares it
            # keyword-only, and the keyword form works on every release.
            "n_estimators": trial.suggest_int("n_estimators", 50, 250, step=50),
            "max_depth": trial.suggest_int("max_depth", 1, 15),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        }

        # Train RF model
        model1 = rfr(**params1)
        model1.fit(train_x_tf, train_y)
        preds1 = model1.predict(valid_x_tf)
        error1 = mean_squared_error(valid_y, preds1)
        perror1 = mean_absolute_percentage_error(valid_y, preds1)

        # Log to MLflow
        mlflow.log_params(params1)
        mlflow.log_metric("mse", error1)
        mlflow.log_metric("rmse", math.sqrt(error1))
        mlflow.log_metric("mape", perror1)

    return error1

# Log in to Dagshub (replace 'username/repo' with your Dagshub repository)
# NOTE(review): confirm that the installed dagshub client provides `log_in`
# (and `log_out`, used at the end of the script) with this call signature.
dagshub.log_in('username/repo')

# XGBoost Run
# A parent MLflow run wraps the whole study; every trial logs a nested child run.
xg_run_name = "xgboost_attempt"
with mlflow.start_run(experiment_id=experiment_id, run_name=xg_run_name, nested=True):
    study = optuna.create_study(direction="minimize")
    study.optimize(xg_objective, n_trials=500, callbacks=[champion_callback])
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))
    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Scooter Rental Users Project",
            "optimizer_engine": "optuna",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )
    # Log a fit model instance
    # `study.best_params` only holds the sampled values, so the fixed settings
    # used during tuning are restated to refit with the same configuration.
    final_params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        **study.best_params,
    }
    model = xgb.train(final_params, dtrain)
    artifact_path = "model"
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path=artifact_path,
        # The booster was trained on the transformed features, so the input
        # example must be a transformed row rather than a raw `train_x` row.
        input_example=train_x_tf[:1],
        model_format="ubj",
        metadata={"model_data_version": 1},
    )

# RandomForest Run
rf_run_name = "RandomForest_attemp"
with mlflow.start_run(experiment_id=experiment_id, run_name=rf_run_name, nested=True):
    study = optuna.create_study(direction="minimize")
    study.optimize(rf_objective, n_trials=500, callbacks=[champion_callback])
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_mse", study.best_value)
    mlflow.log_metric("best_rmse", math.sqrt(study.best_value))
    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Scooter Rental Users Project",
            "optimizer_engine": "optuna",
            "model_family": "RandomForest",
            "feature_set_version": 1,
        }
    )
    # Train RandomForest model
    model1 = rfr(**study.best_params)
    model1.fit(train_x_tf, train_y)
    # Log RandomForest model
    artifact_path = "model1"
    mlflow.sklearn.log_model(
        sk_model=model1,
        artifact_path=artifact_path,
        # Same reasoning as above: the forest was fitted on `train_x_tf`.
        input_example=train_x_tf[:1],
        registered_model_name="RandomForest_Model",
    )

# Push changes to DVC and Dagshub
# NOTE(review): confirm that `dvc.api.push` exists in the installed DVC
# release; if not, pushing has to go through the DVC CLI or repository API.
dvc.api.push()

# Log out from Dagshub
dagshub.log_out()