In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import optuna
import mlflow
import mlflow.sklearn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Flight_Price_Prediction")

2024/08/21 17:42:42 INFO mlflow.tracking.fluent: Experiment with name 'Flight_Price_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1724262162805, experiment_id='1', last_update_time=1724262162805, lifecycle_stage='active', name='Flight_Price_Prediction', tags={}>

In [3]:


# Set up MLflow tracking
mlflow.set_experiment("Flight Price Prediction - Random Forest Optuna Optimization")

# Load and preprocess the data
df = pd.read_csv("clean_dataset.csv")
df = df.drop('Unnamed: 0', axis=1)
df = df.drop('flight', axis=1)

# Convert categorical columns
df['class'] = df['class'].apply(lambda x: 1 if x == 'Business' else 0)
df['class'].value_counts()
df.stops = pd.factorize(df.stops)[0]
df = df.join(pd.get_dummies(df.airline, prefix='airline')).drop('airline', axis=1)
df = df.join(pd.get_dummies(df.source_city, prefix='source')).drop('source_city', axis=1)
df = df.join(pd.get_dummies(df.destination_city, prefix='dest')).drop('destination_city', axis=1)
df = df.join(pd.get_dummies(df.arrival_time, prefix='arrival')).drop('arrival_time', axis=1)
df = df.join(pd.get_dummies(df.departure_time, prefix='departure')).drop('departure_time', axis=1)

# Prepare the data for training
X, y = df.drop('price', axis=1), df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the Optuna objective function
def objective(trial):
    with mlflow.start_run():
        # Hyperparameter search space
        n_estimators = trial.suggest_int('n_estimators', 100, 500)
        max_depth = trial.suggest_int('max_depth', 10, 50)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

        # Create RandomForest model with trial parameters
        reg = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            random_state=42,
            n_jobs=-1
        )

        # Train the model
        reg.fit(X_train, y_train)

        # Predict and calculate the mean squared error
        y_pred = reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        error = np.sqrt(mse)

        # Log the parameters and metrics in MLflow
        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features
        })
        mlflow.log_metric("rmse", error)
        
        # Autolog the model (automatically tracks model artifacts, hyperparameters, etc.)
        mlflow.sklearn.log_model(reg, "random_forest_model")

        return error

# Create the Optuna study and run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)  # Try 10 trials to optimize

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)
print("Best MSE: ", study.best_value)

# Train the final model using the best hyperparameters
best_params = study.best_params
best_model = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42,
    n_jobs=-1
)
best_model.fit(X_train, y_train)

# Log the final model in MLflow
mlflow.sklearn.log_model(best_model, "best_random_forest_model")


2024/08/21 17:42:50 INFO mlflow.tracking.fluent: Experiment with name 'Flight Price Prediction - Random Forest Optuna Optimization' does not exist. Creating a new experiment.
[I 2024-08-21 17:42:51,297] A new study created in memory with name: no-name-3317ccec-aed9-4d1a-ac4f-25abebb2094f
2024/08/21 17:43:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run upbeat-sow-512 at: http://127.0.0.1:5000/#/experiments/2/runs/2bf8fbd567e14e05b0bea9d810a51cbb.
2024/08/21 17:43:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.
[I 2024-08-21 17:43:32,720] Trial 0 finished with value: 3656.7627447861805 and parameters: {'n_estimators': 194, 'max_depth': 14, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': None}. Best is trial 0 with value: 3656.7627447861805.
2024/08/21 17:45:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run nebulous-dolphin-371 at: http://127.0.0.1:5000/#/experiments/2/runs/7ba157d218794

Best hyperparameters:  {'n_estimators': 423, 'max_depth': 34, 'min_samples_split': 12, 'min_samples_leaf': 1, 'max_features': None}
Best MSE:  2667.7532784988825




<mlflow.models.model.ModelInfo at 0x76f8be52f710>

In [5]:
import mlflow

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression , Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [8]:
def objective(params):
    """
    Trains an XGBoost model with specified parameters and logs metrics using MLflow.

    :param params: Dictionary of hyperparameters for the XGBoost model.
    :return: Dictionary containing the loss ('rmse') and the status of the optimization.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        mlflow.xgboost.autolog()
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [10]:
mlflow.end_run()

2024/08/21 18:00:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run funny-grouse-73 at: http://127.0.0.1:5000/#/experiments/2/runs/dab5cc6376cf45a5bd1b63562c4a909c.
2024/08/21 18:00:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2.
