In [1]:
# Imports

import os 
import pickle 
import numpy as np 
import pandas as pd 
from datetime import datetime 
from pathlib import Path 

from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor , GradientBoostingRegressor , HistGradientBoostingRegressor
from sklearn.linear_model import Ridge 
# from sklearn.pipeline import Pipeline 
from sklearn.metrics import mean_squared_error , mean_absolute_error,r2_score

import mlflow 
import mlflow.sklearn 




In [2]:
# XGBoost is an optional dependency: record whether it could be imported so
# later cells can decide whether to include an XGBoost model.
try:
    from xgboost import XGBRegressor
except Exception as import_error:
    # Any import-time failure disables XGBoost; show the reason to the reader.
    HAS_XGB = False
    print(import_error)
else:
    HAS_XGB = True

No module named 'xgboost'


In [3]:
# Paths and run configuration
#
# Everything later cells read as a global is defined here: the input data
# file, the local output directory, the MLflow experiment name and the seed.

data_path = Path('data/flights_cleaned.npz')

artifacts_dir = Path('artifacts')
# Later cells write pickles and CSV files directly into this directory, so
# create it up front; otherwise those writes fail on a fresh checkout.
artifacts_dir.mkdir(parents=True, exist_ok=True)

mlflow_experiment_name = 'flight_price_baseline'
random_state = 42

# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(mlflow_experiment_name)



2025/12/12 23:14:16 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/12 23:14:16 INFO mlflow.store.db.utils: Updating database tables
2025/12/12 23:14:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/12 23:14:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/12 23:14:16 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/12 23:14:16 INFO alembic.runtime.migration: Will assume non-transactional DDL.


<Experiment: artifact_location='file:///d:/Deep Learning Projects/travel-mlops-capstone/mlruns/1', creation_time=1765450009537, experiment_id='1', last_update_time=1765450009537, lifecycle_stage='active', name='flight_price_baseline', tags={}>

In [4]:
# Load data
#
# Reads the feature matrix "X" and target vector "y" from the .npz archive at
# `data_path` and prints their shapes.

# Raise explicitly instead of using `assert`, which is skipped when Python
# runs with optimizations enabled (-O).
if not data_path.exists():
    raise FileNotFoundError(f"{data_path} not found")

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the file is not trusted. Keep it only if the arrays
# really are object-dtype; otherwise switch it off.
# The context manager closes the underlying archive handle once the arrays
# have been read into memory.
with np.load(data_path , allow_pickle = True) as npz:
    if "X" in npz.files and 'y' in npz.files:
        X = npz["X"]
        y = npz['y']
    else:
        raise ValueError("Could not find X and y arrays inside flights_cleaned.npz")

print("X shape :", X.shape )
print("Y shape :",y.shape)




X shape : (271888, 28)
Y shape : (271888,)


In [5]:
# Hold out 10% of the rows as a test set; the split is seeded with
# `random_state` so it is reproducible across runs.

xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=random_state,
)

# Report the (features, target) shapes of each partition.
print('Train shapes:', *(part.shape for part in (xtrain, ytrain)))
print('Test shapes :', *(part.shape for part in (xtest, ytest)))

Train shapes: (244699, 28) (244699,)
Test shapes : (27189, 28) (27189,)


In [6]:
import json 
from sklearn.base import clone

In [None]:
def evaluate_and_log(model_name , model , xtrain , ytrain , xtest , ytest , param_search = None , n_iter = 20):
    """
    Train a model (optionally tuned with RandomizedSearchCV), evaluate it on the
    held-out set and log parameters, metrics and artifacts to MLflow.

    Parameters
    ----------
    model_name : str
        Label used in the MLflow run name, the ``model_type`` tag and the names
        of the files written under ``artifacts_dir``.
    model : estimator
        Unfitted estimator. It is cloned before fitting, so the object passed
        in is not modified.
    xtrain, ytrain
        Training features and targets passed to ``fit``.
    xtest, ytest
        Held-out features and targets used to compute the reported metrics.
    param_search : dict or None
        Parameter distributions for ``RandomizedSearchCV``. When falsy, the
        model is fitted once with its current parameters.
    n_iter : int
        Requested number of sampled parameter settings; capped at 40.

    Returns
    -------
    tuple
        ``(best_estimator, metrics)`` where ``metrics`` is a dict with the keys
        ``'rmse'``, ``'mae'`` and ``'r2'``.

    Notes
    -----
    Reads the notebook-level globals ``artifacts_dir`` and ``random_state``.
    Each call opens (and closes) its own MLflow run.
    """
    # The pickle and CSV copies below are written into this directory; create
    # it first so the function also works on a fresh checkout.
    artifacts_dir.mkdir(parents=True, exist_ok=True)

    run_name = f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
    with mlflow.start_run(run_name = run_name):
        mlflow.set_tag('model_type' , model_name)

        if param_search:
            # Randomized hyper-parameter search: 3-fold CV scored by negative MSE.
            search = RandomizedSearchCV(
                estimator = clone(model),
                param_distributions=param_search,
                n_iter = min(n_iter , 40),
                cv = 3,
                scoring = 'neg_mean_squared_error',
                random_state = random_state,
                n_jobs = -1,
                verbose = 0
            )

            search.fit(xtrain , ytrain)
            best = search.best_estimator_
            best_params = search.best_params_
            mlflow.log_params({f"best_{k}": v for k , v in best_params.items()})

            # Logging the per-candidate CV summary is best-effort: a failure
            # here is reported but must not discard the fitted model.
            try:
                cvres = pd.DataFrame(search.cv_results_)
                cv_summary = cvres[['params','mean_test_score','std_test_score','rank_test_score']].to_dict(orient = 'records')

                # default=str makes non-JSON-native parameter values serializable.
                mlflow.log_text(json.dumps(cv_summary , default = str),'cv_summary.json')

            except Exception as e:
                print(e)

        else:
            best = clone(model)
            best.fit(xtrain , ytrain)
            mlflow.log_params({'note':'no-hyperparam-search'})

        # Predict on the held-out set and compute metrics.
        preds = best.predict(xtest)
        # Take the square root of the MSE explicitly: the `squared=False`
        # argument was deprecated in scikit-learn 1.4 and later removed, so
        # this form works across versions.
        rmse = np.sqrt(mean_squared_error(ytest , preds))
        mae = mean_absolute_error(ytest , preds)
        r2 = r2_score(ytest , preds)

        mlflow.log_metric('rmse', float(rmse))
        mlflow.log_metric('mae', float(mae))
        mlflow.log_metric('r2', float(r2))

        # Log the model via mlflow.sklearn.
        mlflow.sklearn.log_model(best , artifact_path = 'model' )

        # Save a plain pickle locally and attach it to the run as well.
        model_file = artifacts_dir/f'{model_name}_best.pkl'
        with open(model_file , 'wb') as f:
            pickle.dump(best , f)

        mlflow.log_artifact(str(model_file) , artifact_path='model_files')

        # If the model exposes feature_importances_, save and log them.
        # Best-effort: a failure is reported rather than silently ignored.
        try:
            importances = getattr(best , 'feature_importances_',None)
            if importances is not None:
                imp_df = pd.DataFrame({'feature_idx':list(range(len(importances))),
                                       'importance' : importances
                                       })

                imp_csv = artifacts_dir/f'{model_name}_feature_importances.csv'
                imp_df.to_csv(imp_csv , index = False)

                # Single file -> log_artifact (log_artifacts expects a directory).
                mlflow.log_artifact(str(imp_csv) , artifact_path = 'feature_importances')

        except Exception as e:
            print(f"Could not log feature importances for {model_name}: {e}")

        print(f"{model_name} done - RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

        return best , {'rmse':rmse , 'mae':mae  , 'r2':r2}
    
    

In [8]:
# Candidate models and their hyper-parameter search spaces.
#
# `models_and_search` is a list of (name, estimator, param_distributions)
# tuples; the training loop processes them in list order.

# 1) Random Forest
rf = RandomForestRegressor(random_state=random_state)
rf_search = dict(
    n_estimators=[100, 200, 400],
    max_depth=[6, 10, 20, None],
    min_samples_leaf=[1, 2, 4],
)

# 2) Gradient Boosting (sklearn)
gb = GradientBoostingRegressor(random_state=random_state)
gb_search = dict(
    n_estimators=[100, 200, 400],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 8],
)

# 3) Histogram-based Gradient Boosting (sklearn)
hgb = HistGradientBoostingRegressor(random_state=random_state)
hgb_search = dict(
    max_iter=[100, 200, 400],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 6, 12],
)

models_and_search = [
    ('random_forest', rf, rf_search),
    ('grad_boost', gb, gb_search),
    ('hist_gb', hgb, hgb_search),
]

# 4) XGBoost, only when the optional import succeeded
if not HAS_XGB:
    print('XGBoost not found — skipping xgboost. To enable, install xgboost in the kernel')
else:
    xgb = XGBRegressor(objective='reg:squarederror', random_state=random_state, n_jobs=-1)
    xgb_search = dict(
        n_estimators=[100, 200, 400],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 6, 10],
    )
    models_and_search.append(('xgboost', xgb, xgb_search))

# 5) Ridge regression as a simple linear baseline
ridge = Ridge(random_state=random_state)
ridge_search = dict(alpha=[0.1, 1.0, 10.0, 100.0])
models_and_search.append(('ridge', ridge, ridge_search))

XGBoost not found — skipping xgboost. To enable, install xgboost in the kernel


In [9]:
# Running training loop
#
# Tunes, evaluates and logs every (name, estimator, search space) entry in
# `models_and_search`, then writes a comparison table sorted by RMSE and
# attaches it to a dedicated MLflow run.

results = {}

for name , model , search_space in models_and_search:
    print('\nStarting:',name)
    best_est , metrics = evaluate_and_log(
        model_name = name ,
        model = model ,
        xtrain=xtrain ,
        ytrain = ytrain,
        xtest = xtest ,
        ytest = ytest,
        param_search = search_space,
        n_iter = 20
    )
    results[name] = {'estimator':best_est , 'metrics':metrics}

# Save summary of results.
# Explicit columns keep sort_values('rmse') valid even when no model was run.
summary_df = pd.DataFrame(
    [{
        'model':k,
        'rmse':v['metrics']['rmse'],
        'mae':v['metrics']['mae'],
        'r2':v['metrics']['r2']
    } for k , v in results.items()],
    columns = ['model' , 'rmse' , 'mae' , 'r2']
)

summary_df = summary_df.sort_values('rmse')

# Make sure the output directory exists before writing the CSV.
artifacts_dir.mkdir(parents=True, exist_ok=True)
comparison_csv = artifacts_dir/'model_comparison.csv'
summary_df.to_csv(comparison_csv , index = False)

# evaluate_and_log closes its own run, so no run is active here. Open an
# explicit run for the comparison file so it is attached to a named run that
# is closed afterwards. log_artifact is the single-file API (log_artifacts
# expects a directory).
with mlflow.start_run(run_name = f"model_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    mlflow.log_artifact(str(comparison_csv))

print('\nAll models trained. Summary:')
print(summary_df)




Starting: random_forest




random_forest done - RMSE: 0.5282, MAE: 0.0409, R2: 1.0000

Starting: grad_boost




grad_boost done - RMSE: 0.2121, MAE: 0.1468, R2: 1.0000

Starting: hist_gb




hist_gb done - RMSE: 3.6278, MAE: 2.6443, R2: 0.9999

Starting: ridge




ridge done - RMSE: 103.0210, MAE: 80.7872, R2: 0.9189

All models trained. Summary:
           model        rmse        mae        r2
1     grad_boost    0.212119   0.146758  1.000000
0  random_forest    0.528176   0.040939  0.999998
2        hist_gb    3.627784   2.644270  0.999899
3          ridge  103.021000  80.787184  0.918940
