# Flight Fare Prediction

This notebook follows the implementation workflow from setup to validation and basic tests.

## 1) Set Up Environment and Imports
Import required Python libraries, define notebook-wide settings, and verify package versions.

In [30]:
from sys import path
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from src.config import RAW_DATA_PATH, PROCESSED_DATA_PATH, MODEL_COMPARISON_PATH
from src.data_preprocessing import load_and_preprocess
from src.eda import run_eda
from src.train import load_split_data, train_baseline_models
from src.tune import tune_models
from src.interpret import plot_actual_vs_predicted, plot_residuals

# Notebook-wide display defaults: seaborn grid theme, and no column truncation
# when pandas renders wide frames.
sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", None)

# Record the library versions this run used, one line per package.
for label, module in (("pandas:", pd), ("numpy:", np), ("sklearn:", sklearn)):
    print(label, module.__version__)

pandas: 3.0.0
numpy: 2.4.2
sklearn: 1.8.0


## 2) Define Configuration and Constants
Create a configuration block for paths, runtime parameters, and default constants.

In [31]:
# Notebook-wide settings. The three path constants come from src.config and are
# stored as plain strings; random_state is the seed value kept alongside them.
CONFIG = dict(
    raw_data_path=str(RAW_DATA_PATH),
    processed_data_path=str(PROCESSED_DATA_PATH),
    comparison_path=str(MODEL_COMPARISON_PATH),
    random_state=42,
)

## 3) Implement Core Functions
Write the first version of the core helper functions and the main processing logic, with example calls.

1. Data Preview

In [32]:
def load_preview(path: str):
    """Read a CSV file and show a quick overview of its contents.

    Prints the frame's shape, then displays the column dtypes, the first rows,
    and ``describe()`` summaries for numeric columns and for ``str`` columns.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as read from ``path``.
    """
    frame = pd.read_csv(path)

    print("Shape:", frame.shape)
    print("\nFeatures and types:")
    display(frame.dtypes)
    display(frame.head())

    # One describe() table per dtype family: (heading, ``include`` selector).
    summaries = (
        ("\nSummary statistics of numerical features:", "number"),
        ("\nSummary statistics of categorical features:", "str"),
    )
    for heading, dtype_selector in summaries:
        print(heading)
        display(frame.describe(include=dtype_selector))

    return frame

In [33]:
# Preview the raw dataset at the configured path; load_preview returns the
# frame it read, which is kept here as preview_df.
preview_df = load_preview(CONFIG["raw_data_path"])

Shape: (57000, 17)

Features and types:


Airline                      str
Source                       str
Source Name                  str
Destination                  str
Destination Name             str
Departure Date & Time        str
Arrival Date & Time          str
Duration (hrs)           float64
Stopovers                    str
Aircraft Type                str
Class                        str
Booking Source               str
Base Fare (BDT)          float64
Tax & Surcharge (BDT)    float64
Total Fare (BDT)         float64
Seasonality                  str
Days Before Departure      int64
dtype: object

Unnamed: 0,Airline,Source,Source Name,Destination,Destination Name,Departure Date & Time,Arrival Date & Time,Duration (hrs),Stopovers,Aircraft Type,Class,Booking Source,Base Fare (BDT),Tax & Surcharge (BDT),Total Fare (BDT),Seasonality,Days Before Departure
0,Malaysian Airlines,CXB,Cox's Bazar Airport,CCU,Netaji Subhas Chandra Bose International Airpo...,2025-11-17 06:25:00,2025-11-17 07:38:10,1.219526,Direct,Airbus A320,Economy,Online Website,21131.225021,5169.683753,26300.908775,Regular,10
1,Cathay Pacific,BZL,Barisal Airport,CGP,"Shah Amanat International Airport, Chittagong",2025-03-16 00:17:00,2025-03-16 00:53:31,0.608638,Direct,Airbus A320,First Class,Travel Agency,11605.395471,200.0,11805.395471,Regular,14
2,British Airways,ZYL,"Osmani International Airport, Sylhet",KUL,Kuala Lumpur International Airport,2025-12-13 12:03:00,2025-12-13 14:44:22,2.689651,1 Stop,Boeing 787,Economy,Travel Agency,39882.499349,11982.374902,51864.874251,Winter Holidays,83
3,Singapore Airlines,RJH,"Shah Makhdum Airport, Rajshahi",DAC,"Hazrat Shahjalal International Airport, Dhaka",2025-05-30 03:21:00,2025-05-30 04:02:09,0.686054,Direct,Airbus A320,Economy,Direct Booking,4435.60734,200.0,4635.60734,Regular,56
4,British Airways,SPD,Saidpur Airport,YYZ,Toronto Pearson International Airport,2025-04-25 09:14:00,2025-04-25 23:17:20,14.055609,1 Stop,Airbus A350,Business,Direct Booking,59243.806146,14886.570922,74130.377068,Regular,90



Summary statistics of numerical features:


Unnamed: 0,Duration (hrs),Base Fare (BDT),Tax & Surcharge (BDT),Total Fare (BDT),Days Before Departure
count,57000.0,57000.0,57000.0,57000.0,57000.0
mean,3.994955,58899.556573,11448.238494,71030.316199,45.460579
std,4.094043,68840.614499,12124.344329,81769.199536,26.015657
min,0.5,1600.975688,200.0,1800.975688,1.0
25%,1.003745,8856.316983,200.0,9602.699787,23.0
50%,2.644656,31615.996792,9450.940481,41307.54499,45.0
75%,5.490104,85722.930389,17513.04616,103800.906963,68.0
max,15.831719,449222.93377,73383.440066,558987.332444,90.0



Summary statistics of categorical features:


Unnamed: 0,Airline,Source,Source Name,Destination,Destination Name,Departure Date & Time,Arrival Date & Time,Stopovers,Aircraft Type,Class,Booking Source,Seasonality
count,57000,57000,57000,57000,57000,57000,57000,57000,57000,57000,57000,57000
unique,24,8,8,20,20,54126,56944,3,5,3,3,4
top,US-Bangla Airlines,CGP,"Shah Amanat International Airport, Chittagong",JED,"King Abdulaziz International Airport, Jeddah",2025-11-15 15:33:00,2025-08-06 03:24:03,Direct,Airbus A320,Economy,Direct Booking,Regular
freq,4496,7241,7241,3071,3071,4,2,36642,23970,19112,19111,44525


In [None]:
def run_core_workflow(path: str):
    """Run preprocessing, EDA, baseline training and tuning for one dataset.

    Steps, in order:
      1. ``load_and_preprocess`` on ``path`` with ``save_processed=True``.
      2. ``run_eda`` on the ``df`` attribute of the returned bundle.
      3. ``load_split_data`` on ``path`` to obtain the train/test split and
         the categorical/numerical column lists.
         NOTE(review): this is called with the raw path, not the bundle --
         presumably it loads the data again; confirm against src.train.
      4. ``train_baseline_models`` and ``tune_models`` on that split.

    Parameters
    ----------
    path : str
        Dataset location handed to the preprocessing and split helpers.

    Returns
    -------
    dict
        Keys: ``bundle``, ``eda_outputs``, ``baseline_df``, ``tuned_df``,
        ``X_test``, ``y_test``, ``baseline_models``, ``tuned_models``,
        ``baseline_preds``.
    """
    bundle = load_and_preprocess(path=path, save_processed=True)
    eda_outputs = run_eda(bundle.df)

    X_train, X_test, y_train, y_test, cat_cols, num_cols = load_split_data(path)

    # Both training entry points take the same six positional arguments.
    fit_args = (X_train, X_test, y_train, y_test, cat_cols, num_cols)
    baseline_df, baseline_models, baseline_preds = train_baseline_models(*fit_args)
    tuned_df, tuned_models = tune_models(*fit_args)

    return dict(
        bundle=bundle,
        eda_outputs=eda_outputs,
        baseline_df=baseline_df,
        tuned_df=tuned_df,
        X_test=X_test,
        y_test=y_test,
        baseline_models=baseline_models,
        tuned_models=tuned_models,
        baseline_preds=baseline_preds,
    )

workflow_result = run_core_workflow(CONFIG["raw_data_path"])


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
# Show a compact overview (key -> type of the stored value) rather than echoing
# the whole result dict, whose entries would flood the cell output if printed
# in full.
{key: type(value).__name__ for key, value in workflow_result.items()}

## 4) Add Input Validation and Error Handling
Implement checks for invalid inputs, missing files, and boundary conditions.

In [None]:
from pathlib import Path


def validate_input_path(path: str):
    """Check that ``path`` names an existing ``.csv`` file.

    The extension is checked before the filesystem is touched, so a path with
    the wrong file type is always reported as ``ValueError`` whether or not
    the file exists.

    Parameters
    ----------
    path : str
        Location of the dataset file.

    Returns
    -------
    pathlib.Path
        ``path`` wrapped in a ``Path`` object.

    Raises
    ------
    ValueError
        If the suffix is not ``.csv`` (compared case-insensitively).
    FileNotFoundError
        If the path has a ``.csv`` suffix but is not an existing regular file.
    """
    path_obj = Path(path)
    if path_obj.suffix.lower() != ".csv":
        raise ValueError("Input dataset must be a .csv file")
    # is_file() rather than exists(): a directory cannot be read as a dataset.
    if not path_obj.is_file():
        raise FileNotFoundError(f"Dataset not found at: {path}")
    return path_obj


def validate_min_rows(df: pd.DataFrame, min_rows: int = 20):
    """Ensure ``df`` has at least ``min_rows`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row count (``df.shape[0]``) is checked.
    min_rows : int, default 20
        Smallest acceptable number of rows.

    Raises
    ------
    ValueError
        If the frame has fewer than ``min_rows`` rows.
    """
    too_few = df.shape[0] < min_rows
    if not too_few:
        return None
    raise ValueError(f"Dataset has too few rows ({df.shape[0]}). Minimum required: {min_rows}")


# Example validation usage
# validated_path = validate_input_path(CONFIG["raw_data_path"])
# raw_df = pd.read_csv(validated_path)
# validate_min_rows(raw_df)


## 5) Run the Main Workflow
Assemble the functions into an executable workflow and run it on the configured dataset.

In [None]:
# Execute only after placing dataset in data/raw/
try:
    # Validate the input first so a missing or undersized dataset is reported
    # before the workflow is started.
    validated = validate_input_path(CONFIG["raw_data_path"])
    raw_df = pd.read_csv(validated)
    validate_min_rows(raw_df)

    workflow_result = run_core_workflow(str(validated))
    display(workflow_result["baseline_df"])
    display(workflow_result["tuned_df"])

    # Lowest value in the "rmse" column first.
    best_baseline = workflow_result["baseline_df"].sort_values("rmse").iloc[0]
    print("Best baseline model:", best_baseline["model"])
except Exception as exc:
    # Deliberately best-effort: the notebook keeps running when the dataset is
    # absent or a step fails. The exception class is included in the report
    # because str(exc) alone can be cryptic (a KeyError prints only the key).
    print("Workflow run skipped/failed:", f"{type(exc).__name__}: {exc}")


## 6) Add Basic Unit Tests
Create lightweight tests for success and failure cases and run them in-notebook.

In [None]:
def _assert_raises(func, expected_exception, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and require that it raises ``expected_exception``.

    Returns
    -------
    bool
        ``True`` when the expected exception was raised.

    Raises
    ------
    AssertionError
        If ``func`` raises a different ``Exception`` (chained as the cause),
        or if it returns without raising at all.
    """
    try:
        func(*args, **kwargs)
    except expected_exception:
        # The failure we were looking for; fall through to the success return.
        pass
    except Exception as wrong_exc:
        raise AssertionError(f"Expected {expected_exception}, got {type(wrong_exc)}") from wrong_exc
    else:
        raise AssertionError(f"Expected {expected_exception} but no exception was raised")
    return True


def run_basic_tests():
    """Run lightweight success and failure checks for the validation helpers.

    Covers ``validate_input_path`` (wrong extension, missing file, valid file)
    and ``validate_min_rows`` (enough rows, too few rows). Prints a
    confirmation line when every check passes.

    Raises
    ------
    AssertionError
        If any check does not behave as expected.
    """
    import tempfile  # local import: only this test helper needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir)

        # Failure case: bad extension. The file really exists, so the check
        # exercises the extension rule and cannot be masked by the
        # missing-file rule.
        wrong_extension = tmp_path / "not_a_csv.txt"
        wrong_extension.write_text("x\n1\n")
        assert _assert_raises(validate_input_path, ValueError, str(wrong_extension))

        # Failure case: valid extension but no such file
        missing_csv = tmp_path / "missing.csv"
        assert _assert_raises(validate_input_path, FileNotFoundError, str(missing_csv))

        # Success case: existing .csv file is returned as a Path
        sample_csv = tmp_path / "sample.csv"
        sample_csv.write_text("x\n1\n")
        assert validate_input_path(str(sample_csv)) == sample_csv

    # Success case: dataframe minimum rows check
    test_df = pd.DataFrame({"x": list(range(25))})
    validate_min_rows(test_df, min_rows=20)

    # Failure case: not enough rows
    small_df = pd.DataFrame({"x": [1, 2]})
    assert _assert_raises(validate_min_rows, ValueError, small_df, 20)

    print("Basic notebook tests passed")


run_basic_tests()
