# Flight Fare Prediction - Implementation Notebook

This notebook follows the implementation workflow from setup to validation and basic tests.

## 1) Set Up Environment and Imports
Import required Python libraries, define notebook-wide settings, and verify package versions.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from src.config import RAW_DATA_PATH, PROCESSED_DATA_PATH, MODEL_COMPARISON_PATH
from src.data_preprocessing import load_and_preprocess
from src.eda import run_eda
from src.train import load_split_data, train_baseline_models
from src.tune import tune_models
from src.interpret import plot_actual_vs_predicted, plot_residuals

# Notebook-wide display settings: show every DataFrame column and use the
# seaborn "whitegrid" theme for plots.
pd.options.display.max_columns = None
sns.set_theme(style="whitegrid")

# Report the library versions in use so they are recorded with the cell output.
print(
    f"pandas: {pd.__version__}",
    f"numpy: {np.__version__}",
    f"sklearn: {sklearn.__version__}",
    sep="\n",
)

## 2) Define Configuration and Constants
Create a configuration block for paths, runtime parameters, and default constants.

In [None]:
# Central settings for the notebook. The path constants imported from
# src.config are stored as plain strings.
CONFIG = dict(
    raw_data_path=str(RAW_DATA_PATH),
    processed_data_path=str(PROCESSED_DATA_PATH),
    comparison_path=str(MODEL_COMPARISON_PATH),
    random_state=42,
)

# Bare expression so the notebook renders the resulting dictionary.
CONFIG

## 3) Implement Core Functions
Write the first version of the core helper functions and the main processing logic, with example calls.

In [None]:
def load_preview(path: str):
    """Read a CSV file and show a quick overview of its contents.

    Prints the frame's shape, then displays its first rows followed by the
    result of ``describe(include="all")``.

    Args:
        path: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The DataFrame loaded from ``path``.
    """
    frame = pd.read_csv(path)
    print("Shape:", frame.shape)

    first_rows = frame.head()
    display(first_rows)

    # Computed only after the head has been shown, so the head is visible even
    # if the summary step fails.
    summary = frame.describe(include="all")
    display(summary)

    return frame


def run_core_workflow(path: str):
    """Run preprocessing, EDA, baseline training and tuning for one dataset path.

    The steps run in this order: ``load_and_preprocess`` (called with
    ``save_processed=True``), ``run_eda`` on the bundle's ``df`` attribute,
    ``load_split_data``, ``train_baseline_models`` and ``tune_models``.
    Both modelling steps receive the same split and column lists.

    Args:
        path: Dataset path handed to ``load_and_preprocess`` and
            ``load_split_data``.

    Returns:
        A dict with the keys ``bundle``, ``eda_outputs``, ``baseline_df``,
        ``tuned_df``, ``X_test``, ``y_test``, ``baseline_models``,
        ``tuned_models`` and ``baseline_preds``.
    """
    prepared = load_and_preprocess(path=path, save_processed=True)
    eda_results = run_eda(prepared.df)

    X_train, X_test, y_train, y_test, cat_cols, num_cols = load_split_data(path)
    # Both modelling steps take exactly the same positional inputs.
    model_inputs = (X_train, X_test, y_train, y_test, cat_cols, num_cols)

    baseline_table, baseline_fitted, baseline_predictions = train_baseline_models(
        *model_inputs
    )
    tuned_table, tuned_fitted = tune_models(*model_inputs)

    return dict(
        bundle=prepared,
        eda_outputs=eda_results,
        baseline_df=baseline_table,
        tuned_df=tuned_table,
        X_test=X_test,
        y_test=y_test,
        baseline_models=baseline_fitted,
        tuned_models=tuned_fitted,
        baseline_preds=baseline_predictions,
    )

# Example call (uncomment after dataset is available)
# preview_df = load_preview(CONFIG["raw_data_path"])
# workflow_result = run_core_workflow(CONFIG["raw_data_path"])


## 4) Add Input Validation and Error Handling
Implement checks for invalid inputs, missing files, and boundary conditions.

In [None]:
from pathlib import Path


def validate_input_path(path: str):
    """Check that ``path`` names an existing ``.csv`` file.

    The extension is checked before the filesystem, so a path with the wrong
    file type is reported as a ``ValueError`` whether or not it exists.

    Args:
        path: Location of the dataset.

    Returns:
        A ``pathlib.Path`` built from ``path``.

    Raises:
        ValueError: If the path does not end in ``.csv`` (case-insensitive).
        FileNotFoundError: If the path ends in ``.csv`` but does not exist.
    """
    path_obj = Path(path)
    # Cheap, filesystem-independent check first: a wrong extension is a caller
    # mistake no matter what is on disk.
    if path_obj.suffix.lower() != ".csv":
        raise ValueError("Input dataset must be a .csv file")
    if not path_obj.exists():
        raise FileNotFoundError(f"Dataset not found at: {path}")
    return path_obj


def validate_min_rows(df: pd.DataFrame, min_rows: int = 20):
    """Ensure ``df`` has at least ``min_rows`` rows.

    Args:
        df: Frame whose row count (``df.shape[0]``) is checked.
        min_rows: Smallest acceptable number of rows.

    Raises:
        ValueError: If the frame has fewer than ``min_rows`` rows.
    """
    row_count = df.shape[0]
    too_small = row_count < min_rows
    if too_small:
        raise ValueError(
            f"Dataset has too few rows ({row_count}). Minimum required: {min_rows}"
        )


# Example validation usage
# validated_path = validate_input_path(CONFIG["raw_data_path"])
# raw_df = pd.read_csv(validated_path)
# validate_min_rows(raw_df)


## 5) Run the Main Workflow
Assemble functions into an executable workflow and run on sample input.

In [None]:
# Execute only after placing dataset in data/raw/
# The whole run is wrapped so that a missing dataset, or any failure inside the
# pipeline, is reported as a message instead of stopping the notebook.
try:
    validated = validate_input_path(CONFIG["raw_data_path"])
    raw_df = pd.read_csv(validated)
    validate_min_rows(raw_df)

    workflow_result = run_core_workflow(str(validated))
    display(workflow_result["baseline_df"])
    display(workflow_result["tuned_df"])

    # Lowest value in the "rmse" column comes first after an ascending sort.
    best_baseline = workflow_result["baseline_df"].sort_values("rmse").iloc[0]
    print("Best baseline model:", best_baseline["model"])
except Exception as exc:
    # str(exc) alone can be empty or ambiguous (a KeyError shows only the quoted
    # key), so the exception class is included to keep failures diagnosable.
    print("Workflow run skipped/failed:", f"{type(exc).__name__}: {exc}")


## 6) Add Basic Unit Tests
Create lightweight tests for success and failure cases and run them in-notebook.

In [None]:
def _assert_raises(func, expected_exception, *args, **kwargs):
    try:
        func(*args, **kwargs)
    except expected_exception:
        return True
    except Exception as wrong_exc:
        raise AssertionError(f"Expected {expected_exception}, got {type(wrong_exc)}") from wrong_exc
    raise AssertionError(f"Expected {expected_exception} but no exception was raised")


def run_basic_tests():
    """Run lightweight checks for the validation helpers and report success.

    The path checks use files created in a temporary directory, so the
    outcome does not depend on what exists in the working directory.

    Raises:
        AssertionError: If a helper raises the wrong exception, raises none
            where one is expected, or returns an unexpected value.
    """
    import tempfile  # local import: only these checks need it

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_root = Path(tmp_dir)

        # Failure case: bad extension (the file exists, so only the extension is wrong)
        bad_extension = tmp_root / "not_a_csv.txt"
        bad_extension.write_text("x\n1\n")
        assert _assert_raises(validate_input_path, ValueError, str(bad_extension))

        # Failure case: correct extension but the file is missing
        missing_csv = tmp_root / "missing.csv"
        assert _assert_raises(validate_input_path, FileNotFoundError, str(missing_csv))

        # Success case: an existing .csv file is returned as a Path
        good_csv = tmp_root / "data.csv"
        good_csv.write_text("x\n1\n")
        assert validate_input_path(str(good_csv)) == good_csv

    # Success case: dataframe minimum rows check
    test_df = pd.DataFrame({"x": list(range(25))})
    validate_min_rows(test_df, min_rows=20)

    # Failure case: not enough rows
    small_df = pd.DataFrame({"x": [1, 2]})
    assert _assert_raises(validate_min_rows, ValueError, small_df, 20)

    print("Basic notebook tests passed")


# Run the checks as soon as the cell executes so a failing assertion surfaces here.
run_basic_tests()
