## Load Dataset
Load the preprocessed dataset and display basic information about its structure.


In [15]:
import pandas as pd
import numpy as np

# Load the preprocessed fare records from disk
df = pd.read_csv("data/preprocessed_dataset.csv")

# Report the table's dimensions and column names, then preview the first rows
print(
    f"Original dataset shape: {df.shape}",
    f"Original columns: {list(df.columns)}",
    sep="\n",
)
df.head()


Original dataset shape: (130450, 13)
Original columns: ['searchDate', 'flightDate', 'totalFare', 'days_to_departure', 'search_dow', 'flight_dow', 'search_month', 'flight_month', 'search_week', 'flight_week', 'is_weekend_flight', 'days_into_summer', 'is_peak_travel_period']


Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure,search_dow,flight_dow,search_month,flight_month,search_week,flight_week,is_weekend_flight,days_into_summer,is_peak_travel_period
0,2022-04-17,2022-06-01,272.6,45,6,2,4,6,15,22,0,0,0
1,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
2,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
3,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
4,2022-04-17,2022-06-01,328.6,45,6,2,4,6,15,22,0,0,0


## Filter Columns
Keep only the three core columns: searchDate, flightDate, and totalFare.


In [16]:
# Restrict the table to the search date, flight date, and fare columns
columns_to_keep = ['searchDate', 'flightDate', 'totalFare']
df_filtered = df.loc[:, columns_to_keep]

# Confirm the reduced shape and column list, then preview the first rows
print(
    f"Filtered dataset shape: {df_filtered.shape}",
    f"Filtered columns: {list(df_filtered.columns)}",
    sep="\n",
)
df_filtered.head()


Filtered dataset shape: (130450, 3)
Filtered columns: ['searchDate', 'flightDate', 'totalFare']


Unnamed: 0,searchDate,flightDate,totalFare
0,2022-04-17,2022-06-01,272.6
1,2022-04-17,2022-06-01,281.6
2,2022-04-17,2022-06-01,281.6
3,2022-04-17,2022-06-01,281.6
4,2022-04-17,2022-06-01,328.6


## Group and Aggregate
Group rows by searchDate and flightDate, then average the totalFare for each unique combination.


In [17]:
# Average totalFare over every (searchDate, flightDate) pair; as_index=False
# keeps the two grouping keys as ordinary columns instead of a MultiIndex
group_keys = ['searchDate', 'flightDate']
df_grouped = df_filtered.groupby(group_keys, as_index=False)['totalFare'].mean()

# Show how much the aggregation shrank the table, then preview the result
print(f"Grouped dataset shape: {df_grouped.shape}")
print(f"Reduced from {len(df_filtered)} rows to {len(df_grouped)} rows")
df_grouped.head(25)

Grouped dataset shape: (4599, 3)
Reduced from 130450 rows to 4599 rows


Unnamed: 0,searchDate,flightDate,totalFare
0,2022-04-17,2022-06-01,366.445238
1,2022-04-17,2022-06-02,383.9125
2,2022-04-17,2022-06-03,406.672917
3,2022-04-17,2022-06-04,424.955385
4,2022-04-17,2022-06-05,416.481111
5,2022-04-17,2022-06-06,398.401143
6,2022-04-17,2022-06-07,341.123462
7,2022-04-17,2022-06-08,362.764
8,2022-04-17,2022-06-09,403.721212
9,2022-04-17,2022-06-10,416.877097


## Calculate Days to Departure
Compute the number of days between the search date and flight date for each record.


In [18]:
# Parse both date columns so they support date arithmetic
for date_col in ('searchDate', 'flightDate'):
    df_grouped[date_col] = pd.to_datetime(df_grouped[date_col])

# Lead time in whole days: flight date minus search date
lead_time = df_grouped['flightDate'] - df_grouped['searchDate']
df_grouped['days_to_departure'] = lead_time.dt.days
df_grouped.head(25)

Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure
0,2022-04-17,2022-06-01,366.445238,45
1,2022-04-17,2022-06-02,383.9125,46
2,2022-04-17,2022-06-03,406.672917,47
3,2022-04-17,2022-06-04,424.955385,48
4,2022-04-17,2022-06-05,416.481111,49
5,2022-04-17,2022-06-06,398.401143,50
6,2022-04-17,2022-06-07,341.123462,51
7,2022-04-17,2022-06-08,362.764,52
8,2022-04-17,2022-06-09,403.721212,53
9,2022-04-17,2022-06-10,416.877097,54


## Pivot Data
Reshape the data with flightDate_month_day as rows and days_to_departure as columns for multi-output regression.


In [19]:
# Label every flight date by month and day only ("mm-dd"), dropping the year
df_grouped['flightDate_month_day'] = df_grouped['flightDate'].dt.strftime('%m-%d')

# Reshape to one row per flight day and one column per days_to_departure value,
# with the averaged totalFare as the cell values
df_pivoted = df_grouped.pivot(
    index='flightDate_month_day',
    columns='days_to_departure',
    values='totalFare',
)

# Distinct days-to-departure values, largest first
numeric_cols = sorted(map(int, df_pivoted.columns), reverse=True)

# Map each integer column label to a readable "<n> days to departure" name
column_mapping = {days: f"{days} days to departure" for days in numeric_cols}

# Apply the readable names to the pivoted columns
df_pivoted = df_pivoted.rename(columns=column_mapping)

df_pivoted.head(25)


days_to_departure,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,7 days to departure,8 days to departure,9 days to departure,10 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,493.403158,527.262609,495.3625,476.796667,...,,,,,,,,,,
06-02,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,476.95,500.29375,491.063333,434.014074,...,,,,,,,,,,
06-03,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,480.011923,482.575,497.086897,494.309565,...,,,,,,,,,,
06-04,542.497143,461.461111,468.624,463.863913,451.170417,484.1,492.108,473.958929,477.646154,465.987273,...,,,,,,,,,,
06-05,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,628.157895,628.789474,708.747059,647.606429,...,,,,,,,,,,
06-06,558.055714,550.206,530.795238,549.848462,556.404839,531.68871,508.586957,561.983333,572.066667,590.309091,...,,,,,,,,,,
06-07,599.839524,891.2448,1399.859091,799.3608,914.669643,563.844444,847.217308,595.122222,970.373077,852.691304,...,341.123462,,,,,,,,,
06-08,1507.860455,1520.783913,1378.587241,1250.355714,1721.851364,1427.6,1417.504,1051.26913,774.063182,878.011111,...,355.955172,362.764,,,,,,,,
06-09,1173.795625,1297.605789,1499.241579,1249.956538,1318.926667,1239.577778,1392.407692,1049.382273,676.377813,659.13,...,442.599677,,403.721212,,,,,,,
06-10,818.848333,1485.269167,1157.506667,1681.200667,1033.963478,2016.611111,1245.33,818.877778,707.129412,649.574,...,,426.145,,416.877097,,,,,,


## Add Date-Based Features

Additional date-based features are added to capture weekly and seasonal
patterns in flight pricing.

In [20]:
# Reconstruct full flight dates from the "mm-dd" index labels.
# The index carries no year, so 2022 is re-attached here.
flightDate = pd.to_datetime('2022-' + df_pivoted.index)

# is_weekend: 1 if Saturday (5) or Sunday (6), 0 otherwise.
# flightDate is a DatetimeIndex, so .dayofweek is used directly (not .dt.dayofweek).
is_weekend = flightDate.dayofweek.isin([5, 6]).astype(int)

# is_public_holiday: 1 if the date is one of the listed US 2022 federal holidays.
# Holidays listed for the June 1 - August 31, 2022 range:
#   Juneteenth (observed): 2022-06-20 (Monday)
#   Independence Day:      2022-07-04 (Monday)
us_2022_holidays = pd.to_datetime(['2022-06-20', '2022-07-04'])
is_public_holiday = flightDate.isin(us_2022_holidays).astype(int)

# days_from_summer_start: days before (negative) or after (positive) June 21, 2022.
# Subtracting datetimes yields a TimedeltaIndex, so .days is used directly (not .dt.days).
summer_start = pd.to_datetime('2022-06-21')
days_from_summer_start = (flightDate - summer_start).days

# day_of_month: day of the month (1-31)
day_of_month = flightDate.day

# New feature columns, listed in the order they should appear (positions 0-3)
date_features = {
    'is_weekend': is_weekend,
    'is_public_holiday': is_public_holiday,
    'days_from_summer_start': days_from_summer_start,
    'day_of_month': day_of_month,
}

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Dropping any copies left by a previous run
# first keeps the cell re-runnable; on a first run the drop is a no-op.
df_pivoted = df_pivoted.drop(columns=list(date_features), errors='ignore')

# Place the features after the index but before the pivoted fare columns
for position, (feature_name, feature_values) in enumerate(date_features.items()):
    df_pivoted.insert(position, feature_name, feature_values)

# Display the updated dataframe to verify the new columns
print(f"Updated dataframe shape: {df_pivoted.shape}")
print(f"New columns: {list(df_pivoted.columns[:4])}")

df_pivoted.iloc[0:50]

Updated dataframe shape: (92, 64)
New columns: ['is_weekend', 'is_public_holiday', 'days_from_summer_start', 'day_of_month']


days_to_departure,is_weekend,is_public_holiday,days_from_summer_start,day_of_month,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,0,0,-20,1,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,...,,,,,,,,,,
06-02,0,0,-19,2,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,...,,,,,,,,,,
06-03,0,0,-18,3,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,...,,,,,,,,,,
06-04,1,0,-17,4,542.497143,461.461111,468.624,463.863913,451.170417,484.1,...,,,,,,,,,,
06-05,1,0,-16,5,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,...,,,,,,,,,,
06-06,0,0,-15,6,558.055714,550.206,530.795238,549.848462,556.404839,531.68871,...,,,,,,,,,,
06-07,0,0,-14,7,599.839524,891.2448,1399.859091,799.3608,914.669643,563.844444,...,341.123462,,,,,,,,,
06-08,0,0,-13,8,1507.860455,1520.783913,1378.587241,1250.355714,1721.851364,1427.6,...,355.955172,362.764,,,,,,,,
06-09,0,0,-12,9,1173.795625,1297.605789,1499.241579,1249.956538,1318.926667,1239.577778,...,442.599677,,403.721212,,,,,,,
06-10,0,0,-11,10,818.848333,1485.269167,1157.506667,1681.200667,1033.963478,2016.611111,...,,426.145,,416.877097,,,,,,


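## Handle Missing Values in Fare Curves
Interpolate missing fares along each row (across days to departure), then fill any values that are still missing with the column median.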

In [21]:
# Work on a copy so the pivoted table itself is left untouched
df_clean = df_pivoted.copy()

# Target columns are the fare-curve columns, identified by their
# "... days to departure" names
target_cols = [c for c in df_clean.columns if "days to departure" in c]

# Step 1: interpolate gaps along each row, i.e. across days to departure
fare_curves = df_clean[target_cols].interpolate(axis=1)

# Step 2: whatever is still missing gets that column's median.
# NOTE(review): the medians are taken over all rows, before the
# train/validation/test split performed later in the notebook.
column_medians = fare_curves.median()
df_clean[target_cols] = fare_curves.fillna(column_medians)

## Add Weekly and Monthly Features

Two additional calendar-based features are added to capture weekly and monthly
patterns in flight pricing. These features are grouped with the other input
variables before the fare-curve target columns.

In [22]:
# Rebuild full flight dates from the "mm-dd" index labels (year 2022 re-attached)
flightDate = pd.to_datetime("2022-" + df_clean.index)

# Weekly and monthly features, in the order they should appear
calendar_features = {
    "flight_day_of_week": flightDate.dayofweek,
    "flight_month": flightDate.month,
}

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second run. Dropping stale copies first keeps the cell
# re-runnable; on a first run the drop is a no-op.
df_clean = df_clean.drop(columns=list(calendar_features), errors="ignore")

# Insert at positions 4 and 5, next to the other input features
for position, (feature_name, feature_values) in enumerate(calendar_features.items(), start=4):
    df_clean.insert(position, feature_name, feature_values)

## Normalize Continuous Features Only

Continuous input features are scaled to the [0, 1] range to ensure comparable
magnitudes during training. Binary indicator features such as `is_weekend` and
`is_public_holiday` are left unchanged, since they already take values in {0, 1}.

In [23]:
from sklearn.preprocessing import MinMaxScaler

# Indicator columns, listed for reference only; they are not passed to the scaler
binary_features = ["is_weekend", "is_public_holiday"]

# Columns rescaled to the [0, 1] range
continuous_features = [
    "day_of_month",
    "days_from_summer_start",
    "flight_day_of_week",
    "flight_month",
]

# Fit the min-max scaler on the continuous columns and write the scaled values back.
# NOTE(review): the scaler is fitted on all rows, before the
# train/validation/test split performed later in the notebook.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_clean[continuous_features])
df_clean[continuous_features] = scaled_values

## Split Dataset
Split the dataset into train (70%), validation (15%), and test (15%) sets using random sampling.


In [24]:
from sklearn.model_selection import train_test_split

# The split operates on the final cleaned dataframe
df_for_split = df_clean

# Stage 1: hold out 30% of the rows; the remaining 70% form the training set
df_train, df_temp = train_test_split(df_for_split, test_size=0.3, random_state=42)

# Stage 2: halve the hold-out so validation and test each get ~15% of all rows
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Report the shape of each split
print(
    f"Train set shape: {df_train.shape}",
    f"Validation set shape: {df_val.shape}",
    f"Test set shape: {df_test.shape}",
    sep="\n",
)

# Report each split's share of the full dataset
print("\nSplit percentages:")
print(
    f"Train: {len(df_train) / len(df_for_split) * 100:.1f}% ({len(df_train)} rows)",
    f"Validation: {len(df_val) / len(df_for_split) * 100:.1f}% ({len(df_val)} rows)",
    f"Test: {len(df_test) / len(df_for_split) * 100:.1f}% ({len(df_test)} rows)",
    sep="\n",
)


Train set shape: (64, 66)
Validation set shape: (14, 66)
Test set shape: (14, 66)

Split percentages:
Train: 69.6% (64 rows)
Validation: 15.2% (14 rows)
Test: 15.2% (14 rows)


## Save Splits
Export the train, validation, and test splits to CSV files in the data/splits folder.


In [25]:
import os

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (a no-op when it is already there)
os.makedirs('data/splits', exist_ok=True)

# Save splits; index=True writes the dataframe index as the first CSV column
df_train.to_csv('data/splits/train.csv', index=True)
df_val.to_csv('data/splits/val.csv', index=True)
df_test.to_csv('data/splits/test.csv', index=True)