Preprocessing v3

This version tries out improved preprocessing steps:
- better handling of missing price values
- normalizing the date features
- saving new train/val/test splits so we can compare results with v2


In [81]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

BASE_DIR = "aml_project"
DATA_DIR = os.path.join(BASE_DIR, "data")

In [82]:
# Load the same preprocessed dataset used in preprocessing_v2
df = pd.read_csv("aml_project/data/preprocessed_dataset.csv")

print("Original shape:", df.shape)
df.head()

Original shape: (130450, 13)


Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure,search_dow,flight_dow,search_month,flight_month,search_week,flight_week,is_weekend_flight,days_into_summer,is_peak_travel_period
0,2022-04-17,2022-06-01,272.6,45,6,2,4,6,15,22,0,0,0
1,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
2,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
3,2022-04-17,2022-06-01,281.6,45,6,2,4,6,15,22,0,0,0
4,2022-04-17,2022-06-01,328.6,45,6,2,4,6,15,22,0,0,0


In [83]:
# Keep only the columns we need for grouping and later processing
df = df[['searchDate', 'flightDate', 'totalFare']].copy()

print("Filtered shape:", df.shape)
df.head()

Filtered shape: (130450, 3)


Unnamed: 0,searchDate,flightDate,totalFare
0,2022-04-17,2022-06-01,272.6
1,2022-04-17,2022-06-01,281.6
2,2022-04-17,2022-06-01,281.6
3,2022-04-17,2022-06-01,281.6
4,2022-04-17,2022-06-01,328.6


In [84]:
# Average totalFare for each (searchDate, flightDate) pair
df_grouped = df.groupby(['searchDate', 'flightDate'])['totalFare'].mean().reset_index()

print("Grouped shape:", df_grouped.shape)
df_grouped.head()

Grouped shape: (4599, 3)


Unnamed: 0,searchDate,flightDate,totalFare
0,2022-04-17,2022-06-01,366.445238
1,2022-04-17,2022-06-02,383.9125
2,2022-04-17,2022-06-03,406.672917
3,2022-04-17,2022-06-04,424.955385
4,2022-04-17,2022-06-05,416.481111


In [85]:
# Convert to datetime and compute days_to_departure
df_grouped['searchDate'] = pd.to_datetime(df_grouped['searchDate'])
df_grouped['flightDate'] = pd.to_datetime(df_grouped['flightDate'])
df_grouped['days_to_departure'] = (df_grouped['flightDate'] - df_grouped['searchDate']).dt.days

df_grouped.head()

Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure
0,2022-04-17,2022-06-01,366.445238,45
1,2022-04-17,2022-06-02,383.9125,46
2,2022-04-17,2022-06-03,406.672917,47
3,2022-04-17,2022-06-04,424.955385,48
4,2022-04-17,2022-06-05,416.481111,49


In [86]:
# Use month-day as index
df_grouped['flightDate_month_day'] = df_grouped['flightDate'].dt.strftime('%m-%d')

# Pivot to get one row per flight date and one column per days_to_departure
df_pivoted = df_grouped.pivot(
    index='flightDate_month_day',
    columns='days_to_departure',
    values='totalFare'
)

# Rename columns like "1 days to departure"
numeric_cols = sorted([int(c) for c in df_pivoted.columns], reverse=True)
df_pivoted = df_pivoted.rename(columns={d: f"{d} days to departure" for d in numeric_cols})

print("Pivoted shape:", df_pivoted.shape)
df_pivoted.head()

Pivoted shape: (92, 60)


days_to_departure,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,7 days to departure,8 days to departure,9 days to departure,10 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,493.403158,527.262609,495.3625,476.796667,...,,,,,,,,,,
06-02,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,476.95,500.29375,491.063333,434.014074,...,,,,,,,,,,
06-03,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,480.011923,482.575,497.086897,494.309565,...,,,,,,,,,,
06-04,542.497143,461.461111,468.624,463.863913,451.170417,484.1,492.108,473.958929,477.646154,465.987273,...,,,,,,,,,,
06-05,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,628.157895,628.789474,708.747059,647.606429,...,,,,,,,,,,


In [87]:
# Rebuild dates to compute simple date features
flightDate = pd.to_datetime('2022-' + df_pivoted.index)

is_weekend = flightDate.dayofweek.isin([5, 6]).astype(int)

us_2022_holidays = pd.to_datetime(['2022-06-20', '2022-07-04'])
is_public_holiday = flightDate.isin(us_2022_holidays).astype(int)

summer_start = pd.to_datetime('2022-06-21')
days_from_summer_start = (flightDate - summer_start).days

day_of_month = flightDate.day

# Add features to dataframe
df_pivoted.insert(0, 'is_weekend', is_weekend)
df_pivoted.insert(1, 'is_public_holiday', is_public_holiday)
df_pivoted.insert(2, 'days_from_summer_start', days_from_summer_start)
df_pivoted.insert(3, 'day_of_month', day_of_month)

print("Added date features:", df_pivoted.shape)
df_pivoted.head()

Added date features: (92, 64)


days_to_departure,is_weekend,is_public_holiday,days_from_summer_start,day_of_month,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,0,0,-20,1,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,...,,,,,,,,,,
06-02,0,0,-19,2,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,...,,,,,,,,,,
06-03,0,0,-18,3,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,...,,,,,,,,,,
06-04,1,0,-17,4,542.497143,461.461111,468.624,463.863913,451.170417,484.1,...,,,,,,,,,,
06-05,1,0,-16,5,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,...,,,,,,,,,,


In [88]:
# Count missing price values before better handling
price_cols = [c for c in df_pivoted.columns if 'days to departure' in c]
initial_nans = df_pivoted[price_cols].isna().sum().sum()
print("NaNs before interpolation:", initial_nans)

NaNs before interpolation: 921


In [89]:
# =========================================================
# SELECT WHICH FILLING METHOD TO APPLY
# =========================================================

# Options:
# "v3_interpolation" – linear smoothing
# "v3_ffill" – forward/backward fill
# (more experiments can be added later)

PREPROCESS_VERSION = "v3_ffill"

In [90]:
# =========================================================
# APPLY PRICE CLEANING BASED ON VERSION
# =========================================================

if PREPROCESS_VERSION == "v3_interpolation":
    print("Applying linear interpolation...")
    df_pivoted[price_cols] = (
        df_pivoted[price_cols]
        .T
        .interpolate(method='linear', limit_direction='both')
        .T
    )

elif PREPROCESS_VERSION == "v3_ffill":
    print("Applying forward/backward fill...")
    df_pivoted[price_cols] = df_pivoted[price_cols].ffill(axis=1).bfill(axis=1)

else:
    raise ValueError("Unknown PREPROCESS_VERSION. Choose one of the supported versions.")

Applying forward/backward fill...


In [91]:
# Normalize date features to the range [0, 1]
feature_cols = ['day_of_month', 'is_weekend', 'is_public_holiday', 'days_from_summer_start']

scaler = MinMaxScaler()
df_pivoted[feature_cols] = scaler.fit_transform(df_pivoted[feature_cols])

df_pivoted[feature_cols].describe()

days_to_departure,day_of_month,is_weekend,is_public_holiday,days_from_summer_start
count,92.0,92.0,92.0,92.0
mean,0.494565,0.282609,0.021739,0.5
std,0.296758,0.452735,0.14663,0.293429
min,0.0,0.0,0.0,0.0
25%,0.233333,0.0,0.0,0.25
50%,0.5,0.0,0.0,0.5
75%,0.741667,1.0,0.0,0.75
max,1.0,1.0,1.0,1.0


I tried smoothing missing values using interpolation and added a few time-related features.

The dataset is cleaner, but the model actually performed worse (higher MAE).

This suggests that smoothing removed real price jumps the model needs to learn from.

Even though the MAE went up, this version helps us understand how sensitive the model is to different preprocessing choices.

Next step is to try alternatives that keep the natural price movement while still reducing noise.

In [92]:
# Split the dataset the same way as v2 (70/15/15)
os.makedirs('data/splits_v3', exist_ok=True)

df_train_v3, df_temp_v3 = train_test_split(df_pivoted, test_size=0.3, random_state=42)
df_val_v3, df_test_v3 = train_test_split(df_temp_v3, test_size=0.5, random_state=42)

print("Train:", df_train_v3.shape)
print("Val:", df_val_v3.shape)
print("Test:", df_test_v3.shape)

Train: (64, 64)
Val: (14, 64)
Test: (14, 64)


In [93]:
df_train_v3.to_csv('data/splits_v3/train.csv')
df_val_v3.to_csv('data/splits_v3/val.csv')
df_test_v3.to_csv('data/splits_v3/test.csv')

print("Saved v3 splits.")

Saved v3 splits.


In [94]:
# =========================================================
# EXTRA FEATURES — v4 VERSION (calendar features)
# =========================================================
df_v4 = df_pivoted.copy()

# Rebuild full flight dates from the index
flightDate_full = pd.to_datetime('2022-' + df_v4.index)

# Extra feature 1: day of week as a smooth numeric feature (0–1)
df_v4["flight_day_of_week"] = flightDate_full.dayofweek / 6.0

# Extra feature 2: month as a smooth numeric feature (0–1)
df_v4["flight_month"] = (flightDate_full.month - 1) / 11.0

print("v4 dataframe shape (with extra features):", df_v4.shape)
df_v4.head()


v4 dataframe shape (with extra features): (92, 66)


days_to_departure,is_weekend,is_public_holiday,days_from_summer_start,day_of_month,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,...,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure,flight_day_of_week,flight_month
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,0.0,0.0,0.0,0.0,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,...,366.445238,366.445238,366.445238,366.445238,366.445238,366.445238,366.445238,366.445238,0.333333,0.454545
06-02,0.0,0.0,0.010989,0.033333,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,...,383.9125,383.9125,383.9125,383.9125,383.9125,383.9125,383.9125,383.9125,0.5,0.454545
06-03,0.0,0.0,0.021978,0.066667,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,...,406.672917,406.672917,406.672917,406.672917,406.672917,406.672917,406.672917,406.672917,0.666667,0.454545
06-04,1.0,0.0,0.032967,0.1,542.497143,461.461111,468.624,463.863913,451.170417,484.1,...,424.955385,424.955385,424.955385,424.955385,424.955385,424.955385,424.955385,424.955385,0.833333,0.454545
06-05,1.0,0.0,0.043956,0.133333,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,...,416.481111,416.481111,416.481111,416.481111,416.481111,416.481111,416.481111,416.481111,1.0,0.454545


In [95]:
# =========================================================
# Split and save v4 dataset
# =========================================================
os.makedirs("data/splits_v4", exist_ok=True)

df_train_v4, df_temp_v4 = train_test_split(df_v4, test_size=0.3, random_state=42)
df_val_v4, df_test_v4 = train_test_split(df_temp_v4, test_size=0.5, random_state=42)

df_train_v4.to_csv("data/splits_v4/train.csv")
df_val_v4.to_csv("data/splits_v4/val.csv")
df_test_v4.to_csv("data/splits_v4/test.csv")

print("Saved v4 splits.")
print("Train v4:", df_train_v4.shape)
print("Val v4:", df_val_v4.shape)
print("Test v4:", df_test_v4.shape)


Saved v4 splits.
Train v4: (64, 66)
Val v4: (14, 66)
Test v4: (14, 66)


In [96]:
# =========================================================
# CREATE v5 VERSION (LOG-TRANSFORMED TARGETS)
# =========================================================

df_v5 = df_v4.copy()

# Apply log1p (safe for zero prices)
price_cols = [c for c in df_v5.columns if "days to departure" in c]
df_v5[price_cols] = np.log1p(df_v5[price_cols])

print("v5 dataframe shape:", df_v5.shape)

save_dir = "aml_project/data/splits_v5"
os.makedirs(save_dir, exist_ok=True)

df_train_v5, df_temp_v5 = train_test_split(df_v5, test_size=0.3, random_state=42)
df_val_v5, df_test_v5 = train_test_split(df_temp_v5, test_size=0.5, random_state=42)

df_train_v5.to_csv(f"{save_dir}/train.csv")
df_val_v5.to_csv(f"{save_dir}/val.csv")
df_test_v5.to_csv(f"{save_dir}/test.csv")

print("Saved v5 splits (log-transformed prices).")


v5 dataframe shape: (92, 66)
Saved v5 splits (log-transformed prices).


In [97]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# =======================
# LOAD BASE DATA (same as v2-v5)
# =======================
df = pd.read_csv("aml_project/data/preprocessed_dataset.csv")
df = df[['searchDate', 'flightDate', 'totalFare']].copy()

df_grouped = df.groupby(['searchDate', 'flightDate'])['totalFare'].mean().reset_index()
df_grouped['searchDate'] = pd.to_datetime(df_grouped['searchDate'])
df_grouped['flightDate'] = pd.to_datetime(df_grouped['flightDate'])
df_grouped['days_to_departure'] = (df_grouped['flightDate'] - df_grouped['searchDate']).dt.days

df_grouped['flightDate_month_day'] = df_grouped['flightDate'].dt.strftime('%m-%d')

df_pivoted = df_grouped.pivot(
    index='flightDate_month_day',
    columns='days_to_departure',
    values='totalFare'
)

numeric_cols = sorted([int(c) for c in df_pivoted.columns], reverse=True)
df_pivoted = df_pivoted.rename(columns={d: f"{d} days to departure" for d in numeric_cols})

# =======================
# ADD BASIC FEATURES (same as v4)
# =======================
flightDate = pd.to_datetime("2022-" + df_pivoted.index)

df_pivoted["day_of_month"] = flightDate.day
df_pivoted["is_weekend"] = flightDate.dayofweek.isin([5, 6]).astype(int)
df_pivoted["is_public_holiday"] = flightDate.isin(
    pd.to_datetime(["2022-06-20", "2022-07-04"])
).astype(int)
df_pivoted["days_from_summer_start"] = (flightDate - pd.to_datetime("2022-06-21")).days
df_pivoted["flight_day_of_week"] = flightDate.dayofweek / 6.0
df_pivoted["flight_month"] = (flightDate.month - 1) / 11.0

# =======================
# v6 EXTRA FEATURES (HUGE IMPROVEMENT)
# =======================
df_pivoted["week_of_year"] = flightDate.isocalendar().week
df_pivoted["season"] = (flightDate.month % 12 + 3) // 3

# =======================
# NORMALIZE DATE FEATURES
# =======================
feature_cols = [
    "day_of_month", 
    "is_weekend",
    "is_public_holiday",
    "days_from_summer_start",
    "flight_day_of_week",
    "flight_month",
    "week_of_year",
    "season"
]

scaler = MinMaxScaler()
df_pivoted[feature_cols] = scaler.fit_transform(df_pivoted[feature_cols])

# =======================
# v6 PRICE CLEANING (BEST METHOD)
# =======================
price_cols = [c for c in df_pivoted.columns if "days to departure" in c]

df_pivoted[price_cols] = (
    df_pivoted[price_cols]
    .interpolate(axis=0)
    .ffill()
    .bfill()
)

# =======================
# v6 TREND FEATURES
# =======================
df_pivoted["mean_price"] = df_pivoted[price_cols].mean(axis=1)
df_pivoted["slope_1"] = df_pivoted["mean_price"].diff()
df_pivoted["slope_2"] = df_pivoted["mean_price"].diff(2)

df_pivoted.fillna(0, inplace=True)

# =======================
# SAVE SPLITS
# =======================
os.makedirs("aml_project/data/splits_v6", exist_ok=True)

train_v6, temp_v6 = train_test_split(df_pivoted, test_size=0.3, random_state=42)
val_v6, test_v6 = train_test_split(temp_v6, test_size=0.5, random_state=42)

train_v6.to_csv("aml_project/data/splits_v6/train.csv")
val_v6.to_csv("aml_project/data/splits_v6/val.csv")
test_v6.to_csv("aml_project/data/splits_v6/test.csv")

print("v6 preprocessing complete. Shapes:")
print("Train:", train_v6.shape)
print("Val:", val_v6.shape)
print("Test:", test_v6.shape)

v6 preprocessing complete. Shapes:
Train: (64, 71)
Val: (14, 71)
Test: (14, 71)


  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
