# Loading the dataset

In [None]:
import kagglehub

# Download latest version
# Fetches the "dilwong/flightprices" dataset through kagglehub and keeps the
# value returned by `dataset_download` in `path` (printed below as the
# location of the dataset files).
# NOTE(review): the following cells read "data/itineraries.csv" rather than
# `path` — presumably the file is copied there by hand; confirm.
path = kagglehub.dataset_download("dilwong/flightprices")

print("Path to dataset files:", path)

In [86]:
import pandas as pd

# Preview only: parse just the first five rows instead of the whole CSV.
# The full file is very large (row labels beyond 64 million show up further
# down in this notebook), and every later step either streams it in chunks or
# rebinds `df`, so loading everything here would only cost time and memory.
df = pd.read_csv("data/itineraries.csv", nrows=5)
df.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,...,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode
0,9ca0e81111c683bec1012473feefd28f,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H29M,0,False,False,...,1650223560,2022-04-17T15:26:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,8940,947,coach
1,98685953630e772a098941b71906592b,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,...,1650200400,2022-04-17T09:00:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9000,947,coach
2,98d90cbc32bfbb05c2fc32897c7c1087,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,...,1650218700,2022-04-17T14:05:00.000-04:00,BOS,ATL,Delta,DL,Boeing 757-200,9000,947,coach
3,969a269d38eae583f455486fa90877b4,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H32M,0,False,False,...,1650227460,2022-04-17T16:31:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9120,947,coach
4,980370cf27c89b40d2833a1d5afc9751,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H34M,0,False,False,...,1650213180,2022-04-17T12:33:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9240,947,coach


# Pre-processing

## List out the most popular routes

In [87]:
from collections import Counter

# Tally every origin-destination pair while streaming the CSV: only the two
# airport columns are parsed, 500,000 rows at a time.
route_counts = Counter()

csv_chunks = pd.read_csv(
    "data/itineraries.csv",
    usecols=["startingAirport", "destinationAirport"],
    chunksize=500_000,
)
for chunk in csv_chunks:
    # Rows lacking either airport cannot form a route label.
    chunk = chunk.dropna(subset=["startingAirport", "destinationAirport"])

    # Build labels of the form "<origin>-<destination>".
    routes = chunk["startingAirport"].str.cat(chunk["destinationAirport"], sep="-")

    route_counts.update(routes)

# Keep the 20 most frequent routes as a small two-column table.
route_df = pd.DataFrame(route_counts.most_common(20), columns=["Route", "Count"])
print(route_df)

      Route   Count
0   ATL-LAX  709809
1   LAX-BOS  679169
2   LGA-LAX  677713
3   LAX-ATL  669609
4   LAX-LGA  663659
5   BOS-LAX  644390
6   LAX-JFK  625496
7   LAX-ORD  620576
8   DFW-LAX  612390
9   LAX-DFW  610669
10  JFK-LAX  605017
11  LAX-DTW  601537
12  ORD-LAX  597847
13  LAX-EWR  587270
14  DTW-LAX  582022
15  CLT-LAX  572097
16  JFK-ORD  557152
17  LAX-CLT  554474
18  LGA-ORD  550319
19  LAX-PHL  549880


## List out the most popular airlines on LAX - JFK

In [88]:
origin = "LAX"
destination = "JFK"

# Count the raw `segmentsAirlineName` values seen on the chosen route,
# streaming the CSV in 500,000-row chunks.
airline_counts = Counter()

csv_chunks = pd.read_csv(
    "data/itineraries.csv",
    usecols=["startingAirport", "destinationAirport", "segmentsAirlineName"],
    chunksize=500_000,
)
for chunk in csv_chunks:
    # Discard rows where any of the three fields is missing.
    chunk = chunk.dropna(subset=["startingAirport", "destinationAirport", "segmentsAirlineName"])

    # True for rows that start at `origin` and end at `destination`.
    mask = chunk["startingAirport"].eq(origin) & chunk["destinationAirport"].eq(destination)
    airline_counts.update(chunk.loc[mask, "segmentsAirlineName"])

# Every distinct airline string, most frequent first; show the top ten.
airline_df = pd.DataFrame(airline_counts.most_common(), columns=["Airline", "FlightCount"])
print(airline_df.head(10))

                                Airline  FlightCount
0  American Airlines||American Airlines       151007
1                       JetBlue Airways       113339
2                     American Airlines        93532
3      Alaska Airlines||Alaska Airlines        84522
4                                 Delta        71229
5                          Delta||Delta        41616
6      JetBlue Airways||JetBlue Airways        23101
7                                United        16705
8                        United||United        14394
9               United||Alaska Airlines         7268


## Only keep LAX - JFK American Airlines flights

In [89]:
origin = "LAX"
destination = "JFK"
airline = "American Airlines"

# Stream the full CSV (all columns) and keep only the rows for the chosen
# route whose airline field mentions `airline`.
chunks = []

for chunk in pd.read_csv("data/itineraries.csv", chunksize=500_000):
    on_route = chunk["startingAirport"].eq(origin) & chunk["destinationAirport"].eq(destination)
    # Case-insensitive substring match; missing airline names count as no match.
    # NOTE(review): being a substring test, this also keeps rows whose airline
    # field lists several carriers as long as one of them matches — confirm
    # that mixed-carrier itineraries are meant to be included.
    mentions_airline = chunk["segmentsAirlineName"].str.contains(airline, case=False, na=False)

    filtered = chunk.loc[on_route & mentions_airline].copy()
    if filtered.empty:
        continue
    chunks.append(filtered)

# Stitch the per-chunk matches back into one frame.
df = pd.concat(chunks)
print(df.shape)

(246932, 27)


## List out missing values

In [90]:
# Per-column count of missing cells, largest first, plus the same figure
# expressed as a percentage of all rows.
missing = df.isna().sum().sort_values(ascending=False)
missing_percent = missing.div(len(df)).mul(100)

missing_summary = pd.DataFrame(
    {"Missing Values": missing, "Percent Missing": missing_percent.round(2)}
)

print(missing_summary.head(15))

                                   Missing Values  Percent Missing
totalTravelDistance                          1432             0.58
segmentsDistance                              358             0.14
legId                                           0             0.00
segmentsDurationInSeconds                       0             0.00
segmentsEquipmentDescription                    0             0.00
segmentsAirlineCode                             0             0.00
segmentsAirlineName                             0             0.00
segmentsDepartureAirportCode                    0             0.00
segmentsArrivalAirportCode                      0             0.00
segmentsArrivalTimeRaw                          0             0.00
segmentsArrivalTimeEpochSeconds                 0             0.00
segmentsDepartureTimeRaw                        0             0.00
segmentsDepartureTimeEpochSeconds               0             0.00
seatsRemaining                                  0             

## Drop rows with missing values and duplicates

In [91]:
# Remove rows lacking either distance field, then remove exact duplicate rows.
df = (
    df
    .dropna(subset=["totalTravelDistance", "segmentsDistance"])
    .drop_duplicates()
)
print(df.shape)

(245500, 27)


## Further pre-process
Since only a small share of the rows (about 6%: 14,763 of 245,500) are basic-economy tickets, drop those rows and keep only the non-basic-economy fares.

In [92]:
# Snapshot of the column names (not used further in this cell).
cols = list(df.columns)

# Narrow the frame to the dates, the fare and the basic-economy flag.
df_essential = df.loc[:, ["searchDate", "flightDate", "totalFare", "isBasicEconomy"]]

# Keep rows whose flag equals False; the flag column is then redundant.
not_basic_economy = df_essential['isBasicEconomy'].eq(False)
df_non_basic_economy = df_essential.loc[not_basic_economy].drop(columns='isBasicEconomy')

print(df_non_basic_economy.head())
print(df_non_basic_economy.shape)

      searchDate  flightDate  totalFare
4815  2022-04-16  2022-04-17      366.6
4818  2022-04-16  2022-04-17      366.6
4820  2022-04-16  2022-04-17      366.6
4822  2022-04-16  2022-04-17      366.6
4823  2022-04-16  2022-04-17      366.6
(230737, 3)


Since the dataset starts with flights on April 17, the earliest flights have almost no search-price history (e.g. the flight on April 17 only has prices from searches made right before departure). We therefore drop all rows with a flightDate before June 1 (2022-06-01). Since we only want to predict summer fares, we also drop all rows with a flightDate after August 31 (2022-08-31).

In [93]:
# Parse the flight date so it can be compared against calendar bounds.
df_non_basic_economy['flightDate'] = pd.to_datetime(df_non_basic_economy['flightDate'])

# Step 1: keep flights departing on or after 2022-06-01.
from_june = df_non_basic_economy['flightDate'].ge(pd.to_datetime('2022-06-01'))
df_non_basic_economy_filtered = df_non_basic_economy.loc[from_june]

# Step 2: of those, keep flights departing on or before 2022-08-31.
until_august = df_non_basic_economy_filtered['flightDate'].le(pd.to_datetime('2022-08-31'))
df = df_non_basic_economy_filtered.loc[until_august]

print(df)
print(df.shape)

          searchDate flightDate  totalFare
485631    2022-04-17 2022-06-01     272.60
485638    2022-04-17 2022-06-01     281.60
485639    2022-04-17 2022-06-01     281.60
485640    2022-04-17 2022-06-01     281.60
485650    2022-04-17 2022-06-01     328.60
...              ...        ...        ...
64430406  2022-08-30 2022-08-31     527.60
64430411  2022-08-30 2022-08-31    1412.61
64430412  2022-08-30 2022-08-31    1738.60
64430413  2022-08-30 2022-08-31    1738.60
64430414  2022-08-30 2022-08-31    1738.60

[130450 rows x 3 columns]
(130450, 3)


## Group and Aggregate
Group rows by searchDate and flightDate, then average the totalFare for each unique combination.


In [94]:
# The date-filtered frame produced by the previous step is stored in `df`;
# `df_filtered` is not assigned in any earlier cell, so on a fresh kernel the
# original code raised NameError. Bind the name explicitly before using it.
df_filtered = df

# One row per (searchDate, flightDate) pair, holding the mean totalFare of all
# itineraries that share that pair.
df_grouped = df_filtered.groupby(['searchDate', 'flightDate'])['totalFare'].mean().reset_index()
print(f"Grouped dataset shape: {df_grouped.shape}")
print(f"Reduced from {len(df_filtered)} rows to {len(df_grouped)} rows")
df_grouped.iloc[0:25]

Grouped dataset shape: (4599, 3)
Reduced from 130450 rows to 4599 rows


Unnamed: 0,searchDate,flightDate,totalFare
0,2022-04-17,2022-06-01,366.445238
1,2022-04-17,2022-06-02,383.9125
2,2022-04-17,2022-06-03,406.672917
3,2022-04-17,2022-06-04,424.955385
4,2022-04-17,2022-06-05,416.481111
5,2022-04-17,2022-06-06,398.401143
6,2022-04-17,2022-06-07,341.123462
7,2022-04-17,2022-06-08,362.764
8,2022-04-17,2022-06-09,403.721212
9,2022-04-17,2022-06-10,416.877097


## Calculate Days to Departure
Compute the number of days between the search date and flight date for each record.


In [95]:
# Parse both date columns, then store how many whole days separate the
# search from the flight.
for date_col in ('searchDate', 'flightDate'):
    df_grouped[date_col] = pd.to_datetime(df_grouped[date_col])

lead_time = df_grouped['flightDate'] - df_grouped['searchDate']
df_grouped['days_to_departure'] = lead_time.dt.days
df_grouped.iloc[0:25]

Unnamed: 0,searchDate,flightDate,totalFare,days_to_departure
0,2022-04-17,2022-06-01,366.445238,45
1,2022-04-17,2022-06-02,383.9125,46
2,2022-04-17,2022-06-03,406.672917,47
3,2022-04-17,2022-06-04,424.955385,48
4,2022-04-17,2022-06-05,416.481111,49
5,2022-04-17,2022-06-06,398.401143,50
6,2022-04-17,2022-06-07,341.123462,51
7,2022-04-17,2022-06-08,362.764,52
8,2022-04-17,2022-06-09,403.721212,53
9,2022-04-17,2022-06-10,416.877097,54


## Pivot Data
Reshape the data with flightDate_month_day as rows and days_to_departure as columns for multi-output regression.


In [96]:
# Row label: the flight date rendered as month-day only (year dropped).
df_grouped['flightDate_month_day'] = df_grouped['flightDate'].dt.strftime('%m-%d')

# Wide layout: one row per flight date, one column per days_to_departure
# value, each cell holding the averaged totalFare.
df_pivoted = df_grouped.pivot(
    index='flightDate_month_day',
    columns='days_to_departure',
    values='totalFare',
)

# Integer column labels, largest first, mapped to readable names such as
# "7 days to departure".
numeric_cols = sorted((int(col) for col in df_pivoted.columns), reverse=True)
column_mapping = {days: f"{days} days to departure" for days in numeric_cols}

df_pivoted = df_pivoted.rename(columns=column_mapping)

df_pivoted.iloc[0:25]


days_to_departure,1 days to departure,2 days to departure,3 days to departure,4 days to departure,5 days to departure,6 days to departure,7 days to departure,8 days to departure,9 days to departure,10 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,417.09087,397.650952,420.162353,470.177,452.476957,527.084737,493.403158,527.262609,495.3625,476.796667,...,,,,,,,,,,
06-02,415.924167,403.580455,395.881111,434.208667,445.159744,485.058333,476.95,500.29375,491.063333,434.014074,...,,,,,,,,,,
06-03,454.933889,425.447826,418.852917,431.945,454.743939,474.713043,480.011923,482.575,497.086897,494.309565,...,,,,,,,,,,
06-04,542.497143,461.461111,468.624,463.863913,451.170417,484.1,492.108,473.958929,477.646154,465.987273,...,,,,,,,,,,
06-05,879.026154,573.712174,667.654,634.514286,591.014643,623.223077,628.157895,628.789474,708.747059,647.606429,...,,,,,,,,,,
06-06,558.055714,550.206,530.795238,549.848462,556.404839,531.68871,508.586957,561.983333,572.066667,590.309091,...,,,,,,,,,,
06-07,599.839524,891.2448,1399.859091,799.3608,914.669643,563.844444,847.217308,595.122222,970.373077,852.691304,...,341.123462,,,,,,,,,
06-08,1507.860455,1520.783913,1378.587241,1250.355714,1721.851364,1427.6,1417.504,1051.26913,774.063182,878.011111,...,355.955172,362.764,,,,,,,,
06-09,1173.795625,1297.605789,1499.241579,1249.956538,1318.926667,1239.577778,1392.407692,1049.382273,676.377813,659.13,...,442.599677,,403.721212,,,,,,,
06-10,818.848333,1485.269167,1157.506667,1681.200667,1033.963478,2016.611111,1245.33,818.877778,707.129412,649.574,...,,426.145,,416.877097,,,,,,


## Add Date-Based Features

Additional date-based features are added to capture weekly and seasonal
patterns in flight pricing.

In [97]:
# Rebuild full dates from the month-day row labels by prefixing the year 2022.
flightDate = pd.to_datetime('2022-' + df_pivoted.index)

# Reference dates used by the features below.
# NOTE(review): the holiday list is hard-coded to these two dates — confirm it
# covers every holiday that should be flagged.
us_2022_holidays = pd.to_datetime(['2022-06-20', '2022-07-04'])
summer_start = pd.to_datetime('2022-06-21')

# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
is_weekend = flightDate.dayofweek.isin([5, 6]).astype(int)
is_public_holiday = flightDate.isin(us_2022_holidays).astype(int)
# Signed offset in days relative to `summer_start` (negative before it).
days_from_summer_start = (flightDate - summer_start).days
day_of_month = flightDate.day

# Insert the features as the leading columns, in this exact order.
date_features = {
    'is_weekend': is_weekend,
    'is_public_holiday': is_public_holiday,
    'days_from_summer_start': days_from_summer_start,
    'day_of_month': day_of_month,
    "flight_day_of_week": flightDate.dayofweek,
    "flight_month": flightDate.month,
}
for position, (feature_name, feature_values) in enumerate(date_features.items()):
    df_pivoted.insert(position, feature_name, feature_values)

print(f"Updated dataframe shape: {df_pivoted.shape}")
print(f"New columns: {list(df_pivoted.columns[:6])}")

df_pivoted.iloc[0:25]

Updated dataframe shape: (92, 66)
New columns: ['is_weekend', 'is_public_holiday', 'days_from_summer_start', 'day_of_month', 'flight_day_of_week', 'flight_month']


days_to_departure,is_weekend,is_public_holiday,days_from_summer_start,day_of_month,flight_day_of_week,flight_month,1 days to departure,2 days to departure,3 days to departure,4 days to departure,...,51 days to departure,52 days to departure,53 days to departure,54 days to departure,55 days to departure,56 days to departure,57 days to departure,58 days to departure,59 days to departure,60 days to departure
flightDate_month_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
06-01,0,0,-20,1,2,6,417.09087,397.650952,420.162353,470.177,...,,,,,,,,,,
06-02,0,0,-19,2,3,6,415.924167,403.580455,395.881111,434.208667,...,,,,,,,,,,
06-03,0,0,-18,3,4,6,454.933889,425.447826,418.852917,431.945,...,,,,,,,,,,
06-04,1,0,-17,4,5,6,542.497143,461.461111,468.624,463.863913,...,,,,,,,,,,
06-05,1,0,-16,5,6,6,879.026154,573.712174,667.654,634.514286,...,,,,,,,,,,
06-06,0,0,-15,6,0,6,558.055714,550.206,530.795238,549.848462,...,,,,,,,,,,
06-07,0,0,-14,7,1,6,599.839524,891.2448,1399.859091,799.3608,...,341.123462,,,,,,,,,
06-08,0,0,-13,8,2,6,1507.860455,1520.783913,1378.587241,1250.355714,...,355.955172,362.764,,,,,,,,
06-09,0,0,-12,9,3,6,1173.795625,1297.605789,1499.241579,1249.956538,...,442.599677,,403.721212,,,,,,,
06-10,0,0,-11,10,4,6,818.848333,1485.269167,1157.506667,1681.200667,...,,426.145,,416.877097,,,,,,


## Handle Missing Values in Fare Curves

In [98]:
# Fare columns are the ones whose label contains "days to departure".
target_cols = [name for name in df_pivoted.columns if "days to departure" in name]

# First pass: interpolate along each row (axis=1), i.e. across the fare columns
# of a single flight date.
interpolated = df_pivoted[target_cols].interpolate(axis=1)

# Second pass: any cell still empty receives the median of its column.
df_pivoted[target_cols] = interpolated.fillna(interpolated.median())

## Normalize Continuous Features Only

Continuous input features are scaled to the [0, 1] range.

In [99]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale; every other column is left as it is.
continuous_features = ["day_of_month", "days_from_summer_start", "flight_day_of_week", "flight_month"]

# Fit the scaler on these columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on every row, before the
# train/validation/test split that follows — confirm this is acceptable for
# these calendar-derived features.
scaler = MinMaxScaler()
scaled_values = scaler.fit(df_pivoted[continuous_features]).transform(df_pivoted[continuous_features])
df_pivoted[continuous_features] = scaled_values

## Split Dataset
Split the dataset into train (70%), validation (15%), and test (15%) sets using random sampling.


In [100]:
from sklearn.model_selection import train_test_split

# The fully prepared table is what gets partitioned.
df_for_split = df_pivoted

# Two-stage random split with a fixed seed: hold out 30% of the rows, then cut
# that hold-out in half to obtain the validation and test sets.
df_train, df_temp = train_test_split(df_for_split, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

# Report the size of each partition.
print(f"Train set shape: {df_train.shape}")
print(f"Validation set shape: {df_val.shape}")
print(f"Test set shape: {df_test.shape}")

# Report each partition as a share of all rows.
print("\nSplit percentages:")
print(f"Train: {len(df_train) / len(df_for_split) * 100:.1f}% ({len(df_train)} rows)")
print(f"Validation: {len(df_val) / len(df_for_split) * 100:.1f}% ({len(df_val)} rows)")
print(f"Test: {len(df_test) / len(df_for_split) * 100:.1f}% ({len(df_test)} rows)")


Train set shape: (64, 66)
Validation set shape: (14, 66)
Test set shape: (14, 66)

Split percentages:
Train: 69.6% (64 rows)
Validation: 15.2% (14 rows)
Test: 15.2% (14 rows)


## Save Splits
Export the train, validation, and test splits to CSV files in the data/splits folder.


In [101]:
# Save splits
from pathlib import Path

# `DataFrame.to_csv` does not create missing parent folders, so make sure the
# output directory exists before writing (a no-op when it is already there).
splits_dir = Path('data/splits')
splits_dir.mkdir(parents=True, exist_ok=True)

# index=True writes the row index (the month-day flight date label) as the
# first CSV column, so each row stays identifiable after reloading.
df_train.to_csv(splits_dir / 'train.csv', index=True)
df_val.to_csv(splits_dir / 'val.csv', index=True)
df_test.to_csv(splits_dir / 'test.csv', index=True)