# Part 1) Booking Curve Clusters

In [0]:
# Import Libraries

from pyspark.sql import functions as F
from pyspark.sql.functions import weekofyear
from pyspark.sql.window import Window
from datetime import datetime

# Import Data

# Ticket charge history: only the columns needed downstream, charge date as a DATE
sales_history = (
    spark.read.table('data_experience_commercial.cbt_1423_rtsuite.master_uat')
    .select(
        'flightkey',
        F.col('charge_dt').cast('date'),
        'unt_pre',
        'rev_pre',
        'chargeproduct',
        'dtg',
        'bkg_is_ejhsss',
        'reservationid',
    )
)

# Flight dimensions, plus the ISO week-of-year of the flight week
dimensions_history = (
    spark.read.table('data_experience_commercial.cbt_0923_segmentfinder.dimensions_history')
    .select(
        'flightkey',
        'onsale_dt',
        'ty_capacity',
        'parentregion',
        F.col('flight_dt').cast('date'),
        'sector',
        F.col('flight_wk').cast('date'),
    )
    .withColumn('WoY', F.weekofyear(F.col('flight_wk')))
)

# Filter Data

# ticket only + eliminate covid, ss, and not yet flown
filtered_sales = (
    sales_history
    .filter(F.col('chargeproduct') == 'Ticket')
    .filter(F.col('dtg') >= 0)
    .filter(F.col('charge_dt').between('2023-01-01', '2025-10-31'))
    .filter(F.col('bkg_is_ejhsss') == 0)
)

# eliminate late top-ups (on sale for at least 168 days), filter for LY flight dates
filtered_dimensions = (
    dimensions_history
    .filter(F.datediff(F.col('flight_dt'), F.col('onsale_dt')) >= 168)
    .filter(F.col('flight_dt').between('2024-10-01', '2025-09-30'))
)

# Preprocessing - filling missing charge dates

# Join ticket charges to their flight dimensions
dsh = filtered_sales.join(filtered_dimensions, on='flightkey', how='inner')

# Collapse to one row per flight and charge date (daily flight sales)
dshsmooth = (
    dsh
    .groupBy('flightkey', 'charge_dt')
    .agg(
        F.sum('unt_pre').alias('unt_pre'),
        F.first('onsale_dt').alias('onsale_dt'),
        F.first('flight_dt').alias('flight_dt'),
    )
)

# On-sale period per flight: from on-sale date to the earlier of flight date and today
date_range = (
    dshsmooth
    .groupBy('flightkey')
    .agg(
        F.min('onsale_dt').alias('start_date'),
        F.least(F.first('flight_dt'), F.lit(datetime.now().date())).alias('end_date'),
    )
)

# One row per flight per calendar day between start_date and end_date
index = (
    date_range
    .withColumn('charge_dt_ts', F.explode(F.sequence(F.col('start_date'), F.col('end_date'))))
    .withColumn('charge_dt', F.col('charge_dt_ts').cast('date'))
)

# Left-join the daily sales onto the full date grid; days without sales become 0
dshjoin = (
    index
    .join(dshsmooth, on=['flightkey', 'charge_dt'], how='left')
    .drop('start_date', 'end_date', 'onsale_dt', 'flight_dt', 'charge_dt_ts')
    .fillna(0)
)

# Running total of units per flight in charge-date order - currently neglects cancellations
window_spec = Window.partitionBy('flightkey').orderBy(F.col('charge_dt'))
dsh_pax = dshjoin.withColumn('pax_net', F.sum('unt_pre').over(window_spec))

# Preprocessing - creating LF by dtg progress remaining curves for each sector week

# Re-attach flight dimensions to the cumulative daily series.
# NOTE(review): neither input carries a column called 'unt_net', so this drop removes nothing.
final_dsh = dsh_pax.join(filtered_dimensions, on='flightkey', how='left').drop('unt_net')

# Length of the on-sale period and days-to-go for each charge date
curves = (
    final_dsh
    .withColumn('total_booking_days', F.datediff(F.col('flight_dt'), F.col('onsale_dt')))
    .withColumn('dtg', F.datediff(F.col('flight_dt'), F.col('charge_dt')))
)

# dtg progress remaining = dtg as a fraction of the total booking days
normal_curves = curves.withColumn('dtg_pr', F.col('dtg') / F.col('total_booking_days'))

# Integer percentage bucket of dtg_pr (floor of dtg_pr * 100)
normal_curve_buckets = normal_curves.withColumn(
    'dtg_bucket',
    F.floor(F.col('dtg_pr') * 100).cast('int'),
)

# Sum capacity and cumulative pax per region / sector / week-of-year / bucket
aggregated_normal_curves = (
    normal_curve_buckets
    .groupBy('parentregion', 'sector', 'WoY', 'dtg_bucket')
    .agg(
        F.sum('ty_capacity').alias('ty_capacity'),
        F.sum('pax_net').alias('pax_net'),
    )
    .orderBy('dtg_bucket')
)

# Load factor per group, collected to the driver as a pandas DataFrame
df = (
    aggregated_normal_curves
    .withColumn('load_factor', F.col('pax_net') / F.col('ty_capacity'))
    .drop('ty_capacity', 'pax_net')
    .toPandas()
)

# storing original dataframe

# Print a summary of the collected load-factor table, then keep an untouched copy
# so later cells can reset `df` without re-running the Spark pipeline.
df.info()
df_original = df.copy()

In [0]:
# Reset `df` to the saved copy so the cells below can be re-run from a clean state
df = df_original.copy()

In [0]:
# One row per (sector, WoY), one column per dtg bucket; buckets with no data become 0
df_pivot = df.pivot_table(
    index=['sector', 'WoY'],
    columns='dtg_bucket',
    values='load_factor',
    aggfunc='mean',
).fillna(0)

# Keep only rows with a positive load factor in bucket 1
has_bucket_one_lf = df_pivot[1] > 0
df_cluster = df_pivot.loc[has_bucket_one_lf]
df_cluster.info()

In [0]:
# Removing all routes that didn't reach a final LF (bucket 1) above 60%

sold_well = df_cluster[1] > 0.6
df_sold = df_cluster.loc[sold_well]
df_sold.info()

In [0]:
# Expressing Load Factor as a fraction of the final LF: every row is divided by its
# own bucket-1 value, so column 1 becomes 1.0 for all rows.
# NOTE(review): the row-wise min-max rescale applied in the next cell is unchanged by
# this positive per-row scaling, so the curves fed to KMeans would be the same without it.
df_sold = df_sold.div(df_sold[1], axis=0)

In [0]:
# Row-wise min-max rescale: each curve's lowest bucket becomes 0 and its highest becomes 1

row_min = df_sold.min(axis='columns')
row_max = df_sold.max(axis='columns')

df_sold = df_sold.sub(row_min, axis='index').div(row_max - row_min, axis='index')
df_sold.head()

In [0]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow scan over k = 2..10 on the normalised booking curves.
# Fit on the curve columns only: if this cell is re-run after the labelling cell,
# df_sold also holds 'cluster_label', which must not be used as a feature.
curve_features = df_sold.drop(columns='cluster_label', errors='ignore')

k_range = range(2, 11)
wcss = []
for i in k_range:
    # Fixed seed so the inertia curve is identical on every run
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(curve_features)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, wcss, marker='o', linestyle='--')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.xticks(k_range)
plt.grid(True)
plt.show()

In [0]:
# Keep a copy of the normalised curves before a cluster label column is added to df_sold
df_sold_original = df_sold.copy()

In [0]:
# Reset df_sold to the saved, unlabelled curves so the clustering cell can be re-run
df_sold = df_sold_original.copy()

In [0]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Cluster the normalised booking curves into 5 shapes.
# A fixed seed keeps the cluster numbering stable between runs, and the label column
# is excluded from the features so re-running this cell does not cluster on old labels.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
curve_features = df_sold.drop(columns='cluster_label', errors='ignore')
kmeans.fit(curve_features)
df_sold['cluster_label'] = kmeans.labels_

# Mean curve per cluster: one row per cluster, one column per dtg bucket
curves_clusters = df_sold.groupby('cluster_label').mean()

curves_clusters.T.plot(figsize=(20, 8))

plt.xlabel('% DTG remaining')
plt.ylabel('LF / LF_final')
plt.title('Isolated booking curve shape clustering "Banana split"')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend(title=' # Cluster')
plt.show()

In [0]:
# Number of (sector, WoY) rows assigned to each booking-curve cluster
cluster_counts = df_sold['cluster_label'].value_counts()
cluster_counts

In [0]:
df_sold.head()

# Part 2) Customer Segmentation

In [0]:
# Reservation-level attribute columns (all columns of the table are kept)
reservation_attributes = spark.read.table('data_experience_commercial.cbt_1470_reservationattributes.output')

# Attach flight dimensions via the joined ticket-sales table.
# NOTE(review): dsh is at charge-line level, so a reservation contributes one row per
# matching charge line to the averages below - confirm this weighting is intended.
reservation_attributes_dimesnions = reservation_attributes.join(dsh, on='reservationid', how='inner')

# Attribute columns averaged per parent region / sector / week-of-year.
# Building the aliases from one list keeps every output name as 'avg_<column>'
# (the hand-written version had 'avg_has_sportequipmentlarge', missing an 's').
attribute_flags = [
    'is_active',
    'has_flown',
    'is_oneway',
    'is_web',
    'is_mobileapp',
    'is_staffbooking',
    'has_adult',
    'has_child',
    'has_infant',
    'has_ejplus',
    'has_flexi',
    'has_ejplusdiscount',
    'has_allocatedseat',
    'has_standardseat',
    'has_premiumseat',
    'has_largecabinbag',
    'has_holdbag',
    'has_holdbaglug',
    'has_holdbaglus',
    'has_sportsequipment',
    'has_sportsequipmentlarge',
    'has_sportsequipmentsmall',
    'has_inflightvoucher',
    'has_carrental',
    'has_travelinsurance',
]
sector_week_attributes = (
    reservation_attributes_dimesnions
    .groupBy('parentregion', 'sector', 'WoY')
    .agg(*[F.avg(flag).alias(f'avg_{flag}') for flag in attribute_flags])
)

# Collect to pandas and keep an untouched copy for re-runs
df2 = sector_week_attributes.toPandas()
df2_original = df2.copy()

df2.info()

In [0]:
# Reset df2 to the saved copy so the indexing / PCA cells below can be re-run
df2 = df2_original.copy()

In [0]:
# Move the grouping keys into the index so only the averaged attributes remain as columns
df2 = df2.set_index(['parentregion', 'sector', 'WoY'])
df2.head()

In [0]:
# Disabled step: per-column MinMax scaling of df2. The code sits inside a string
# literal, so it is not executed and the PCA below runs on the unscaled averages.
'''from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for col in df2.columns:
    df2[col] = scaler.fit_transform(df2[col].values.reshape(-1, 1))
df2.head()'''

In [0]:
from sklearn.decomposition import PCA
import numpy as np

# Exploratory 8-component PCA on the attribute averages
pca = PCA(n_components=8)
principal_components = pca.fit_transform(df2)

# One explained-variance ratio per requested component, and their running total
var = pca.explained_variance_ratio_
cumvar = np.cumsum(var)
component_ids = range(1, 9)

# Scree plot: individual ratios as bars, cumulative ratio as a step line
plt.figure(figsize=(10, 6))
plt.bar(component_ids, var, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_ids, cumvar, where='mid', label='Cumulative explained variance')
plt.xticks(component_ids)
plt.xlabel('Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance by Principal Components')
plt.legend()
plt.show()

In [0]:
import pandas as pd

# Final 5-component projection, kept on the same (parentregion, sector, WoY) index as df2
final_pca = PCA(n_components=5)
final_principal_components = final_pca.fit_transform(df2)
pc_names = [f'PC{number}' for number in range(1, 6)]
df_pca = pd.DataFrame(final_principal_components, index=df2.index, columns=pc_names)
df_pca.head()

In [0]:
import seaborn as sns
from adjustText import adjust_text

features = df2.columns

# final_pca holds 5 components; the biplot only uses the first two, so slice them
# before naming the columns (passing all 5 with two names raises a shape ValueError).
loadings = pd.DataFrame(final_pca.components_[:2].T, columns=['PC1', 'PC2'], index=features)

sns.set_style('whitegrid')
plt.figure(figsize=(20, 10))
plt.scatter(final_principal_components[:, 0], final_principal_components[:, 1], alpha=0.3, c='lightgrey', edgecolors='none', label='Flights')

scale_factor = 2.5  # stretches the loading arrows so they are visible against the scores
texts = []

# Length of each feature's loading vector in the PC1/PC2 plane
loadings['Vector_Length'] = np.sqrt(loadings['PC1']**2 + loadings['PC2']**2)
top_features = loadings.nlargest(10, 'Vector_Length')

# Draw only the 10 strongest features, matching the "Top Drivers" title
for feature_name, row in top_features.iterrows():
    arrow_x = row['PC1'] * scale_factor
    arrow_y = row['PC2'] * scale_factor
    plt.arrow(0, 0, arrow_x, arrow_y, color='#d62728', alpha=0.8, head_width=0.05, width=0.005)
    # Place the label slightly beyond the arrow head
    texts.append(plt.text(arrow_x * 1.1, arrow_y * 1.1, feature_name, color='darkred', fontsize=11, weight='bold'))

# Nudge overlapping labels apart and connect them to their anchor points
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='grey', alpha=0.5))

plt.xlabel('PC1 (Main Variant)', fontsize=12)
plt.ylabel('PC2 (Secondary Variant)', fontsize=12)
plt.title('Top Drivers of Flight Segments (PCA Biplot)', fontsize=15, pad=20)
sns.despine() # Removes the top and right box lines

plt.show()

In [0]:
# Annotated heatmap of the `loadings` table built in the biplot cell.
# NOTE(review): by this point `loadings` also carries the derived 'Vector_Length'
# column, so that column is drawn next to the loadings themselves.
plt.figure(figsize=(10, 18))
sns.heatmap(loadings, annot=True, cmap='viridis', center=0)
plt.title('PCA Loadings Heatmap')
plt.show()

In [0]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow scan over k = 1..10 on the principal-component scores.
# Fit on the PC columns only: if this cell is re-run after the labelling cell,
# df_pca also holds 'cluster_label2', which must not be used as a feature.
pc_features = df_pca.drop(columns='cluster_label2', errors='ignore')

k_range = range(1, 11)
wcss = []
for i in k_range:
    # Fixed seed so the inertia curve is identical on every run
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(pc_features)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(k_range, wcss, marker='o', linestyle='--')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Inertia)')
plt.xticks(k_range)
plt.grid(True)
plt.show()

In [0]:
# Cluster the principal-component scores into 4 customer-mix segments.
# A fixed seed keeps the segment numbering stable between runs, and any existing label
# column is excluded from the features so a re-run does not cluster on old labels.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
pc_features = df_pca.drop(columns='cluster_label2', errors='ignore')
kmeans.fit(pc_features)
df_pca['cluster_label2'] = kmeans.labels_
df_pca.head()

In [0]:
import pandas as pd

# Keep only the segment label (as a one-column frame); the PC scores are dropped
df_pca = df_pca['cluster_label2'].to_frame()
df_pca.head()

In [0]:
# Attach the averaged attributes to each segment label.
# df_pca and df2 share the same (parentregion, sector, WoY) index, so join index-to-index;
# passing on=['sector', 'WoY'] fails because the right-hand index has three levels.
df_heatmap = df_pca.join(df2, how='left')
df_heatmap.head()

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Relative deviation of each segment's mean from the overall mean, per attribute.
# The overall mean is taken over the attribute columns only: including the label column
# would add an all-NaN 'cluster_label2' column to the result (the original tried to
# remove it with a drop() whose result was never assigned).
feature_values = df_heatmap.drop(columns='cluster_label2')
global_mean = feature_values.mean()
cluster_means = df_heatmap.groupby('cluster_label2').mean()
# 1e-5 guards against division by zero for attributes whose overall mean is 0
relative_imp = (cluster_means - global_mean) / (global_mean + 1e-5)

plt.figure(figsize=(24, 8))

# Colour scale is clipped to +/-100% so a few extreme cells do not wash out the rest
sns.heatmap(data=relative_imp, annot=True, fmt='.0%', cmap='viridis', center=0, vmin=-1.0, vmax=1.0, linewidths=0.5, linecolor='white', cbar_kws={'label': 'Relative Importance Scale'}, annot_kws={"size": 11, "weight": "bold"})

plt.title('Feature Importance Score Card\n(How much each cluster differs from the Global Average)', fontsize=18, pad=20)
plt.ylabel('Cluster Label', fontsize=14, weight='bold')
plt.xlabel('Booking Feature', fontsize=14, weight='bold')
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

sns.despine(top=True, right=True, left=True, bottom=True)

plt.tight_layout()
plt.show()

In [0]:
df_sold.head()

In [0]:
# Attach the customer-mix segment to each booking curve.
# df_sold is indexed by (sector, WoY) while df_pca is indexed by (parentregion, sector, WoY);
# joining with on=['sector', 'WoY'] against a three-level index raises a ValueError, so the
# extra level is dropped first and the frames are joined index-to-index.
# NOTE(review): assumes each (sector, WoY) belongs to a single parentregion; otherwise
# the join duplicates curve rows - confirm.
segment_labels = df_pca.droplevel('parentregion')
df_combined = df_sold.join(segment_labels, how='left')
df_combined.head()

In [0]:
# Remove the booking-curve cluster label; only the customer-mix label is needed below
df_combined = df_combined.drop(columns=['cluster_label'])
df_combined.head()

In [0]:
# Mean normalised booking curve for each customer-mix segment
curves_clusters2 = df_combined.groupby('cluster_label2').mean()

# One line per segment across the dtg buckets
curves_clusters2.transpose().plot(figsize=(10, 4))

plt.title('Isolated booking curve shape clustering')
plt.xlabel('% DTG remaining')
plt.ylabel('LF / LF_final')
plt.legend(title=' # Cluster')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [0]:
# Number of booking curves per customer-mix segment.
# Counted on df_combined, the frame that carries 'cluster_label2'; df_final is only
# created in Part 3 and has no such column.
cluster_counts2 = df_combined['cluster_label2'].value_counts()
cluster_counts2

# Part 3) Coming together: Hierarchical Booking Curve - Customer Mix Clusters

In [0]:
# Start Part 3 from the untouched load-factor table saved at the end of the Part 1 pipeline
df_final = df_original.copy()

In [0]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def cluster_parent_region(df, region, n_clusters=5, min_final_lf=0.6, random_state=None):
    """Cluster the normalised booking curves of one parent region.

    Args:
        df: Long-format frame with columns 'parentregion', 'sector', 'WoY',
            'dtg_bucket' and 'load_factor'.
        region: Value of 'parentregion' to keep.
        n_clusters: Number of KMeans clusters (default 5).
        min_final_lf: Curves whose bucket-1 load factor is not above this value
            are discarded (default 0.6).
        random_state: Seed forwarded to KMeans; None keeps a fresh random
            initialisation on every call.

    Returns:
        Frame indexed by (parentregion, sector, WoY) with one column per dtg
        bucket holding the row-wise min-max scaled curve, plus 'cluster_label'.

    Raises:
        ValueError: If the region has no rows, or fewer curves survive the
            load-factor filter than the requested number of clusters.
    """
    region_rows = df[df['parentregion'] == region]
    if region_rows.empty:
        raise ValueError(f"no rows found for parentregion {region!r}")

    # One row per sector-week, one column per dtg bucket; missing buckets become 0
    df_pivot = region_rows.pivot_table(
        index=['parentregion', 'sector', 'WoY'],
        columns='dtg_bucket',
        values='load_factor',
        aggfunc='mean',
    ).fillna(0)

    df_sold = df_pivot[df_pivot[1] > min_final_lf]
    if len(df_sold) < n_clusters:
        raise ValueError(
            f"only {len(df_sold)} curve(s) for parentregion {region!r} exceed a final "
            f"load factor of {min_final_lf}; need at least {n_clusters} to cluster"
        )

    # Express each curve relative to its bucket-1 value, then min-max scale it per row
    df_sold = df_sold.div(df_sold[1], axis=0)
    row_min = df_sold.min(axis=1)
    row_max = df_sold.max(axis=1)
    df_sold = df_sold.subtract(row_min, axis=0).divide(row_max - row_min, axis=0)

    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, random_state=random_state)
    kmeans.fit(df_sold)

    df_sold['cluster_label'] = kmeans.labels_

    return df_sold

In [0]:
# Run the booking-curve clustering separately for the 'UK' and 'Europe' parent regions
df_final_UK = cluster_parent_region(df_final, 'UK')
df_final_EU = cluster_parent_region(df_final, 'Europe')

In [0]:
df_final_UK.head()

In [0]:
df_final_UK.info()

In [0]:
df_final_EU.head()

In [0]:
df_final_EU.info()

In [0]:
def banana_split_plot(df, region):
    """Plot the mean curve of every 'cluster_label' group in *df*.

    *region* is only used in the figure title. Returns the result of plt.show().
    """
    # One row per cluster, one column per remaining column of df
    mean_curves = df.groupby('cluster_label').mean()

    # Transpose so each cluster is drawn as its own line
    mean_curves.transpose().plot(figsize=(20, 8))

    plt.title(f'{region} booking curve shape clustering "Banana split"')
    plt.xlabel('% DTG remaining')
    plt.ylabel('LF / LF_final')
    plt.legend(title=' # Cluster')
    plt.grid(True, linestyle='--', alpha=0.6)
    return plt.show()

In [0]:
banana_split_plot(df_final_UK, 'UK')

In [0]:
banana_split_plot(df_final_EU, 'Europe')

In [0]:
def customer_segmentation(df, df2, region):

    final_pca = PCA(n_components=5)
    final_principal_components = final_pca.fit_transform(df2)
    df_pca = pd.DataFrame(data=final_principal_components, index=df2.index, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
    df_combined = pd.DataFrame(df['cluster_label']).join(df_pca, on=['parentregion','sector', 'WoY'], how='left')
    
    for i in range (0,5):
        cluster_df = df_combined[df_combined['cluster_label'] == i]
        cluster_df = cluster_df.drop(['cluster_label'], axis=1)
        kmeans = KMeans(n_clusters = 4, init = 'k-means++', n_init=10)
        kmeans.fit(cluster_df)
        df_pca['cluster_label2'] = kmeans.labels_
        df_pca.head()
