In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the pre-cleaned dataset from the sibling data directory (path is
# relative to the notebook's working directory) and show it as the cell output.
csv_path = r"../data/cleaned_data.csv"
data = pd.read_csv(csv_path)
data

In [None]:
# Columns fed to the clustering model.
#
# Earlier, smaller feature set kept for reference (not used):
# features = ["DayOfWeekEncoded", "DepHour", "CRSArrHour", "Distance", "TaxiOut", "DepHour_DayOfWeek", "DepHour_UniqueCarrier", "CRSArrHour_DayOfWeek", "Origin_Dep_Count", "Dest_Arr_Count", "Month"]
numeric_features = [
    "DayOfWeekEncoded",
    "Month",
    "DepTimeMinutes",
    "DepHour",
    "CRSDepTimeMinutes",
    "CRSDepHour",
    "ArrTimeMinutes",
    "ArrHour",
    "CRSArrTimeMinutes",
    "CRSArrHour",
    "Distance",
    "ActualElapsedTime",
    "CRSElapsedTime",
    "AirTime",
    "TaxiOut",
    "TaxiIn",
    "Origin_Dep_Count",
    "Dest_Arr_Count",
]

# Every column whose name starts with "UniqueCarrier_" is appended after the
# fixed list, in the order it appears in `data`.
# NOTE(review): these look like one-hot carrier indicators — confirm against
# the cleaning step that produced the CSV.
carrier_features = [
    column for column in data.columns if column.startswith("UniqueCarrier_")
]

features = numeric_features + carrier_features

In [None]:
# Standardise the selected columns before clustering. K-Means is
# distance-based, so unscaled columns with large ranges would dominate.
# `scaler` is kept so cluster centres can later be mapped back to original units.
feature_matrix = data[features]
scaler = StandardScaler()
scaler.fit(feature_matrix)
X_scaled = scaler.transform(feature_matrix)

# Model selection scan over k = 1..10.
#
# Each candidate k is fitted exactly once and both diagnostics are taken from
# that single fit:
#   * wcss[k-1]              -> within-cluster sum of squares (KMeans.inertia_)
#   * silhouette_scores[k-2] -> mean silhouette coefficient (k >= 2 only)
#
# Fitting once per k instead of once per metric halves the K-Means work for
# k = 2..10; with the same data and the same random_state the second fit would
# reproduce the first anyway. As a result the WCSS and silhouette lines are
# printed interleaved per k.
#
# `silhouette_score` is imported in the notebook's import cell.
# NOTE(review): silhouette_score on the full matrix computes pairwise
# distances and can be slow on large data; its `sample_size` argument is an
# option if this cell becomes a bottleneck (it would change the reported values).
wcss = []
silhouette_scores = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, random_state=777)
    cluster_labels = kmeans.fit_predict(X_scaled)

    wcss.append(kmeans.inertia_)
    print(f"WCSS for {i} clusters: ", kmeans.inertia_)

    # The silhouette coefficient needs at least two distinct labels, so the
    # k = 1 fit only contributes to the elbow curve.
    if i < 2:
        continue

    silhouette_avg = silhouette_score(X_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"Silhouette Score for {i} clusters: ", silhouette_avg)

# Elbow plot: WCSS against the number of clusters (k = 1..10). The figure is
# written to disk before being shown.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, 11), wcss, marker="o")
ax.set(title="Elbow Method", xlabel="Number of Clusters", ylabel="WCSS")
fig.savefig(r"../figures/_kmeans_wcss.png")
plt.show()

# Silhouette plot: mean silhouette score against the number of clusters.
# The x-axis starts at 2 because no score is recorded for a single cluster.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(2, 11), silhouette_scores, marker="o")
ax.set(
    title="Silhouette Scores",
    xlabel="Number of Clusters",
    ylabel="Silhouette Score",
)
fig.savefig(r"../figures/_kmeans_silhouette_score.png")
plt.show()

In [None]:
# Final model: refit K-Means on the scaled features with the chosen k.
# NOTE(review): k = 4 is presumably read off the elbow/silhouette plots — the
# notebook does not record the reasoning; confirm and document it.
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=777).fit(X_scaled)

# One label per row of X_scaled, in the same row order.
cluster_labels = kmeans.labels_

In [None]:
# Attach the K-Means label of every row to the DataFrame so the plotting cells
# below can colour points by cluster. `cluster_labels` was computed from
# `data[features]`, so it lines up with `data` row for row.
# This mutates `data` in place; re-running the cell overwrites the column.
data["Cluster"] = cluster_labels

In [None]:
# Map the fitted centroids from standardised space back to the original
# feature units so they can be read directly, one row per cluster and one
# column per feature.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_centers_df = pd.DataFrame(cluster_centers, columns=features)

print("Cluster Centers:")
# Show the table transposed (features as rows, clusters as columns) and as the
# cell's last expression: print() on a frame this wide elides the middle
# columns and bypasses the notebook's rich table rendering.
cluster_centers_df.T

In [None]:
# Scatter of DepTimeMinutes against ArrTimeMinutes, coloured by the assigned
# cluster. The figure is saved before being shown.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=data,
    x="DepTimeMinutes",
    y="ArrTimeMinutes",
    hue="Cluster",
    palette="viridis",
    ax=ax,
)
ax.set(
    title="Time of Departure vs Time of Arrival (in Minutes since Midnight)",
    xlabel="DepTimeMinutes",
    ylabel="ArrTimeMinutes",
)
ax.legend(title="Cluster")
fig.savefig(r"../figures/_cluster_deptimeminutes_arrtimeminutes.png")
plt.show()

In [None]:
# Scatter of Distance against AirTime, coloured by the assigned cluster.
# The figure is saved before being shown.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=data,
    x="Distance",
    y="AirTime",
    hue="Cluster",
    palette="viridis",
    ax=ax,
)
ax.set(title="Distance vs AirTime", xlabel="Distance", ylabel="AirTime")
ax.legend(title="Cluster")
fig.savefig(r"../figures/_cluster_distance_airtime.png")
plt.show()

In [None]:
# Scatter of Distance against ArrTimeMinutes, coloured by the assigned
# cluster. The figure is saved before being shown.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=data,
    x="Distance",
    y="ArrTimeMinutes",
    hue="Cluster",
    palette="viridis",
    ax=ax,
)
ax.set(
    title="Distance vs Time of Arrival (in Minutes since Midnight)",
    xlabel="Distance",
    ylabel="ArrTimeMinutes",
)
ax.legend(title="Cluster")
fig.savefig(r"../figures/_cluster_distance_arrtimeminutes.png")
plt.show()

In [None]:
# Scatter of Distance against DepTimeMinutes, coloured by the assigned
# cluster. The figure is saved before being shown.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=data,
    x="Distance",
    y="DepTimeMinutes",
    hue="Cluster",
    palette="viridis",
    ax=ax,
)
ax.set(
    title="Distance vs Time of Departure (in Minutes since Midnight)",
    xlabel="Distance",
    ylabel="DepTimeMinutes",
)
ax.legend(title="Cluster")
fig.savefig(r"../figures/_cluster_distance_deptimeminutes.png")
plt.show()