In [3]:
# main_notebook.ipynb

# ==========================
# 1. Setup & Imports
# ==========================
import pandas as pd

from src.setup import get_base
from src.feature_engineering import engineer_features
from src.analysis import run_pca, run_kmeans, plot_pca_clusters

# ==========================
# 2. Daten laden
# ==========================
df = get_base()

print("Rohdaten:")
display(df.head())

# ==========================
# 3. Feature Engineering
# ==========================
df_features = engineer_features(df)

print("Feature-Daten:")
display(df_features.head())

# ==========================
# 4. PCA
# ==========================
pca_df, pca, explained = run_pca(df_features, n_components=2)
print("Varianzanteile der PCA:", explained)

# ==========================
# 5. Clustering
# ==========================
df_clusters, kmeans, sil = run_kmeans(df_features, n_clusters=4)
print("Silhouette Score:", sil)

# Cluster-Ergebnis ansehen
display(df_clusters[["cluster"]].value_counts())

# ==========================
# 6. Visualisierung
# =========================

Rohdaten:


Unnamed: 0,session_id,user_id,trip_id,session_start,session_end,flight_discount,hotel_discount,flight_discount_amount,hotel_discount_amount,flight_booked,...,trip_airline,destination_airport_lat,destination_airport_lon,base_fare_usd,hotel_name,nights,rooms,check_in_time,check_out_time,hotel_price_per_room_night_usd
0,101486-6d053e0f51884dddb339416c86d5b3a9,101486,,2023-07-18 09:35:00,2023-07-18 09:35:22,False,False,,,False,...,,,,,,,,NaT,NaT,
1,101486-6de2ac91a8c24619a69a3d3ae7c28d61,101486,,2023-03-21 10:16:00,2023-03-21 10:18:38,False,False,,,False,...,,,,,,,,NaT,NaT,
2,101486-7e8b450e5fe94345bd6c1ae9f479f073,101486,,2023-02-11 17:05:00,2023-02-11 17:07:58,False,False,,,False,...,,,,,,,,NaT,NaT,
3,101486-9e7d5ccc7e034bf7b7a5ad2f3befbe13,101486,,2023-05-30 20:42:00,2023-05-30 20:44:15,True,False,0.1,,False,...,,,,,,,,NaT,NaT,
4,101486-be6d02b11839441aa4107044d095cb99,101486,,2023-03-06 21:50:00,2023-03-06 21:50:53,False,False,,,False,...,,,,,,,,NaT,NaT,


Feature-Daten:


Unnamed: 0_level_0,avg_flight_discount,total_flight_discount,avg_hotel_discount,total_hotel_discount,total_sessions,avg_session_duration,total_session_duration,total_flights_booked,total_hotels_booked,avg_flight_fare_usd,total_flight_fare_usd,n_unique_destinations
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
23557,0.0,0,0.25,2,8,1.277083,10.216667,0,2,0.0,0.0,0
94883,0.0,0,0.125,1,8,1.129167,9.033333,2,2,432.045,864.09,2
101486,0.25,2,0.0,0,8,2.0375,16.3,1,2,189.91,189.91,1
101961,0.25,2,0.125,1,8,1.9625,15.7,5,5,248.532,1242.66,5
106907,0.125,1,0.125,1,8,12.648584,101.188675,2,2,13902.06,27804.12,1


Varianzanteile der PCA: [0.31142207 0.19772711]
Silhouette Score: 0.20046087097374704


cluster
3          2113
0          1882
2          1521
1           482
Name: count, dtype: int64

In [None]:
from src.analysis import find_optimal_k

# ==========================
# 5. Optimale Clusterzahl finden
# ==========================
results = find_optimal_k(df_features, k_min=2, k_max=10)

print("Silhouette Scores für verschiedene k:")
for k, sil in results:
    print(f"k={k}: Silhouette={sil:.3f}")

# Bestes k finden
best_k, best_sil = max(results, key=lambda x: x[1])
print(f"\nOptimale Clusterzahl: k={best_k} mit Silhouette={best_sil:.3f}")

# ==========================
# 6. Clustering mit best_k
# ==========================
df_clusters, kmeans, sil = run_kmeans(df_features, n_clusters=best_k)
plot_pca_clusters(pca_df, df_clusters["cluster"])

ModuleNotFoundError: No module named 'analysis'