In [1]:
"""
Main Experiment Notebook for Expedia Hotel Booking Prediction

Assignment 2: Data Mining Techniques, Vrije Universiteit Amsterdam
"""

import pandas as pd
import numpy as np
import lightgbm as lgb # Keep for type hints if needed, but direct use will be less
from sklearn.model_selection import GroupKFold
import os

# Import the modularized model functions
import lightgbm_ranker_model as lgbm_model
import warnings # For managing warnings from Optuna/LightGBM if needed

# --- Configuration ---

DATA_DIR = '../data.nosync'
TRAIN_FILE = os.path.join(DATA_DIR, 'train_imputed.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test.csv') # Defined TEST_FILE path
SUBMISSION_FILENAME = 'submission_modular.csv' # Defined submission filename

SAMPLE_FRACTION = 0.1 # Use 10% of the data for faster runs during development
N_FOLDS_CV = 5         # Number of folds for general cross-validation
N_FOLDS_TUNING = 3     # Number of folds for Optuna trials (can be smaller for speed)
N_OPTUNA_TRIALS = 20   # Number of Optuna trials
RANDOM_STATE = 42

# --- 1. Load Data ---
print("Loading training data...")
try:
    df_train_full = pd.read_csv(TRAIN_FILE)
except FileNotFoundError:
    print(f"ERROR: Training file not found at {TRAIN_FILE}")
    df_train_full = None
except Exception as e:
    print(f"Error loading training data: {e}")
    df_train_full = None

# --- 2. Create Relevance Score ---
if df_train_full is not None:
    print("\nCreating relevance score...")
    df_train_full['relevance'] = 0
    df_train_full.loc[df_train_full['click_bool'] == 1, 'relevance'] = 1
    df_train_full.loc[df_train_full['booking_bool'] == 1, 'relevance'] = 2 # Map to 0, 1, 2 for label_gain [0,1,5]
    print("Relevance score distribution:")
    print(df_train_full['relevance'].value_counts())
else:
    print("Skipping relevance score creation as df_train_full is None.")

# --- 3. Data Sampling (Group-aware) ---
df_sample = None
if df_train_full is not None:
    print(f"\nSampling {SAMPLE_FRACTION*100}% of the data based on srch_id...")
    unique_srch_ids = df_train_full['srch_id'].unique()
    if len(unique_srch_ids) > 0:
        sampled_srch_ids_count = int(len(unique_srch_ids) * SAMPLE_FRACTION)
        if sampled_srch_ids_count > 0:
            sampled_srch_ids = np.random.choice(unique_srch_ids, size=sampled_srch_ids_count, replace=False)
            df_sample = df_train_full[df_train_full['srch_id'].isin(sampled_srch_ids)].copy()
            print(f"Sampled data shape: {df_sample.shape}")
        else:
            print("Sample fraction resulted in zero search IDs. Check SAMPLE_FRACTION or dataset size.")
            df_sample = df_train_full.copy() # Fallback to full if sample is too small
            print(f"Using full dataset instead. Shape: {df_sample.shape}")
    else:
        print("No unique search IDs found in the training data.")
    del df_train_full # Optional: free up memory
else:
    print("Skipping sampling as df_train_full is None.")

  from .autonotebook import tqdm as notebook_tqdm


Loading training data...

Creating relevance score...
Relevance score distribution:
relevance
0    4736468
2     138390
1      83489
Name: count, dtype: int64

Sampling 10.0% of the data based on srch_id...
Sampled data shape: (496228, 55)


In [2]:
df_sample

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance
119,12,2013-03-15 14:55:16,5,219,,,158,7814,3,3.5,...,,,,,,,0,,0,0
120,12,2013-03-15 14:55:16,5,219,,,158,10881,3,3.0,...,,,,,,,0,,0,0
121,12,2013-03-15 14:55:16,5,219,,,158,12510,4,0.0,...,,,,,,,0,,0,0
122,12,2013-03-15 14:55:16,5,219,,,158,17122,4,3.5,...,,,,,,,0,,0,0
123,12,2013-03-15 14:55:16,5,219,,,158,18012,5,4.5,...,,,,,,,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958140,332763,2012-12-04 18:55:04,32,220,,,81,30933,3,2.5,...,,,,,,,0,,0,0
4958141,332763,2012-12-04 18:55:04,32,220,,,81,46986,3,3.0,...,,,,,,,1,156.75,1,2
4958142,332763,2012-12-04 18:55:04,32,220,,,81,62314,4,3.0,...,,,,,,,0,,0,0
4958143,332763,2012-12-04 18:55:04,32,220,,,81,91660,3,0.0,...,,,,,,,0,,0,0


In [3]:
# --- 4. Initial Feature Selection & Preparation ---
X = None
y = None
groups_for_splitting = None # This will be df_sample['srch_id'] for GroupKFold.split
feature_columns = [] # Initialize

if df_sample is not None:
    print("\nDefining initial feature set and preparing X, y, groups...")
    feature_columns = [
        'visitor_location_country_id', 'prop_country_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
        'price_usd', 'promotion_flag', 'orig_destination_distance'
        # Add more features after EDA and proper missing value handling from EDA notebook
    ]

    # Ensure all selected feature columns exist in df_sample
    existing_feature_columns = [col for col in feature_columns if col in df_sample.columns]
    if len(existing_feature_columns) != len(feature_columns):
        print(f"Warning: Some feature columns not found. Using: {existing_feature_columns}")
    feature_columns = existing_feature_columns

    if not feature_columns:
        print("Error: No feature columns selected or available. Stopping.")
    else:
        X = df_sample[feature_columns].copy()
        y = df_sample['relevance'].copy()
        groups_for_splitting = df_sample['srch_id'] # Used by GroupKFold for splitting

        # Basic Imputation (should ideally be done based on EDA insights and training set stats)
        # This imputation is done on the *sampled* data (X).
        # For test set imputation later, medians from this X will be used.
        print("Performing basic median imputation for numerical features in X...")
        for col in X.columns:
            if X[col].isnull().any():
                if pd.api.types.is_numeric_dtype(X[col]):
                    median_val = X[col].median()
                    X[col].fillna(median_val, inplace=True)
                    # print(f"Imputed NaNs in '{col}' with median: {median_val}")
                # Add mode imputation for categorical if any, or use a placeholder string
        
        print(f"Selected {len(feature_columns)} features: {feature_columns}")
        print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
        print(f"NaNs remaining in X after imputation: {X.isnull().sum().sum()}")
        if groups_for_splitting is not None:
            print(f"Number of unique groups for splitting: {groups_for_splitting.nunique()}")
else:
    print("Skipping feature selection as df_sample is None.")

# Display X's head to verify
if X is not None:
    display(X.head())


Defining initial feature set and preparing X, y, groups...
Performing basic median imputation for numerical features in X...
Selected 11 features: ['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'orig_destination_distance']
Shape of X: (496228, 11), Shape of y: (496228,)
NaNs remaining in X after imputation: 0
Number of unique groups for splitting: 19979


Unnamed: 0,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,orig_destination_distance
119,219,158,3,3.5,0,0.0,0.0476,4.46,43.0,1,9270.55
120,219,158,3,3.0,0,3.14,0.2993,4.46,44.0,0,9285.88
121,219,158,4,0.0,0,1.1,0.127639,4.41,51.0,0,9288.15
122,219,158,4,3.5,0,2.64,0.127639,4.22,53.0,0,9278.6
123,219,158,5,4.5,0,3.22,0.1909,5.66,221.0,0,9286.02


In [5]:
# --- 5. Cross-Validation (using modular function) ---
import warnings
warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*')
mean_cv_ndcg = 0
std_cv_ndcg = 0
cv_feature_importances = pd.Series(dtype=float) # Initialize as an empty Series

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None:
    # Basic LGBM params for initial CV
    # These will be merged with/override defaults in the perform_cross_validation function
    initial_lgbm_params = {
        'n_estimators': 100, # Example: function's default might be different
        'learning_rate': 0.1, # Example
        'random_state': RANDOM_STATE
        # The modular function defines other necessary defaults like objective, metric, label_gain, eval_at etc.
    }
    
    print(f"\n--- Performing {N_FOLDS_CV}-Fold Cross-Validation using modular function ---")
    mean_cv_ndcg, std_cv_ndcg, cv_feature_importances = lgbm_model.perform_cross_validation(
        X, y, 
        groups_for_splitting=groups_for_splitting, # This is df_sample['srch_id']
        df_full_for_group_counts=df_sample, # Pass df_sample, as it contains 'srch_id' needed for group counts
        n_folds=N_FOLDS_CV,
        lgbm_params=initial_lgbm_params
    )
    
    print(f"\nCross-Validation Mean NDCG@5: {mean_cv_ndcg:.4f} +/- {std_cv_ndcg:.4f}")
    if not cv_feature_importances.empty:
        print("\nAverage Feature Importances from CV:")
        with pd.option_context('display.max_rows', 30): # Display top 20 or all if less than 20
            display(cv_feature_importances.head(20))
else:
    print("\nSkipping Cross-Validation due to missing X, y, groups_for_splitting, or df_sample.")


--- Performing 5-Fold Cross-Validation using modular function ---
\n--- Performing 5-Fold Cross-Validation ---
--- Fold 1/5 ---
Fold 1 NDCG@5: 0.3428
--- Fold 2/5 ---
Fold 2 NDCG@5: 0.3423
--- Fold 3/5 ---
Fold 3 NDCG@5: 0.3455
--- Fold 4/5 ---
Fold 4 NDCG@5: 0.3530
--- Fold 5/5 ---
Fold 5 NDCG@5: 0.3536
Mean NDCG@5 across 5 folds: 0.3475 +/- 0.0049

Cross-Validation Mean NDCG@5: 0.3475 +/- 0.0049

Average Feature Importances from CV:


feature
prop_location_score2           15831.198095
price_usd                      11461.798124
prop_location_score1            7153.288665
prop_starrating                 5405.245977
prop_log_historical_price       5379.496554
prop_review_score               2834.064519
promotion_flag                  2247.297013
orig_destination_distance       1730.336422
prop_country_id                 1018.822620
visitor_location_country_id      507.976540
prop_brand_bool                  428.788319
Name: importance, dtype: float64

In [6]:
# --- 6. Hyperparameter Tuning with Optuna (using modular function) ---
best_params_from_tuning = {} # Initialize

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None:
    print(f"\n--- Tuning Hyperparameters with Optuna ({N_OPTUNA_TRIALS} trials, {N_FOLDS_TUNING} CV folds each) using modular function ---")
    
    # Suppress Optuna's verbosity if it's too much, and LightGBM warnings during tuning.
    # import optuna # Optuna is imported within lgbm_model.py where tune_hyperparameters_optuna is defined.
    # optuna.logging.set_verbosity(optuna.logging.WARNING) # You can set this in lgbm_model.py if desired globally for the function
    
    # It's good practice to manage warnings that might clutter the output during tuning.
    # The lgbm_model.py file could also handle these internally if preferred.
    warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*') # Suppress LightGBM's specific warning
    warnings.filterwarnings('ignore', message='Overriding the init_model argument.*') # Another potential LightGBM warning

    best_params_from_tuning = lgbm_model.tune_hyperparameters_optuna(
        X, 
        y, 
        groups_for_splitting=groups_for_splitting, # This is df_sample['srch_id']
        df_full_for_group_counts=df_sample, # df_sample for calculating group sizes within folds
        n_trials=N_OPTUNA_TRIALS, 
        n_cv_folds=N_FOLDS_TUNING
    )
    
    if best_params_from_tuning:
        print("\nBest parameters found by Optuna:")
        for key, value in best_params_from_tuning.items():
            print(f"    {key}: {value}")
    else:
        print("\nOptuna tuning did not return parameters. Using default parameters for the final model evaluation.")
        # Fallback to some sensible defaults if tuning fails or is skipped
        best_params_from_tuning = { 
            'n_estimators': 200, 'learning_rate': 0.05, 'num_leaves': 31, 
            'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.8,
            'colsample_bytree':0.8, 'reg_alpha':0.1, 'reg_lambda':0.1
            # Add other necessary LGBM parameters if not covered by the module's defaults
        } 
else:
    print("\nSkipping Hyperparameter Tuning due to missing X, y, groups_for_splitting, or df_sample.")
    # Fallback parameters if tuning is skipped
    best_params_from_tuning = { 
        'n_estimators': 200, 'learning_rate': 0.05, 'num_leaves': 31, 
        'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.8,
        'colsample_bytree':0.8, 'reg_alpha':0.1, 'reg_lambda':0.1
    }

# Reset warnings to default behavior if they were changed
warnings.resetwarnings()

[I 2025-05-14 15:11:15,776] A new study created in memory with name: lgbm_ranker_tuning



--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) using modular function ---
\n--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) ---


[I 2025-05-14 15:11:19,215] Trial 0 finished with value: 0.3483959691013827 and parameters: {'n_estimators': 350, 'learning_rate': 0.10148640916203272, 'num_leaves': 26, 'max_depth': 12, 'min_child_samples': 6, 'subsample': 0.5604481619303492, 'colsample_bytree': 0.7128952769140711, 'reg_alpha': 5.632050975812048, 'reg_lambda': 0.01393055140628837}. Best is trial 0 with value: 0.3483959691013827.
[I 2025-05-14 15:11:22,750] Trial 1 finished with value: 0.3487982556281389 and parameters: {'n_estimators': 700, 'learning_rate': 0.07738743799224009, 'num_leaves': 50, 'max_depth': 8, 'min_child_samples': 30, 'subsample': 0.6195409659098858, 'colsample_bytree': 0.9036249138323806, 'reg_alpha': 7.446041659683735, 'reg_lambda': 6.2495614061886355}. Best is trial 1 with value: 0.3487982556281389.
[I 2025-05-14 15:11:24,863] Trial 2 finished with value: 0.35174829820481407 and parameters: {'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samp

Optuna study finished. Best trial NDCG@5: 0.3517
Best parameters: {'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 58, 'subsample': 0.994009316145284, 'colsample_bytree': 0.7229225094690845, 'reg_alpha': 0.002544358343653164, 'reg_lambda': 0.1930789012781852}

Best parameters found by Optuna:
    n_estimators: 400
    learning_rate: 0.1734440198649952
    num_leaves: 36
    max_depth: 4
    min_child_samples: 58
    subsample: 0.994009316145284
    colsample_bytree: 0.7229225094690845
    reg_alpha: 0.002544358343653164
    reg_lambda: 0.1930789012781852


In [7]:
# --- 7. Train Final Model on Full Sampled Data (using modular function) ---
final_trained_model = None

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None and best_params_from_tuning:
    print("\\n--- Training Final Model with Best Tuned Parameters ---")
    
    # groups_train_full is needed for the lgbm_model.train_final_model function
    # It should represent the group sizes for the entire X, y that's being passed
    # This X is df_sample[feature_columns]
    groups_train_full = df_sample.groupby('srch_id').size().to_numpy()

    if len(groups_train_full) > 0:
        final_trained_model = lgbm_model.train_final_model(
            X_train_full=X,  # This is the full X from df_sample
            y_train_full=y,  # This is the full y from df_sample
            groups_train_full=groups_train_full,
            df_full_for_group_counts=df_sample, # df_sample contains 'srch_id' for early stopping split
            best_params=best_params_from_tuning
        )
        if final_trained_model:
            print("Final model successfully trained.")
        else:
            print("Final model training failed or returned None.")
    else:
        print("Cannot train final model: No groups found in the training data.")
else:
    print("\\nSkipping final model training due to missing data, groups, or best_params_from_tuning.")


\n--- Training Final Model with Best Tuned Parameters ---
\n--- Training Final Model ---
Final model parameters for training:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'verbosity': -1, 'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 58, 'subsample': 0.994009316145284, 'colsample_bytree': 0.7229225094690845, 'reg_alpha': 0.002544358343653164, 'reg_lambda': 0.1930789012781852}


  return bound(*args, **kwds)


Fitting final model with early stopping on 90/10 split of training data.
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[58]	valid_0's ndcg@5: 0.363617
Final model training completed.
Final model successfully trained.


In [9]:
# --- 8. Prepare Test Data and Generate Kaggle Submission (using modular function) ---

if final_trained_model is not None and X is not None and 'feature_columns' in locals() and feature_columns:
    print("\\n--- Preparing Test Data and Generating Kaggle Submission ---")

    # --- 8a. Load Test Data ---
    print(f"Loading test data from: {TEST_FILE}...")
    try:
        df_test_raw = pd.read_csv(TEST_FILE)
        print(f"Loaded test dataset with shape: {df_test_raw.shape}")
    except FileNotFoundError:
        print(f"ERROR: Test file not found at {TEST_FILE}")
        df_test_raw = None
    except Exception as e:
        print(f"Error loading test data: {e}")
        df_test_raw = None

    if df_test_raw is not None:
        # --- 8b. Preprocess Test Data ---
        print("\\nPreprocessing test data...")
        
        # Ensure all selected feature columns exist in df_test_raw
        # and handle any missing columns gracefully if necessary (e.g. by creating them with NaNs)
        X_test_list = []
        for col in feature_columns:
            if col not in df_test_raw.columns:
                print(f"Warning: Feature column '{col}' not found in test data. Creating it with NaNs.")
                df_test_raw[col] = np.nan 
        
        X_test = df_test_raw[feature_columns].copy()

        # Impute missing values in X_test using medians from the TRAINING sample (X)
        # X should be the dataframe of features used for training the final_trained_model
        print("Imputing missing values in test data using training set medians...")
        nan_counts_before_imputation = X_test.isnull().sum()

        for col in X_test.columns:
            if X_test[col].isnull().any():
                if pd.api.types.is_numeric_dtype(X_test[col]):
                    if col in X.columns: # Ensure the column exists in the training features X
                        train_median = X[col].median() # Calculate median from the TRAIN features (X)
                        X_test[col].fillna(train_median, inplace=True)
                        # print(f"Imputed NaNs in test column '{col}' with training median: {train_median}")
                    else:
                        print(f"Warning: Column '{col}' for median imputation not found in training X. Test NaNs may remain.")
                # else: # For categorical, use mode from training X
                    # if col in X.columns:
                    #     train_mode = X[col].mode()[0]
                    #     X_test[col].fillna(train_mode, inplace=True)
                    # else:
                    #     print(f"Warning: Column '{col}' for mode imputation not found in training X. Test NaNs may remain.")
        
        nan_counts_after_imputation = X_test.isnull().sum().sum()
        print(f"NaNs remaining in X_test after imputation: {nan_counts_after_imputation}")
        if nan_counts_after_imputation > 0:
            print("Warning: Some NaNs remain in test features after imputation. Review missing columns or imputation logic.")
            print(X_test.isnull().sum()[X_test.isnull().sum() > 0])


        # --- 8c. Generate Submission File ---
        # The df_test_raw contains 'srch_id' and 'prop_id' needed by the submission function
        lgbm_model.predict_and_format_submission(
            model=final_trained_model,
            X_test=X_test,
            df_test_original_ids=df_test_raw, # Pass the raw test df for srch_id and prop_id
            submission_filename=SUBMISSION_FILENAME
        )
    else:
        print("Skipping submission generation as test data could not be loaded.")
else:
    print("\\nSkipping Kaggle submission: final_trained_model, X, or feature_columns not available.")


\n--- Preparing Test Data and Generating Kaggle Submission ---
Loading test data from: ../data.nosync/test.csv...
Loaded test dataset with shape: (4959183, 50)
\nPreprocessing test data...
Imputing missing values in test data using training set medians...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(train_median, inplace=True)


NaNs remaining in X_test after imputation: 0
\n--- Predicting on Test Data and Formatting Submission ---
Submission file 'submission_modular.csv' created. Top 5 rows:
   SearchId  PropertyId
0         1       54937
1         1       99484
2         1       61934
3         1       24194
4         1       28181
