In [5]:
"""
LambdaMART Model for Expedia Hotel Booking Prediction

Assignment 2: Data Mining Techniques, Vrije Universiteit Amsterdam
"""

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import os

# --- Configuration ---
TRAIN_FILE = '../data/training_set_VU_DM_imputed.csv'

N_FOLDS = 5 # For GroupKFold cross-validation
RANDOM_STATE = 42

# --- 1. Load Data ---
print("Loading training data...")

df_train_full = pd.read_csv(TRAIN_FILE)

# --- 2. Create Relevance Score ---
if df_train_full is not None:
    print("\nCreating relevance score...")
    # 5 for booking, 1 for click (and not booked), 0 otherwise
    df_train_full['relevance'] = 0
    df_train_full.loc[df_train_full['click_bool'] == 1, 'relevance'] = 1
    df_train_full.loc[df_train_full['booking_bool'] == 1, 'relevance'] = 2
    print("Relevance score distribution:")
    print(df_train_full['relevance'].value_counts())
else:
    print("Skipping relevance score creation due to data loading issues.")

# --- 3. Data Sampling (Group-aware) ---
if df_train_full is not None:
    print(f"\nSampling {SAMPLE_FRACTION*100}% of the data based on srch_id...")
    unique_srch_ids = df_train_full['srch_id'].unique()
    sampled_srch_ids = np.random.choice(unique_srch_ids, size=int(len(unique_srch_ids) * SAMPLE_FRACTION), replace=False)

    df_sample = df_train_full[df_train_full['srch_id'].isin(sampled_srch_ids)].copy()
    print(f"Sampled data shape: {df_sample.shape}")
    # Free up memory from the full dataframe if no longer needed for this notebook scope
    # del df_train_full 
else:
    print("Skipping sampling due to data loading issues.")

Loading training data...

Creating relevance score...
Relevance score distribution:
relevance
0    4736468
2     138390
1      83489
Name: count, dtype: int64

Sampling 4.0% of the data based on srch_id...
Sampled data shape: (198439, 55)


In [7]:
df_train_full

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance
0,1,2013-04-04 08:32:15,12,187,,,219,893,3,3.5,...,,,,0.0,0.0,,0,,0,0
1,1,2013-04-04 08:32:15,12,187,,,219,10404,4,4.0,...,,,,0.0,0.0,,0,,0,0
2,1,2013-04-04 08:32:15,12,187,,,219,21315,3,4.5,...,,,,0.0,0.0,,0,,0,0
3,1,2013-04-04 08:32:15,12,187,,,219,27348,2,4.0,...,,,,-1.0,0.0,5.0,0,,0,0
4,1,2013-04-04 08:32:15,12,187,,,219,29604,4,3.5,...,,,,0.0,0.0,,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,2013-06-30 19:55:18,5,219,,,219,77700,3,4.0,...,,,,,,,0,,0,0
4958343,332785,2013-06-30 19:55:18,5,219,,,219,88083,3,4.0,...,,,,,,,0,,0,0
4958344,332785,2013-06-30 19:55:18,5,219,,,219,94508,3,3.5,...,,,,,,,0,,0,0
4958345,332785,2013-06-30 19:55:18,5,219,,,219,128360,3,5.0,...,,,,,,,1,157.84,1,2


In [8]:
df_sample

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance
614,52,2013-06-09 09:22:16,16,31,,,215,462,4,4.0,...,0.0,0.0,,,,,0,,0,0
615,52,2013-06-09 09:22:16,16,31,,,215,9193,3,4.0,...,0.0,0.0,,,,,0,,0,0
616,52,2013-06-09 09:22:16,16,31,,,215,15671,4,4.5,...,0.0,0.0,,,,,0,,0,0
617,52,2013-06-09 09:22:16,16,31,,,215,23365,3,3.5,...,,0.0,,,,,0,,0,0
618,52,2013-06-09 09:22:16,16,31,,,215,32237,4,4.5,...,0.0,0.0,,,,,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4957901,332747,2013-03-03 13:03:51,5,219,,,219,124240,4,4.0,...,,,,,,,0,,0,0
4957902,332747,2013-03-03 13:03:51,5,219,,,219,124643,0,2.5,...,,,,,,,0,,0,0
4957903,332747,2013-03-03 13:03:51,5,219,,,219,126405,3,3.5,...,,,,,,,0,,0,0
4957904,332747,2013-03-03 13:03:51,5,219,,,219,128982,3,4.0,...,,,,,,,0,,0,0


In [9]:


# --- 4. Initial Feature Selection (Placeholder) ---
# This will be refined by your friend. For now, using a subset of potentially useful features.
if 'df_sample' in locals() and df_sample is not None:
    print("\nDefining initial feature set...")
    # Features identified as potentially important or commonly used, excluding IDs and target-leaking columns
    # Also excluding competitor columns with high missing rates for now, and user history due to high missingness
    feature_columns = [
        'site_id', 'visitor_location_country_id', 'prop_country_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
        'price_usd', 'promotion_flag', 'srch_destination_id',
        'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
        'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
        'orig_destination_distance'
        # 'srch_query_affinity_score' # high missingness
        # Add more features here as EDA suggests and after handling missing values
    ]

    # For ranking, we need features (X), relevance (y), and group/query_id
    X = df_sample[feature_columns]
    y = df_sample['relevance']
    groups = df_sample.groupby('srch_id').size().to_numpy() # Size of each group

    print(f"Selected {len(feature_columns)} features.")
    print("Feature columns:", feature_columns)
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    print(f"Number of groups: {len(groups)}, Min group size: {groups.min()}, Max group size: {groups.max()}")

    # Handle Missing Values (Simple Imputation for now)
    # For a proper model, more sophisticated imputation or feature engineering for missingness is needed.
    print("\nHandling missing values (simple median imputation for numerical)...")
    for col in X.columns:
        if X[col].isnull().any():
            if pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].fillna(X[col].median())
                print(f"Imputed missing values in {col} with median.")
            # else: # For categorical, fill with mode or a specific placeholder
            #     X[col] = X[col].fillna(X[col].mode()[0])

    # Check if any NaNs remain (should ideally be none for numeric after this)
    print("NaNs remaining in X after imputation:", X.isnull().sum().sum())

else:
    print("Skipping feature selection due to data sampling issues.")


Defining initial feature set...
Selected 19 features.
Feature columns: ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'orig_destination_distance']
Shape of X: (198439, 19)
Shape of y: (198439,)
Number of groups: 7991, Min group size: 5, Max group size: 36

Handling missing values (simple median imputation for numerical)...
NaNs remaining in X after imputation: 0


In [10]:
# --- 5. Model Training (LGBMRanker) ---
# This section will contain the training loop using GroupKFold

if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None: # Use df_sample for srch_id
    print("\n--- 5. Cross-Validation with GroupKFold ---")

    gkf = GroupKFold(n_splits=N_FOLDS)
    
    fold_ndcg_scores = []
    all_feature_importances = pd.DataFrame()

    # The groups parameter for gkf.split should be the srch_id for each row in X
    # It ensures that rows with the same srch_id are not split across train/test in a fold.
    unique_group_ids_for_splitting = df_sample['srch_id']

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=unique_group_ids_for_splitting)):
        print(f"\n--- Fold {fold+1}/{N_FOLDS} ---")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Group counts for the current training and validation fold
        # Need to use the original df_sample with srch_id to correctly form groups for the subsets
        train_groups = df_sample.iloc[train_idx].groupby('srch_id').size().to_numpy()
        val_groups = df_sample.iloc[val_idx].groupby('srch_id').size().to_numpy()
        
        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, Num train groups: {len(train_groups)}")
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}, Num val groups: {len(val_groups)}")
        
        if len(train_groups) == 0 or len(val_groups) == 0 or X_train.empty or X_val.empty:
            print("Skipping fold due to empty train or validation groups/data.")
            continue

        ranker_cv = lgb.LGBMRanker(
            objective='lambdarank',
            metric='ndcg',
            label_gain=[0, 1, 5], # Corresponds to relevance 0, 1, 5
            eval_at=[5], # For NDCG@5
            n_estimators=100, 
            learning_rate=0.1,
            importance_type='gain',
            random_state=RANDOM_STATE + fold, 
            n_jobs=-1,
        )

        print(f"Training LGBMRanker for fold {fold+1}...")
        ranker_cv.fit(
            X_train,
            y_train,
            group=train_groups,
            eval_set=[(X_val, y_val)],
            eval_group=[val_groups],
            eval_metric='ndcg', 
            callbacks=[lgb.early_stopping(10, verbose=1)]
        )
        
        if ranker_cv.evals_result_ and 'valid_0' in ranker_cv.evals_result_ and 'ndcg@5' in ranker_cv.evals_result_['valid_0']:
            ndcg_at_5 = ranker_cv.evals_result_['valid_0']['ndcg@5'][-1] 
            fold_ndcg_scores.append(ndcg_at_5)
            print(f"Fold {fold+1} NDCG@5: {ndcg_at_5:.4f}")

            fold_importances = pd.DataFrame({
                'feature': X_train.columns,
                'importance': ranker_cv.feature_importances_,
                'fold': fold + 1
            })
            all_feature_importances = pd.concat([all_feature_importances, fold_importances], ignore_index=True)
        else:
            print(f"Could not retrieve NDCG@5 for fold {fold+1}. Skipping score for this fold.")


    if fold_ndcg_scores:
        print(f"\nMean NDCG@5 across {len(fold_ndcg_scores)} successfully evaluated folds: {np.mean(fold_ndcg_scores):.4f} +/- {np.std(fold_ndcg_scores):.4f}")
        
        if not all_feature_importances.empty:
            mean_feature_importances = all_feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
            print("\nAverage Feature Importances (Cross-Validation):")
            with pd.option_context('display.max_rows', 30):
                display(mean_feature_importances.head(20))
    else:
        print("No folds were successfully processed with NDCG scores.")

else:
    print("\nSkipping model training and cross-validation due to earlier data processing issues.")


# --- 6. Hyperparameter Tuning with RandomizedSearchCV ---
if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None:
    print("\n--- 6. Hyperparameter Tuning with RandomizedSearchCV ---")
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform

    param_dist = {
        'n_estimators': sp_randint(100, 500),
        'learning_rate': sp_uniform(0.01, 0.19), # Upper bound <0.2 for uniform
        'num_leaves': sp_randint(20, 100),      
        'max_depth': sp_randint(3, 12),        
        'min_child_samples': sp_randint(5, 50), 
        'subsample': sp_uniform(0.6, 0.4),      # Sum of loc + scale should be <= 1.0. Here, 0.6 + 0.4 = 1.0
        'colsample_bytree': sp_uniform(0.6, 0.4), 
        'reg_alpha': sp_uniform(0, 1),          
        'reg_lambda': sp_uniform(0, 1),         
    }

    base_ranker = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg', # LightGBM will use this for its internal evaluation
        label_gain=[0, 1, 5],
        eval_at=[5],
        importance_type='gain',
        random_state=RANDOM_STATE,
        n_jobs=-1 # Be cautious with -1 for n_jobs in RandomizedSearchCV if memory is an issue
    )

    gkf_for_tuning = GroupKFold(n_splits=3) # Using 3 splits for tuning to speed it up

    # RandomizedSearchCV setup
    random_search = RandomizedSearchCV(
        estimator=base_ranker,
        param_distributions=param_dist,
        n_iter=10,  # Number of parameter settings sampled. Increase for more thorough search.
                    # Set to a small number like 5-10 for quick test, 25-50 for better search.
        cv=list(gkf_for_tuning.split(X, y, groups=df_sample['srch_id'])), # Pass the list of splits
        random_state=RANDOM_STATE,
        n_jobs=1, # Start with 1 to avoid potential memory issues, then try increasing.
        verbose=2,
        # scoring: If None, estimator's score method is used. LGBMRanker's score method should work.
        # It calculates NDCG@eval_at based on its parameters.
        refit=True # Refits the best estimator on the whole dataset (X,y) passed to fit.
                   # For ranking, this full dataset refit will also need group info.
    )

    print("Starting RandomizedSearchCV for hyperparameter tuning...")
    # Pass `groups` to `fit`. This will be used by GroupKFold inside RandomizedSearchCV.
    # And `LGBMRanker.fit` will also receive this `groups` argument for each fold.
    
    # For early stopping inside RandomizedSearchCV, you'd typically pass fit_params.
    # This is more complex because eval_set/eval_group change per fold.
    # For now, n_estimators is part of the search space.
    
    best_params_from_tuning = {}
    try:
        # RandomizedSearchCV will use the `groups` for splitting via the `cv` object
        # And `LGBMRanker.fit` will receive the `group` parameter for each fold.
        random_search.fit(X, y, groups=df_sample['srch_id']) 
        
        print("\nBest parameters found by RandomizedSearchCV:")
        print(random_search.best_params_)
        # The best_score_ will be based on the internal scoring of LGBMRanker (NDCG@5 here)
        print(f"Best score from RandomizedSearchCV (NDCG@5): {random_search.best_score_:.4f}")
        best_params_from_tuning = random_search.best_params_

    except Exception as e:
        print(f"Error during RandomizedSearchCV: {e}")
        print("Falling back to default parameters for the final model evaluation.")
        # Re-initialize with default in case of error
        best_params_from_tuning = { 
            'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31, 
            'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
            'colsample_bytree':1.0, 'reg_alpha':0.0, 'reg_lambda':0.0
        }


else:
    print("\nSkipping hyperparameter tuning due to earlier data processing issues.")
    best_params_from_tuning = {
        'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31, 
        'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
        'colsample_bytree':1.0, 'reg_alpha':0.0, 'reg_lambda':0.0
    }


# --- 7. Detailed Evaluation of Best Tuned Model ---
if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None and best_params_from_tuning:
    print("\n--- 7. Detailed Evaluation of Best Tuned Model (using GroupKFold) ---")

    final_gkf = GroupKFold(n_splits=N_FOLDS)
    final_fold_ndcg_scores = []
    final_all_feature_importances = pd.DataFrame()
    
    # Ensure unique_group_ids_for_splitting is available
    if 'unique_group_ids_for_splitting' not in locals():
        unique_group_ids_for_splitting = df_sample['srch_id']


    final_ranker_params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'label_gain': [0, 1, 5],
        'eval_at': [5],
        'importance_type': 'gain',
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
    }
    # Update with tuned parameters, ensuring they are valid
    for key, value in best_params_from_tuning.items():
        final_ranker_params[key] = value
    
    # Ensure n_estimators is present if not tuned or set to a low value by tuning
    if 'n_estimators' not in final_ranker_params or final_ranker_params['n_estimators'] < 50:
         final_ranker_params['n_estimators'] = 300 # Default if not well-tuned by a short search

    print("\nFinal model parameters for evaluation:")
    print(final_ranker_params)

    for fold, (train_idx, val_idx) in enumerate(final_gkf.split(X, y, groups=unique_group_ids_for_splitting)):
        print(f"\n--- Final Model Evaluation: Fold {fold+1}/{N_FOLDS} ---")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_groups = df_sample.iloc[train_idx].groupby('srch_id').size().to_numpy()
        val_groups = df_sample.iloc[val_idx].groupby('srch_id').size().to_numpy()

        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, Num train groups: {len(train_groups)}")
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}, Num val groups: {len(val_groups)}")

        if len(train_groups) == 0 or len(val_groups) == 0 or X_train.empty or X_val.empty:
            print("Skipping fold due to empty train or validation groups/data.")
            continue
            
        final_ranker = lgb.LGBMRanker(**final_ranker_params)

        print(f"Training final tuned LGBMRanker for fold {fold+1}...")
        final_ranker.fit(
            X_train,
            y_train,
            group=train_groups,
            eval_set=[(X_val, y_val)],
            eval_group=[val_groups],
            eval_metric='ndcg',
            callbacks=[lgb.early_stopping(10, verbose=1)]
        )
        
        if final_ranker.evals_result_ and 'valid_0' in final_ranker.evals_result_ and 'ndcg@5' in final_ranker.evals_result_['valid_0']:
            final_ndcg_at_5 = final_ranker.evals_result_['valid_0']['ndcg@5'][-1]
            final_fold_ndcg_scores.append(final_ndcg_at_5)
            print(f"Fold {fold+1} (Tuned Model) NDCG@5: {final_ndcg_at_5:.4f}")

            fold_importances = pd.DataFrame({
                'feature': X_train.columns,
                'importance': final_ranker.feature_importances_,
                'fold': fold + 1
            })
            final_all_feature_importances = pd.concat([final_all_feature_importances, fold_importances], ignore_index=True)
        else:
            print(f"Could not retrieve NDCG@5 for fold {fold+1} of the tuned model.")


    if final_fold_ndcg_scores:
        print(f"\nMean NDCG@5 for Tuned Model across {len(final_fold_ndcg_scores)} successfully evaluated folds: {np.mean(final_fold_ndcg_scores):.4f} +/- {np.std(final_fold_ndcg_scores):.4f}")
        
        if not final_all_feature_importances.empty:
            final_mean_feature_importances = final_all_feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
            print("\nAverage Feature Importances (Tuned Model):")
            with pd.option_context('display.max_rows', 30):
                display(final_mean_feature_importances.head(20))
    else:
        print("No folds were successfully processed for the final tuned model evaluation.")
        
else:
    print("\nSkipping final model evaluation due to earlier issues or no tuned parameters found.")

# --- End of Notebook ---
# Next steps would involve:
# 1. More sophisticated feature engineering and selection.
# 2. Training the best model on the full (sampled) data or even a larger fraction.
# 3. Preparing the test data similarly.
# 4. Generating predictions for the test set.
# 5. Creating the Kaggle submission file.


--- 5. Cross-Validation with GroupKFold ---

--- Fold 1/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training LGBMRanker for fold 1...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2104
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	valid_0's ndcg@5: 0.346871
Fold 1 NDCG@5: 0.3379

--- Fold 2/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training LGBMRanker for fold 2...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2101
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[16]	valid_0's ndcg@5: 0.335688
Fold 2 NDCG@5: 0.3328

--- Fold 3/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training LGBMRanker for fold 3...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2107
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	valid_0's ndcg@5: 0.346766
Fold 3 NDCG@5: 0.3407

--- Fold 4/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training LGBMRanker for fold 4...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006223 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2112
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[16]	valid_0's ndcg@5: 0.339617
Fold 4 NDCG@5: 0.3379

--- Fold 5/5 ---
X_train shape: (158748, 19), y_train shape: (158748,), Num train groups: 6392
X_val shape: (39691, 19), y_val shape: (39691,), Num val groups: 1599
Training LGBMRanker for fold 5...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2110
[LightGBM] [Info] Number of data points in the train set: 158748, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[32]	valid_0's ndcg@5: 0.338168
Fold 5 NDCG@5: 0.3339

Mean NDCG@5 across 5 successfully evaluated folds: 0.3366 +/- 0.0029

Average Feature Importances (Cross-Validation):


feature
prop_location_score2           6425.485961
price_usd                      4805.286522
prop_location_score1           2768.727621
prop_log_historical_price      2192.717624
prop_starrating                1917.017336
prop_review_score              1067.955743
orig_destination_distance       948.177619
promotion_flag                  916.317781
srch_destination_id             592.740678
srch_booking_window             541.364219
prop_country_id                 491.710098
srch_length_of_stay             376.001439
visitor_location_country_id     237.730740
site_id                         220.742760
prop_brand_bool                 131.627481
srch_adults_count                73.448220
srch_children_count              56.576361
srch_room_count                  40.845460
srch_saturday_night_bool         34.889260
Name: importance, dtype: float64


--- 6. Hyperparameter Tuning with RandomizedSearchCV ---
Starting RandomizedSearchCV for hyperparameter tuning...
Error during RandomizedSearchCV: If no scoring is specified, the estimator passed should have a 'score' method. The estimator LGBMRanker(eval_at=[5], importance_type='gain', label_gain=[0, 1, 5],
           metric='ndcg', n_jobs=-1, objective='lambdarank', random_state=42) does not.
Falling back to default parameters for the final model evaluation.

--- 7. Detailed Evaluation of Best Tuned Model (using GroupKFold) ---

Final model parameters for evaluation:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31, 'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0, 'colsample_bytree': 1.0, 'reg_alpha': 0.0, 'reg_lambda': 0.0}

--- Final Model Evaluation: Fold 1/5 ---
X_train shape: (158752, 19), y_train shape: (15875



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2104
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	valid_0's ndcg@5: 0.346871
Fold 1 (Tuned Model) NDCG@5: 0.3379

--- Final Model Evaluation: Fold 2/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training final tuned LGBMRanker for fold 2...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2101
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[16]	valid_0's ndcg@5: 0.335688
Fold 2 (Tuned Model) NDCG@5: 0.3328

--- Final Model Evaluation: Fold 3/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training final tuned LGBMRanker for fold 3...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012645 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2107
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	valid_0's ndcg@5: 0.346766
Fold 3 (Tuned Model) NDCG@5: 0.3407

--- Final Model Evaluation: Fold 4/5 ---
X_train shape: (158752, 19), y_train shape: (158752,), Num train groups: 6393
X_val shape: (39687, 19), y_val shape: (39687,), Num val groups: 1598
Training final tuned LGBMRanker for fold 4...




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2112
[LightGBM] [Info] Number of data points in the train set: 158752, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[16]	valid_0's ndcg@5: 0.339617
Fold 4 (Tuned Model) NDCG@5: 0.3379

--- Final Model Evaluation: Fold 5/5 ---
X_train shape: (158748, 19), y_train shape: (158748,), Num train groups: 6392
X_val shape: (39691, 19), y_val shape: (39691,), Num val groups: 1599
Training final tuned LGBMRanker for fold 5...




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2110
[LightGBM] [Info] Number of data points in the train set: 158748, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[32]	valid_0's ndcg@5: 0.338168
Fold 5 (Tuned Model) NDCG@5: 0.3339

Mean NDCG@5 for Tuned Model across 5 successfully evaluated folds: 0.3366 +/- 0.0029

Average Feature Importances (Tuned Model):


feature
prop_location_score2           6425.485961
price_usd                      4805.286522
prop_location_score1           2768.727621
prop_log_historical_price      2192.717624
prop_starrating                1917.017336
prop_review_score              1067.955743
orig_destination_distance       948.177619
promotion_flag                  916.317781
srch_destination_id             592.740678
srch_booking_window             541.364219
prop_country_id                 491.710098
srch_length_of_stay             376.001439
visitor_location_country_id     237.730740
site_id                         220.742760
prop_brand_bool                 131.627481
srch_adults_count                73.448220
srch_children_count              56.576361
srch_room_count                  40.845460
srch_saturday_night_bool         34.889260
Name: importance, dtype: float64

In [11]:
# --- 8. Prepare Test Data and Generate Kaggle Submission ---

if 'df_sample' in locals() and df_sample is not None and \
   'X' in locals() and X is not None and \
   'best_params_from_tuning' in locals() and best_params_from_tuning:

    print("\n--- 8. Test Data Preparation and Kaggle Submission ---")

    # --- 8a. Load Test Data ---
    TEST_FILE = '../data/test_set_VU_DM.csv'
    print("Loading test data...")
    try:

        df_test = pd.read_csv(TEST_FILE)
        print(f"Loaded test dataset with shape: {df_test.shape}")
    except FileNotFoundError:
        print(f"Error: Test file not found at {TEST_FILE}")
        df_test = None
    except Exception as e:
        print(f"An error occurred during test data loading: {e}")
        df_test = None

    if df_test is not None:
        # --- 8b. Preprocess Test Data ---
        print("\nPreprocessing test data...")
        # Use the same feature_columns as defined for training
        if 'feature_columns' not in locals() or not feature_columns:
            print("Error: feature_columns not defined. Cannot preprocess test data.")
            df_test_processed = None
        else:
            print(f"Using feature columns: {feature_columns}")
            X_test = df_test[feature_columns].copy() # Ensure it's a copy

            # Impute missing values using medians from the TRAINING sample (X)
            # This is crucial to prevent data leakage.
            print("Imputing missing values in test data using training set medians...")
            for col in X_test.columns:
                if X_test[col].isnull().any():
                    if pd.api.types.is_numeric_dtype(X_test[col]):
                        # Get median from the original X (training sample before it was split into folds)
                        train_median = X[col].median() # X should be the full sample used for training/tuning
                        X_test[col] = X_test[col].fillna(train_median)
                        # print(f"Imputed missing values in test column {col} with training median: {train_median}")
                    # else: # For categorical, use mode from training set
                    #     train_mode = X[col].mode()[0]
                    #     X_test[col] = X_test[col].fillna(train_mode)
            
            print("NaNs remaining in X_test after imputation:", X_test.isnull().sum().sum())
            df_test_processed = True


        if df_test_processed:
            # --- 8c. Train Final Model on Full Sampled Data (df_sample) ---
            print("\nTraining final model on the full sampled training data (df_sample)...")
            
            # Parameters for the final model
            final_model_params = {
                'objective': 'lambdarank',
                'metric': 'ndcg',
                'label_gain': [0, 1, 5], # If using remapped (0,1,2) relevance, gain still [0,1,5]
                'eval_at': [5],
                'importance_type': 'gain',
                'random_state': RANDOM_STATE,
                'n_jobs': -1,
            }
            final_model_params.update(best_params_from_tuning)
            if 'n_estimators' not in final_model_params or final_model_params['n_estimators'] < 50:
                final_model_params['n_estimators'] = 300 # A reasonable default for early stopping
            
            print("Final model parameters for prediction model:")
            print(final_model_params)

            # Data for final model training
            X_full_sample = X # This is df_sample[feature_columns] with imputations
            y_full_sample = y # This is df_sample['relevance'] (remapped to 0,1,2 if you implemented that)
            groups_full_sample = df_sample.groupby('srch_id').size().to_numpy()

            final_model = lgb.LGBMRanker(**final_model_params)
            
            # For the final model, we can use a small portion of df_sample as an eval set for early stopping
            # This is better than no early stopping.
            temp_df_for_final_split = pd.DataFrame({
                'srch_id': df_sample['srch_id'],
                'index_orig': df_sample.index
            }).drop_duplicates(subset=['srch_id'])

            final_train_srch_ids, final_val_srch_ids = np.split(
                temp_df_for_final_split['srch_id'].sample(frac=1, random_state=RANDOM_STATE),
                [int(0.9 * len(temp_df_for_final_split))] # 90/10 split for final model's early stopping
            )
            
            final_train_indices = df_sample[df_sample['srch_id'].isin(final_train_srch_ids)].index
            final_val_indices = df_sample[df_sample['srch_id'].isin(final_val_srch_ids)].index

            X_final_train, X_final_val = X_full_sample.loc[final_train_indices], X_full_sample.loc[final_val_indices]
            y_final_train, y_final_val = y_full_sample.loc[final_train_indices], y_full_sample.loc[final_val_indices]
            
            groups_final_train = df_sample.loc[final_train_indices].groupby('srch_id').size().to_numpy()
            groups_final_val = df_sample.loc[final_val_indices].groupby('srch_id').size().to_numpy()

            if not X_final_val.empty and len(groups_final_val) > 0:
                 print(f"Fitting final model on {len(X_final_train)} samples, validating on {len(X_final_val)} samples.")
                 final_model.fit(
                    X_final_train, y_final_train, group=groups_final_train,
                    eval_set=[(X_final_val, y_final_val)],
                    eval_group=[groups_final_val],
                    eval_metric='ndcg',
                    callbacks=[lgb.early_stopping(10, verbose=1)]
                )
            else: # Fallback if validation set is too small or problematic
                print("Validation set for final model is empty/problematic, fitting on all sampled data without early stopping.")
                final_model_params.pop('eval_set', None) # Remove eval params if not using
                final_model_params.pop('eval_group', None)
                final_model_params.pop('eval_metric', None)
                final_model_params.pop('callbacks', None) # No early stopping
                # Ensure n_estimators is set to a fixed number if no early stopping
                final_model_params['n_estimators'] = best_params_from_tuning.get('n_estimators', 300) # Use tuned or default
                final_model = lgb.LGBMRanker(**final_model_params)
                final_model.fit(X_full_sample, y_full_sample, group=groups_full_sample)

            print("Final model training completed.")

            # --- 8d. Make Predictions on Test Data ---
            print("\nMaking predictions on the test set...")
            test_predictions = final_model.predict(X_test)
            df_test['predicted_score'] = test_predictions

            # --- 8e. Format Predictions for Submission ---
            print("\nFormatting predictions for submission...")
            submission_list = []
            # Group by srch_id and sort by predicted_score
            for srch_id, group_df in df_test.groupby('srch_id'):
                # Sort properties within each search by the predicted score in descending order
                ranked_properties = group_df.sort_values('predicted_score', ascending=False)
                for _, row in ranked_properties.iterrows():
                    submission_list.append({'srch_id': int(row['srch_id']), 'prop_id': int(row['prop_id'])})
            
            df_submission = pd.DataFrame(submission_list)

            # --- 8f. Create Submission File ---
            SUBMISSION_FILE = '../data/submission.csv'
            df_submission.to_csv(SUBMISSION_FILE, index=False)
            print(f"\nSubmission file '{SUBMISSION_FILE}' created successfully.")
            print(df_submission.head())

        else:
            print("Skipping prediction and submission due to test data processing issues.")
    else:
        print("Skipping submission generation due to test data loading issues.")
else:
    print("\nSkipping Kaggle submission part: Prerequisite data (df_sample, X, best_params_from_tuning) not available.")



--- 8. Test Data Preparation and Kaggle Submission ---
Loading test data...
Loaded test dataset with shape: (4959183, 50)

Preprocessing test data...
Using feature columns: ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'orig_destination_distance']
Imputing missing values in test data using training set medians...
NaNs remaining in X_test after imputation: 0

Training final model on the full sampled training data (df_sample)...
Final model parameters for prediction model:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'n_estimators': 100, 'learning_r

  return bound(*args, **kwds)


Fitting final model on 178864 samples, validating on 19575 samples.




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2117
[LightGBM] [Info] Number of data points in the train set: 178864, number of used features: 19
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[5]	valid_0's ndcg@5: 0.349422
Final model training completed.

Making predictions on the test set...





Formatting predictions for submission...

Submission file 'submission.csv' created successfully.
   srch_id  prop_id
0        1    54937
1        1    61934
2        1    99484
3        1   123675
4        1    63894
