In [1]:
"""
Main Experiment Notebook for Expedia Hotel Booking Prediction

Assignment 2: Data Mining Techniques, Vrije Universiteit Amsterdam
"""

import pandas as pd
import numpy as np
import lightgbm as lgb # Keep for type hints if needed, but direct use will be less
from sklearn.model_selection import GroupKFold
import os

# Import the modularized model functions
import lightgbm_ranker_model as lgbm_model
import warnings # For managing warnings from Optuna/LightGBM if needed

# --- Configuration ---

DATA_DIR = '../data.nosync'
TRAIN_FILE = os.path.join(DATA_DIR, 'train_imputed.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test.csv') # Defined TEST_FILE path
SUBMISSION_FILENAME = 'submission_modular.csv' # Defined submission filename

SAMPLE_FRACTION = 0.1 # Use 10% of the data for faster runs during development
N_FOLDS_CV = 5         # Number of folds for general cross-validation
N_FOLDS_TUNING = 3     # Number of folds for Optuna trials (can be smaller for speed)
N_OPTUNA_TRIALS = 20   # Number of Optuna trials
RANDOM_STATE = 42

# --- 1. Load Data ---
print("Loading training data...")
try:
    df_train_full = pd.read_csv(TRAIN_FILE)
except FileNotFoundError:
    print(f"ERROR: Training file not found at {TRAIN_FILE}")
    df_train_full = None
except Exception as e:
    print(f"Error loading training data: {e}")
    df_train_full = None

# --- 2. Create Relevance Score ---
if df_train_full is not None:
    print("\nCreating relevance score...")
    df_train_full['relevance'] = 0
    df_train_full.loc[df_train_full['click_bool'] == 1, 'relevance'] = 1
    df_train_full.loc[df_train_full['booking_bool'] == 1, 'relevance'] = 2 # Map to 0, 1, 2 for label_gain [0,1,5]
    print("Relevance score distribution:")
    print(df_train_full['relevance'].value_counts())
else:
    print("Skipping relevance score creation as df_train_full is None.")

# --- 3. Data Sampling (Group-aware) ---
df_sample = None
if df_train_full is not None:
    print(f"\nSampling {SAMPLE_FRACTION*100}% of the data based on srch_id...")
    unique_srch_ids = df_train_full['srch_id'].unique()
    if len(unique_srch_ids) > 0:
        sampled_srch_ids_count = int(len(unique_srch_ids) * SAMPLE_FRACTION)
        if sampled_srch_ids_count > 0:
            sampled_srch_ids = np.random.choice(unique_srch_ids, size=sampled_srch_ids_count, replace=False)
            df_sample = df_train_full[df_train_full['srch_id'].isin(sampled_srch_ids)].copy()
            print(f"Sampled data shape: {df_sample.shape}")
        else:
            print("Sample fraction resulted in zero search IDs. Check SAMPLE_FRACTION or dataset size.")
            df_sample = df_train_full.copy() # Fallback to full if sample is too small
            print(f"Using full dataset instead. Shape: {df_sample.shape}")
    else:
        print("No unique search IDs found in the training data.")
    del df_train_full # Optional: free up memory
else:
    print("Skipping sampling as df_train_full is None.")

  from .autonotebook import tqdm as notebook_tqdm


Loading training data...

Creating relevance score...
Relevance score distribution:
relevance
0    4736468
2     138390
1      83489
Name: count, dtype: int64

Sampling 10.0% of the data based on srch_id...
Sampled data shape: (496228, 55)


In [9]:
"""
Main Experiment Notebook for Expedia Hotel Booking Prediction

Assignment 2: Data Mining Techniques, Vrije Universiteit Amsterdam

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import os

# --- Configuration ---

DATA_DIR = '../data.nosync'
TRAIN_FILE = os.path.join(DATA_DIR, 'train_imputed.csv')

SAMPLE_FRACTION = 0.1 # Use 4% of the data
N_FOLDS = 5 # For GroupKFold cross-validation
RANDOM_STATE = 42

# --- 1. Load Data ---
print("Loading training data...")

df_train_full = pd.read_csv(TRAIN_FILE)

# --- 2. Create Relevance Score ---
if df_train_full is not None:
    print("\nCreating relevance score...")
    # 5 for booking, 1 for click (and not booked), 0 otherwise
    df_train_full['relevance'] = 0
    df_train_full.loc[df_train_full['click_bool'] == 1, 'relevance'] = 1
    df_train_full.loc[df_train_full['booking_bool'] == 1, 'relevance'] = 2
    print("Relevance score distribution:")
    print(df_train_full['relevance'].value_counts())
else:
    print("Skipping relevance score creation due to data loading issues.")


# --- 3. Data Sampling (Group-aware) ---
if df_train_full is not None:
    print(f"\nSampling {SAMPLE_FRACTION*100}% of the data based on srch_id...")
    unique_srch_ids = df_train_full['srch_id'].unique()
    sampled_srch_ids = np.random.choice(unique_srch_ids, size=int(len(unique_srch_ids) * SAMPLE_FRACTION), replace=False)

    df_sample = df_train_full[df_train_full['srch_id'].isin(sampled_srch_ids)].copy()
    print(f"Sampled data shape: {df_sample.shape}")
    # Free up memory from the full dataframe if no longer needed for this notebook scope
    # del df_train_full 
else:
    print("Skipping sampling due to data loading issues.")

"""



Loading training data...

Creating relevance score...
Relevance score distribution:
relevance
0    4736468
2     138390
1      83489
Name: count, dtype: int64

Sampling 10.0% of the data based on srch_id...
Sampled data shape: (496212, 55)


In [2]:
df_sample

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,click_bool,gross_bookings_usd,booking_bool,relevance
119,12,2013-03-15 14:55:16,5,219,,,158,7814,3,3.5,...,,,,,,,0,,0,0
120,12,2013-03-15 14:55:16,5,219,,,158,10881,3,3.0,...,,,,,,,0,,0,0
121,12,2013-03-15 14:55:16,5,219,,,158,12510,4,0.0,...,,,,,,,0,,0,0
122,12,2013-03-15 14:55:16,5,219,,,158,17122,4,3.5,...,,,,,,,0,,0,0
123,12,2013-03-15 14:55:16,5,219,,,158,18012,5,4.5,...,,,,,,,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958140,332763,2012-12-04 18:55:04,32,220,,,81,30933,3,2.5,...,,,,,,,0,,0,0
4958141,332763,2012-12-04 18:55:04,32,220,,,81,46986,3,3.0,...,,,,,,,1,156.75,1,2
4958142,332763,2012-12-04 18:55:04,32,220,,,81,62314,4,3.0,...,,,,,,,0,,0,0
4958143,332763,2012-12-04 18:55:04,32,220,,,81,91660,3,0.0,...,,,,,,,0,,0,0


In [3]:
# --- 4. Initial Feature Selection & Preparation ---
X = None
y = None
groups_for_splitting = None # This will be df_sample['srch_id'] for GroupKFold.split
feature_columns = [] # Initialize

if df_sample is not None:
    print("\nDefining initial feature set and preparing X, y, groups...")
    feature_columns = [
        'visitor_location_country_id', 'prop_country_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
        'price_usd', 'promotion_flag', 'orig_destination_distance'
        # Add more features after EDA and proper missing value handling from EDA notebook
    ]

    # Ensure all selected feature columns exist in df_sample
    existing_feature_columns = [col for col in feature_columns if col in df_sample.columns]
    if len(existing_feature_columns) != len(feature_columns):
        print(f"Warning: Some feature columns not found. Using: {existing_feature_columns}")
    feature_columns = existing_feature_columns

    if not feature_columns:
        print("Error: No feature columns selected or available. Stopping.")
    else:
        X = df_sample[feature_columns].copy()
        y = df_sample['relevance'].copy()
        groups_for_splitting = df_sample['srch_id'] # Used by GroupKFold for splitting

        # Basic Imputation (should ideally be done based on EDA insights and training set stats)
        # This imputation is done on the *sampled* data (X).
        # For test set imputation later, medians from this X will be used.
        print("Performing basic median imputation for numerical features in X...")
        for col in X.columns:
            if X[col].isnull().any():
                if pd.api.types.is_numeric_dtype(X[col]):
                    median_val = X[col].median()
                    X[col].fillna(median_val, inplace=True)
                    # print(f"Imputed NaNs in '{col}' with median: {median_val}")
                # Add mode imputation for categorical if any, or use a placeholder string
        
        print(f"Selected {len(feature_columns)} features: {feature_columns}")
        print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
        print(f"NaNs remaining in X after imputation: {X.isnull().sum().sum()}")
        if groups_for_splitting is not None:
            print(f"Number of unique groups for splitting: {groups_for_splitting.nunique()}")
else:
    print("Skipping feature selection as df_sample is None.")

# Display X's head to verify
if X is not None:
    display(X.head())


Defining initial feature set and preparing X, y, groups...
Performing basic median imputation for numerical features in X...
Selected 11 features: ['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'orig_destination_distance']
Shape of X: (496228, 11), Shape of y: (496228,)
NaNs remaining in X after imputation: 0
Number of unique groups for splitting: 19979


Unnamed: 0,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,orig_destination_distance
119,219,158,3,3.5,0,0.0,0.0476,4.46,43.0,1,9270.55
120,219,158,3,3.0,0,3.14,0.2993,4.46,44.0,0,9285.88
121,219,158,4,0.0,0,1.1,0.127639,4.41,51.0,0,9288.15
122,219,158,4,3.5,0,2.64,0.127639,4.22,53.0,0,9278.6
123,219,158,5,4.5,0,3.22,0.1909,5.66,221.0,0,9286.02


In [5]:
# --- 5. Cross-Validation (using modular function) ---
import warnings
warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*')
mean_cv_ndcg = 0
std_cv_ndcg = 0
cv_feature_importances = pd.Series(dtype=float) # Initialize as an empty Series

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None:
    # Basic LGBM params for initial CV
    # These will be merged with/override defaults in the perform_cross_validation function
    initial_lgbm_params = {
        'n_estimators': 100, # Example: function's default might be different
        'learning_rate': 0.1, # Example
        'random_state': RANDOM_STATE
        # The modular function defines other necessary defaults like objective, metric, label_gain, eval_at etc.
    }
    
    print(f"\n--- Performing {N_FOLDS_CV}-Fold Cross-Validation using modular function ---")
    mean_cv_ndcg, std_cv_ndcg, cv_feature_importances = lgbm_model.perform_cross_validation(
        X, y, 
        groups_for_splitting=groups_for_splitting, # This is df_sample['srch_id']
        df_full_for_group_counts=df_sample, # Pass df_sample, as it contains 'srch_id' needed for group counts
        n_folds=N_FOLDS_CV,
        lgbm_params=initial_lgbm_params
    )
    
    print(f"\nCross-Validation Mean NDCG@5: {mean_cv_ndcg:.4f} +/- {std_cv_ndcg:.4f}")
    if not cv_feature_importances.empty:
        print("\nAverage Feature Importances from CV:")
        with pd.option_context('display.max_rows', 30): # Display top 20 or all if less than 20
            display(cv_feature_importances.head(20))
else:
    print("\nSkipping Cross-Validation due to missing X, y, groups_for_splitting, or df_sample.")


--- Performing 5-Fold Cross-Validation using modular function ---
\n--- Performing 5-Fold Cross-Validation ---
--- Fold 1/5 ---
Fold 1 NDCG@5: 0.3428
--- Fold 2/5 ---
Fold 2 NDCG@5: 0.3423
--- Fold 3/5 ---
Fold 3 NDCG@5: 0.3455
--- Fold 4/5 ---
Fold 4 NDCG@5: 0.3530
--- Fold 5/5 ---
Fold 5 NDCG@5: 0.3536
Mean NDCG@5 across 5 folds: 0.3475 +/- 0.0049

Cross-Validation Mean NDCG@5: 0.3475 +/- 0.0049

Average Feature Importances from CV:


feature
prop_location_score2           15831.198095
price_usd                      11461.798124
prop_location_score1            7153.288665
prop_starrating                 5405.245977
prop_log_historical_price       5379.496554
prop_review_score               2834.064519
promotion_flag                  2247.297013
orig_destination_distance       1730.336422
prop_country_id                 1018.822620
visitor_location_country_id      507.976540
prop_brand_bool                  428.788319
Name: importance, dtype: float64

In [6]:
# --- 6. Hyperparameter Tuning with Optuna (using modular function) ---
best_params_from_tuning = {} # Initialize

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None:
    print(f"\n--- Tuning Hyperparameters with Optuna ({N_OPTUNA_TRIALS} trials, {N_FOLDS_TUNING} CV folds each) using modular function ---")
    
    # Suppress Optuna's verbosity if it's too much, and LightGBM warnings during tuning.
    # import optuna # Optuna is imported within lgbm_model.py where tune_hyperparameters_optuna is defined.
    # optuna.logging.set_verbosity(optuna.logging.WARNING) # You can set this in lgbm_model.py if desired globally for the function
    
    # It's good practice to manage warnings that might clutter the output during tuning.
    # The lgbm_model.py file could also handle these internally if preferred.
    warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*') # Suppress LightGBM's specific warning
    warnings.filterwarnings('ignore', message='Overriding the init_model argument.*') # Another potential LightGBM warning

    best_params_from_tuning = lgbm_model.tune_hyperparameters_optuna(
        X, 
        y, 
        groups_for_splitting=groups_for_splitting, # This is df_sample['srch_id']
        df_full_for_group_counts=df_sample, # df_sample for calculating group sizes within folds
        n_trials=N_OPTUNA_TRIALS, 
        n_cv_folds=N_FOLDS_TUNING
    )
    
    if best_params_from_tuning:
        print("\nBest parameters found by Optuna:")
        for key, value in best_params_from_tuning.items():
            print(f"    {key}: {value}")
    else:
        print("\nOptuna tuning did not return parameters. Using default parameters for the final model evaluation.")
        # Fallback to some sensible defaults if tuning fails or is skipped
        best_params_from_tuning = { 
            'n_estimators': 200, 'learning_rate': 0.05, 'num_leaves': 31, 
            'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.8,
            'colsample_bytree':0.8, 'reg_alpha':0.1, 'reg_lambda':0.1
            # Add other necessary LGBM parameters if not covered by the module's defaults
        } 
else:
    print("\nSkipping Hyperparameter Tuning due to missing X, y, groups_for_splitting, or df_sample.")
    # Fallback parameters if tuning is skipped
    best_params_from_tuning = { 
        'n_estimators': 200, 'learning_rate': 0.05, 'num_leaves': 31, 
        'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.8,
        'colsample_bytree':0.8, 'reg_alpha':0.1, 'reg_lambda':0.1
    }

# Reset warnings to default behavior if they were changed
warnings.resetwarnings()

[I 2025-05-14 15:11:15,776] A new study created in memory with name: lgbm_ranker_tuning



--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) using modular function ---
\n--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) ---


[I 2025-05-14 15:11:19,215] Trial 0 finished with value: 0.3483959691013827 and parameters: {'n_estimators': 350, 'learning_rate': 0.10148640916203272, 'num_leaves': 26, 'max_depth': 12, 'min_child_samples': 6, 'subsample': 0.5604481619303492, 'colsample_bytree': 0.7128952769140711, 'reg_alpha': 5.632050975812048, 'reg_lambda': 0.01393055140628837}. Best is trial 0 with value: 0.3483959691013827.
[I 2025-05-14 15:11:22,750] Trial 1 finished with value: 0.3487982556281389 and parameters: {'n_estimators': 700, 'learning_rate': 0.07738743799224009, 'num_leaves': 50, 'max_depth': 8, 'min_child_samples': 30, 'subsample': 0.6195409659098858, 'colsample_bytree': 0.9036249138323806, 'reg_alpha': 7.446041659683735, 'reg_lambda': 6.2495614061886355}. Best is trial 1 with value: 0.3487982556281389.
[I 2025-05-14 15:11:24,863] Trial 2 finished with value: 0.35174829820481407 and parameters: {'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samp

Optuna study finished. Best trial NDCG@5: 0.3517
Best parameters: {'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 58, 'subsample': 0.994009316145284, 'colsample_bytree': 0.7229225094690845, 'reg_alpha': 0.002544358343653164, 'reg_lambda': 0.1930789012781852}

Best parameters found by Optuna:
    n_estimators: 400
    learning_rate: 0.1734440198649952
    num_leaves: 36
    max_depth: 4
    min_child_samples: 58
    subsample: 0.994009316145284
    colsample_bytree: 0.7229225094690845
    reg_alpha: 0.002544358343653164
    reg_lambda: 0.1930789012781852


In [7]:
# --- 7. Train Final Model on Full Sampled Data (using modular function) ---
final_trained_model = None

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None and best_params_from_tuning:
    print("\\n--- Training Final Model with Best Tuned Parameters ---")
    
    # groups_train_full is needed for the lgbm_model.train_final_model function
    # It should represent the group sizes for the entire X, y that's being passed
    # This X is df_sample[feature_columns]
    groups_train_full = df_sample.groupby('srch_id').size().to_numpy()

    if len(groups_train_full) > 0:
        final_trained_model = lgbm_model.train_final_model(
            X_train_full=X,  # This is the full X from df_sample
            y_train_full=y,  # This is the full y from df_sample
            groups_train_full=groups_train_full,
            df_full_for_group_counts=df_sample, # df_sample contains 'srch_id' for early stopping split
            best_params=best_params_from_tuning
        )
        if final_trained_model:
            print("Final model successfully trained.")
        else:
            print("Final model training failed or returned None.")
    else:
        print("Cannot train final model: No groups found in the training data.")
else:
    print("\\nSkipping final model training due to missing data, groups, or best_params_from_tuning.")


\n--- Training Final Model with Best Tuned Parameters ---
\n--- Training Final Model ---
Final model parameters for training:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'verbosity': -1, 'n_estimators': 400, 'learning_rate': 0.1734440198649952, 'num_leaves': 36, 'max_depth': 4, 'min_child_samples': 58, 'subsample': 0.994009316145284, 'colsample_bytree': 0.7229225094690845, 'reg_alpha': 0.002544358343653164, 'reg_lambda': 0.1930789012781852}


  return bound(*args, **kwds)


Fitting final model with early stopping on 90/10 split of training data.
Training until validation scores don't improve for 10 rounds




Early stopping, best iteration is:
[58]	valid_0's ndcg@5: 0.363617
Final model training completed.
Final model successfully trained.


In [9]:
# --- 8. Prepare Test Data and Generate Kaggle Submission (using modular function) ---

if final_trained_model is not None and X is not None and 'feature_columns' in locals() and feature_columns:
    print("\\n--- Preparing Test Data and Generating Kaggle Submission ---")

    # --- 8a. Load Test Data ---
    print(f"Loading test data from: {TEST_FILE}...")
    try:
        df_test_raw = pd.read_csv(TEST_FILE)
        print(f"Loaded test dataset with shape: {df_test_raw.shape}")
    except FileNotFoundError:
        print(f"ERROR: Test file not found at {TEST_FILE}")
        df_test_raw = None
    except Exception as e:
        print(f"Error loading test data: {e}")
        df_test_raw = None

    if df_test_raw is not None:
        # --- 8b. Preprocess Test Data ---
        print("\\nPreprocessing test data...")
        
        # Ensure all selected feature columns exist in df_test_raw
        # and handle any missing columns gracefully if necessary (e.g. by creating them with NaNs)
        X_test_list = []
        for col in feature_columns:
            if col not in df_test_raw.columns:
                print(f"Warning: Feature column '{col}' not found in test data. Creating it with NaNs.")
                df_test_raw[col] = np.nan 
        
        X_test = df_test_raw[feature_columns].copy()

        # Impute missing values in X_test using medians from the TRAINING sample (X)
        # X should be the dataframe of features used for training the final_trained_model
        print("Imputing missing values in test data using training set medians...")
        nan_counts_before_imputation = X_test.isnull().sum()

        for col in X_test.columns:
            if X_test[col].isnull().any():
                if pd.api.types.is_numeric_dtype(X_test[col]):
                    if col in X.columns: # Ensure the column exists in the training features X
                        train_median = X[col].median() # Calculate median from the TRAIN features (X)
                        X_test[col].fillna(train_median, inplace=True)
                        # print(f"Imputed NaNs in test column '{col}' with training median: {train_median}")
                    else:
                        print(f"Warning: Column '{col}' for median imputation not found in training X. Test NaNs may remain.")
                # else: # For categorical, use mode from training X
                    # if col in X.columns:
                    #     train_mode = X[col].mode()[0]
                    #     X_test[col].fillna(train_mode, inplace=True)
                    # else:
                    #     print(f"Warning: Column '{col}' for mode imputation not found in training X. Test NaNs may remain.")
        
        nan_counts_after_imputation = X_test.isnull().sum().sum()
        print(f"NaNs remaining in X_test after imputation: {nan_counts_after_imputation}")
        if nan_counts_after_imputation > 0:
            print("Warning: Some NaNs remain in test features after imputation. Review missing columns or imputation logic.")
            print(X_test.isnull().sum()[X_test.isnull().sum() > 0])


        # --- 8c. Generate Submission File ---
        # The df_test_raw contains 'srch_id' and 'prop_id' needed by the submission function
        lgbm_model.predict_and_format_submission(
            model=final_trained_model,
            X_test=X_test,
            df_test_original_ids=df_test_raw, # Pass the raw test df for srch_id and prop_id
            submission_filename=SUBMISSION_FILENAME
        )
    else:
        print("Skipping submission generation as test data could not be loaded.")
else:
    print("\\nSkipping Kaggle submission: final_trained_model, X, or feature_columns not available.")


\n--- Preparing Test Data and Generating Kaggle Submission ---
Loading test data from: ../data.nosync/test.csv...
Loaded test dataset with shape: (4959183, 50)
\nPreprocessing test data...
Imputing missing values in test data using training set medians...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(train_median, inplace=True)


NaNs remaining in X_test after imputation: 0
\n--- Predicting on Test Data and Formatting Submission ---
Submission file 'submission_modular.csv' created. Top 5 rows:
   SearchId  PropertyId
0         1       54937
1         1       99484
2         1       61934
3         1       24194
4         1       28181


In [16]:
'''

# --- 4. Initial Feature Selection (Placeholder) ---
# This will be refined by your friend. For now, using a subset of potentially useful features.
if 'df_sample' in locals() and df_sample is not None:
    print("\nDefining initial feature set...")
    # Features identified as potentially important or commonly used, excluding IDs and target-leaking columns
    # Also excluding competitor columns with high missing rates for now, and user history due to high missingness
    feature_columns = [
        'visitor_location_country_id', 'prop_country_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
        'price_usd', 'promotion_flag', 'orig_destination_distance'
        # 'srch_query_affinity_score' # high missingness
        # Add more features here as EDA suggests and after handling missing values
    ]

    # For ranking, we need features (X), relevance (y), and group/query_id
    X = df_sample[feature_columns]
    y = df_sample['relevance']
    groups = df_sample.groupby('srch_id').size().to_numpy() # Size of each group

    print(f"Selected {len(feature_columns)} features.")
    print("Feature columns:", feature_columns)
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape}")
    print(f"Number of groups: {len(groups)}, Min group size: {groups.min()}, Max group size: {groups.max()}")
    '''
    # Handle Missing Values (Simple Imputation for now)
    # For a proper model, more sophisticated imputation or feature engineering for missingness is needed.
    print("\nHandling missing values (simple median imputation for numerical)...")
    for col in X.columns:
        if X[col].isnull().any():
            if pd.api.types.is_numeric_dtype(X[col]):
                X[col] = X[col].fillna(X[col].median())
                print(f"Imputed missing values in {col} with median.")
            # else: # For categorical, fill with mode or a specific placeholder
            #     X[col] = X[col].fillna(X[col].mode()[0])

    # Check if any NaNs remain (should ideally be none for numeric after this)
    '''
else:
    print("Skipping feature selection due to data sampling issues.")
print("NaNs remaining in X", X.isnull().sum().sum())
'''



Defining initial feature set...
Selected 11 features.
Feature columns: ['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'orig_destination_distance']
Shape of X: (496212, 11)
Shape of y: (496212,)
Number of groups: 19979, Min group size: 5, Max group size: 38
NaNs remaining in X 0


In [17]:
# --- 5. Model Training (LGBMRanker) ---
# This section will contain the training loop using GroupKFold

if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None: # Use df_sample for srch_id
    print("\n--- 5. Cross-Validation with GroupKFold ---")

    gkf = GroupKFold(n_splits=N_FOLDS)
    
    fold_ndcg_scores = []
    all_feature_importances = pd.DataFrame()

    # The groups parameter for gkf.split should be the srch_id for each row in X
    # It ensures that rows with the same srch_id are not split across train/test in a fold.
    unique_group_ids_for_splitting = df_sample['srch_id']

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=unique_group_ids_for_splitting)):
        print(f"\n--- Fold {fold+1}/{N_FOLDS} ---")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Group counts for the current training and validation fold
        # Need to use the original df_sample with srch_id to correctly form groups for the subsets
        train_groups = df_sample.iloc[train_idx].groupby('srch_id').size().to_numpy()
        val_groups = df_sample.iloc[val_idx].groupby('srch_id').size().to_numpy()
        
        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, Num train groups: {len(train_groups)}")
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}, Num val groups: {len(val_groups)}")
        
        if len(train_groups) == 0 or len(val_groups) == 0 or X_train.empty or X_val.empty:
            print("Skipping fold due to empty train or validation groups/data.")
            continue

        ranker_cv = lgb.LGBMRanker(
            objective='lambdarank',
            metric='ndcg',
            label_gain=[0, 1, 5], # Corresponds to relevance 0, 1, 5
            n_estimators=100, 
            learning_rate=0.1,
            importance_type='gain',
            random_state=RANDOM_STATE + fold, 
            n_jobs=-1,
        )

        print(f"Training LGBMRanker for fold {fold+1}...")
        ranker_cv.fit(
            X_train,
            y_train,
            group=train_groups,
            eval_set=[(X_val, y_val)],
            eval_group=[val_groups],
            eval_metric='ndcg', 
            callbacks=[lgb.early_stopping(10, verbose=1)]
        )
        
        if ranker_cv.evals_result_ and 'valid_0' in ranker_cv.evals_result_ and 'ndcg@5' in ranker_cv.evals_result_['valid_0']:
            ndcg_at_5 = ranker_cv.evals_result_['valid_0']['ndcg@5'][-1] 
            fold_ndcg_scores.append(ndcg_at_5)
            print(f"Fold {fold+1} NDCG@5: {ndcg_at_5:.4f}")

            fold_importances = pd.DataFrame({
                'feature': X_train.columns,
                'importance': ranker_cv.feature_importances_,
                'fold': fold + 1
            })
            all_feature_importances = pd.concat([all_feature_importances, fold_importances], ignore_index=True)
        else:
            print(f"Could not retrieve NDCG@5 for fold {fold+1}. Skipping score for this fold.")


    if fold_ndcg_scores:
        print(f"\nMean NDCG@5 across {len(fold_ndcg_scores)} successfully evaluated folds: {np.mean(fold_ndcg_scores):.4f} +/- {np.std(fold_ndcg_scores):.4f}")
        
        if not all_feature_importances.empty:
            mean_feature_importances = all_feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
            print("\nAverage Feature Importances (Cross-Validation):")
            with pd.option_context('display.max_rows', 30):
                display(mean_feature_importances.head(20))
    else:
        print("No folds were successfully processed with NDCG scores.")

else:
    print("\nSkipping model training and cross-validation due to earlier data processing issues.")




--- 5. Cross-Validation with GroupKFold ---

--- Fold 1/5 ---
X_train shape: (396968, 11), y_train shape: (396968,), Num train groups: 15983
X_val shape: (99244, 11), y_val shape: (99244,), Num val groups: 3996
Training LGBMRanker for fold 1...
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[10]	valid_0's ndcg@1: 0.175776	valid_0's ndcg@2: 0.243204	valid_0's ndcg@3: 0.286728	valid_0's ndcg@4: 0.322332	valid_0's ndcg@5: 0.347355
Fold 1 NDCG@5: 0.3465

--- Fold 2/5 ---
X_train shape: (396973, 11), y_train shape: (396973,), Num train groups: 15984
X_val shape: (99239, 11), y_val shape: (99239,), Num val groups: 3995
Training LGBMRanker for fold 2...
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[13]	valid_0's ndcg@1: 0.181477	valid_0's ndcg@2: 0.249574	valid_0's ndcg@3: 0.294896	valid_0's ndcg@4: 0.329653	valid_0's ndcg@5: 0.354915
Fold 2 NDCG@5: 0.3558

--- Fold 3/5 ---
X_train shape: (396

feature
prop_location_score2           14327.886284
price_usd                       9679.488317
prop_location_score1            5469.095838
prop_starrating                 4524.345251
prop_log_historical_price       3634.961211
promotion_flag                  2230.896334
prop_review_score               2186.490114
orig_destination_distance       1154.970879
prop_country_id                  679.057501
visitor_location_country_id      240.637339
prop_brand_bool                  180.776899
Name: importance, dtype: float64

In [18]:

# --- 6. Hyperparameter Tuning with Optuna ---
if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None:
    print("\n--- 6. Hyperparameter Tuning with Optuna ---")
    import optuna
    # optuna.logging.set_verbosity(optuna.logging.WARNING) # Reduce Optuna's verbosity if needed

    # Define the objective function for Optuna
    def objective(trial):
        # Define search space for hyperparameters
        params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'label_gain': [0, 1, 5], # If using remapped (0,1,2) relevance
            'eval_at': [5],
            'random_state': RANDOM_STATE,
            'n_jobs': -1, # Use all available cores for LGBM training
            'verbosity': -1, # Suppress LightGBM's own verbosity during tuning trials
            'boosting_type': 'gbdt', # Default, but can be tuned

            'n_estimators': trial.suggest_int('n_estimators', 100, 700, step=50),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),    # L1 regularization
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True), # L2 regularization
        }

        gkf_for_optuna = GroupKFold(n_splits=3) # Use 3 folds for faster tuning trials
        fold_ndcg_scores = []
        
        # Ensure unique_group_ids_for_splitting is available in this scope
        # It should be df_sample['srch_id']
        current_groups_for_splitting = df_sample['srch_id']


        for fold, (train_idx, val_idx) in enumerate(gkf_for_optuna.split(X, y, groups=current_groups_for_splitting)):
            X_train_trial, X_val_trial = X.iloc[train_idx], X.iloc[val_idx]
            y_train_trial, y_val_trial = y.iloc[train_idx], y.iloc[val_idx]

            train_groups_trial = df_sample.iloc[train_idx].groupby('srch_id').size().to_numpy()
            val_groups_trial = df_sample.iloc[val_idx].groupby('srch_id').size().to_numpy()
            
            if len(train_groups_trial) == 0 or len(val_groups_trial) == 0 or X_train_trial.empty or X_val_trial.empty:
                # print(f"Skipping fold {fold+1} in trial {trial.number} due to empty groups/data.")
                # Return a very low score if a fold fails, to penalize these hyperparams
                return -1.0 # Or handle as per Optuna's pruning/failure guidelines

            model_trial = lgb.LGBMRanker(**params)
            
            model_trial.fit(
                X_train_trial, y_train_trial, group=train_groups_trial,
                eval_set=[(X_val_trial, y_val_trial)],
                eval_group=[val_groups_trial],
                eval_metric='ndcg', # LightGBM will use metric from params, this is for eval_set
                callbacks=[lgb.early_stopping(10, verbose=False)]
            )
            
            if model_trial.evals_result_ and 'valid_0' in model_trial.evals_result_ and 'ndcg@5' in model_trial.evals_result_['valid_0']:
                score = model_trial.evals_result_['valid_0']['ndcg@5'][-1]
                fold_ndcg_scores.append(score)
            else:
                # print(f"NDCG@5 not found for fold {fold+1} in trial {trial.number}.")
                fold_ndcg_scores.append(0.0) # Penalize if score not found

            # Optuna Pruning (optional, but good for long searches)
            # trial.report(score, fold)
            # if trial.should_prune():
            #     raise optuna.exceptions.TrialPruned()

        avg_ndcg = np.mean(fold_ndcg_scores) if fold_ndcg_scores else 0.0
        print(f"Trial {trial.number} finished with avg NDCG@5: {avg_ndcg:.4f} for params: {trial.params}")
        return avg_ndcg

    # Create a study object and optimize the objective function.
    # n_trials: number of hyperparameter combinations to test.
    # Adjust based on available time (e.g., 20-50 for a decent search, 100+ for more thorough).
    N_OPTUNA_TRIALS = 20 # Start with a smaller number for testing
    # Suppress the eval_at warning from LightGBM
    import warnings
    warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*')
    study = optuna.create_study(direction='maximize', study_name='lgbm_ranker_optimization')
    
    print(f"Starting Optuna hyperparameter tuning with {N_OPTUNA_TRIALS} trials...")
    try:
        study.optimize(objective, n_trials=N_OPTUNA_TRIALS, timeout=None) # timeout in seconds if needed

        print("\nOptuna study statistics:")
        print(f"  Number of finished trials: {len(study.trials)}")
        
        best_trial = study.best_trial
        print(f"  Best trial value (NDCG@5): {best_trial.value:.4f}")
        print("  Best parameters found by Optuna:")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")
        
        best_params_from_tuning = best_trial.params

    except Exception as e:
        print(f"Error during Optuna optimization: {e}")
        print("Falling back to default parameters for the final model evaluation.")
        best_params_from_tuning = { # Default fallback
            'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31,
            'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
            'colsample_bytree': 1.0, 'reg_alpha': 0.0, 'reg_lambda': 0.0
        }
else:
    print("\nSkipping Optuna hyperparameter tuning due to earlier data processing issues.")
    best_params_from_tuning = { # Default fallback
        'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31,
        'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
        'colsample_bytree': 1.0, 'reg_alpha': 0.0, 'reg_lambda': 0.0
    }

# The rest of your notebook (Section 7: Detailed Evaluation, Section 8: Submission)
# should now use `best_params_from_tuning` obtained from Optuna.


[I 2025-05-14 14:35:15,778] A new study created in memory with name: lgbm_ranker_optimization



--- 6. Hyperparameter Tuning with Optuna ---
Starting Optuna hyperparameter tuning with 20 trials...


[I 2025-05-14 14:35:18,278] Trial 0 finished with value: 0.3504321633173449 and parameters: {'n_estimators': 450, 'learning_rate': 0.11408934177550577, 'num_leaves': 141, 'max_depth': 3, 'min_child_samples': 89, 'subsample': 0.742713365157357, 'colsample_bytree': 0.5320066224512843, 'reg_alpha': 0.04821567442910248, 'reg_lambda': 3.223570664091658}. Best is trial 0 with value: 0.3504321633173449.


Trial 0 finished with avg NDCG@5: 0.3504 for params: {'n_estimators': 450, 'learning_rate': 0.11408934177550577, 'num_leaves': 141, 'max_depth': 3, 'min_child_samples': 89, 'subsample': 0.742713365157357, 'colsample_bytree': 0.5320066224512843, 'reg_alpha': 0.04821567442910248, 'reg_lambda': 3.223570664091658}


[I 2025-05-14 14:35:21,937] Trial 1 finished with value: 0.3422898201628106 and parameters: {'n_estimators': 500, 'learning_rate': 0.023883932378738804, 'num_leaves': 94, 'max_depth': 9, 'min_child_samples': 45, 'subsample': 0.9700038221215604, 'colsample_bytree': 0.6083726787277264, 'reg_alpha': 0.5414737974882534, 'reg_lambda': 0.16431215973222515}. Best is trial 0 with value: 0.3504321633173449.


Trial 1 finished with avg NDCG@5: 0.3423 for params: {'n_estimators': 500, 'learning_rate': 0.023883932378738804, 'num_leaves': 94, 'max_depth': 9, 'min_child_samples': 45, 'subsample': 0.9700038221215604, 'colsample_bytree': 0.6083726787277264, 'reg_alpha': 0.5414737974882534, 'reg_lambda': 0.16431215973222515}


[I 2025-05-14 14:35:26,689] Trial 2 finished with value: 0.34334889630304505 and parameters: {'n_estimators': 650, 'learning_rate': 0.011460995266279054, 'num_leaves': 68, 'max_depth': 8, 'min_child_samples': 96, 'subsample': 0.9320762121405267, 'colsample_bytree': 0.835334689821756, 'reg_alpha': 1.2410789768716448, 'reg_lambda': 0.010488226621413828}. Best is trial 0 with value: 0.3504321633173449.


Trial 2 finished with avg NDCG@5: 0.3433 for params: {'n_estimators': 650, 'learning_rate': 0.011460995266279054, 'num_leaves': 68, 'max_depth': 8, 'min_child_samples': 96, 'subsample': 0.9320762121405267, 'colsample_bytree': 0.835334689821756, 'reg_alpha': 1.2410789768716448, 'reg_lambda': 0.010488226621413828}


[I 2025-05-14 14:35:29,385] Trial 3 finished with value: 0.34446429747725 and parameters: {'n_estimators': 400, 'learning_rate': 0.020422862992246896, 'num_leaves': 119, 'max_depth': 5, 'min_child_samples': 19, 'subsample': 0.9859988258949455, 'colsample_bytree': 0.7882728241038102, 'reg_alpha': 0.001773388068494822, 'reg_lambda': 1.3288807072298627}. Best is trial 0 with value: 0.3504321633173449.


Trial 3 finished with avg NDCG@5: 0.3445 for params: {'n_estimators': 400, 'learning_rate': 0.020422862992246896, 'num_leaves': 119, 'max_depth': 5, 'min_child_samples': 19, 'subsample': 0.9859988258949455, 'colsample_bytree': 0.7882728241038102, 'reg_alpha': 0.001773388068494822, 'reg_lambda': 1.3288807072298627}


[I 2025-05-14 14:35:36,913] Trial 4 finished with value: 0.34130853034132386 and parameters: {'n_estimators': 500, 'learning_rate': 0.05114946266836074, 'num_leaves': 119, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.7523177275144902, 'colsample_bytree': 0.6353110156582764, 'reg_alpha': 0.4966644373135883, 'reg_lambda': 0.0031925998125138548}. Best is trial 0 with value: 0.3504321633173449.


Trial 4 finished with avg NDCG@5: 0.3413 for params: {'n_estimators': 500, 'learning_rate': 0.05114946266836074, 'num_leaves': 119, 'max_depth': 12, 'min_child_samples': 62, 'subsample': 0.7523177275144902, 'colsample_bytree': 0.6353110156582764, 'reg_alpha': 0.4966644373135883, 'reg_lambda': 0.0031925998125138548}


[I 2025-05-14 14:35:40,844] Trial 5 finished with value: 0.34902853241388737 and parameters: {'n_estimators': 650, 'learning_rate': 0.1867730266650586, 'num_leaves': 37, 'max_depth': 5, 'min_child_samples': 6, 'subsample': 0.8952857035223396, 'colsample_bytree': 0.9394224688614666, 'reg_alpha': 1.8220558586250621, 'reg_lambda': 0.0033285803696310157}. Best is trial 0 with value: 0.3504321633173449.


Trial 5 finished with avg NDCG@5: 0.3490 for params: {'n_estimators': 650, 'learning_rate': 0.1867730266650586, 'num_leaves': 37, 'max_depth': 5, 'min_child_samples': 6, 'subsample': 0.8952857035223396, 'colsample_bytree': 0.9394224688614666, 'reg_alpha': 1.8220558586250621, 'reg_lambda': 0.0033285803696310157}


[I 2025-05-14 14:35:43,799] Trial 6 finished with value: 0.3472267476583742 and parameters: {'n_estimators': 400, 'learning_rate': 0.05771832260300673, 'num_leaves': 36, 'max_depth': 12, 'min_child_samples': 40, 'subsample': 0.8544706074506001, 'colsample_bytree': 0.7686597943736168, 'reg_alpha': 3.6007680684135273, 'reg_lambda': 0.0038557128045937443}. Best is trial 0 with value: 0.3504321633173449.


Trial 6 finished with avg NDCG@5: 0.3472 for params: {'n_estimators': 400, 'learning_rate': 0.05771832260300673, 'num_leaves': 36, 'max_depth': 12, 'min_child_samples': 40, 'subsample': 0.8544706074506001, 'colsample_bytree': 0.7686597943736168, 'reg_alpha': 3.6007680684135273, 'reg_lambda': 0.0038557128045937443}


[I 2025-05-14 14:35:47,831] Trial 7 finished with value: 0.34637040381962886 and parameters: {'n_estimators': 150, 'learning_rate': 0.02898696540038131, 'num_leaves': 58, 'max_depth': 4, 'min_child_samples': 9, 'subsample': 0.5273621471489346, 'colsample_bytree': 0.7953613676662146, 'reg_alpha': 0.02919579194727492, 'reg_lambda': 0.002407489331568478}. Best is trial 0 with value: 0.3504321633173449.


Trial 7 finished with avg NDCG@5: 0.3464 for params: {'n_estimators': 150, 'learning_rate': 0.02898696540038131, 'num_leaves': 58, 'max_depth': 4, 'min_child_samples': 9, 'subsample': 0.5273621471489346, 'colsample_bytree': 0.7953613676662146, 'reg_alpha': 0.02919579194727492, 'reg_lambda': 0.002407489331568478}


[I 2025-05-14 14:35:54,040] Trial 8 finished with value: 0.34649392407993274 and parameters: {'n_estimators': 100, 'learning_rate': 0.09138350288839198, 'num_leaves': 59, 'max_depth': 11, 'min_child_samples': 16, 'subsample': 0.9271397411020879, 'colsample_bytree': 0.8985484051015569, 'reg_alpha': 7.915355296001399, 'reg_lambda': 0.1134434971599035}. Best is trial 0 with value: 0.3504321633173449.


Trial 8 finished with avg NDCG@5: 0.3465 for params: {'n_estimators': 100, 'learning_rate': 0.09138350288839198, 'num_leaves': 59, 'max_depth': 11, 'min_child_samples': 16, 'subsample': 0.9271397411020879, 'colsample_bytree': 0.8985484051015569, 'reg_alpha': 7.915355296001399, 'reg_lambda': 0.1134434971599035}


[I 2025-05-14 14:35:58,497] Trial 9 finished with value: 0.3396182190155428 and parameters: {'n_estimators': 350, 'learning_rate': 0.014277687851517654, 'num_leaves': 106, 'max_depth': 9, 'min_child_samples': 73, 'subsample': 0.7253424780133684, 'colsample_bytree': 0.8242562128537649, 'reg_alpha': 0.04690920987986248, 'reg_lambda': 0.0072856035793310995}. Best is trial 0 with value: 0.3504321633173449.


Trial 9 finished with avg NDCG@5: 0.3396 for params: {'n_estimators': 350, 'learning_rate': 0.014277687851517654, 'num_leaves': 106, 'max_depth': 9, 'min_child_samples': 73, 'subsample': 0.7253424780133684, 'colsample_bytree': 0.8242562128537649, 'reg_alpha': 0.04690920987986248, 'reg_lambda': 0.0072856035793310995}


[I 2025-05-14 14:36:01,760] Trial 10 finished with value: 0.35097621989821665 and parameters: {'n_estimators': 250, 'learning_rate': 0.19678085320205846, 'num_leaves': 144, 'max_depth': 3, 'min_child_samples': 96, 'subsample': 0.6005514711808235, 'colsample_bytree': 0.5002926412539057, 'reg_alpha': 0.003205944770438764, 'reg_lambda': 5.229994576420763}. Best is trial 10 with value: 0.35097621989821665.


Trial 10 finished with avg NDCG@5: 0.3510 for params: {'n_estimators': 250, 'learning_rate': 0.19678085320205846, 'num_leaves': 144, 'max_depth': 3, 'min_child_samples': 96, 'subsample': 0.6005514711808235, 'colsample_bytree': 0.5002926412539057, 'reg_alpha': 0.003205944770438764, 'reg_lambda': 5.229994576420763}


[I 2025-05-14 14:36:03,650] Trial 11 finished with value: 0.3500807466476193 and parameters: {'n_estimators': 250, 'learning_rate': 0.18936480103618095, 'num_leaves': 150, 'max_depth': 3, 'min_child_samples': 99, 'subsample': 0.5676878704016968, 'colsample_bytree': 0.5039312049704797, 'reg_alpha': 0.004342778532384315, 'reg_lambda': 8.045625028445603}. Best is trial 10 with value: 0.35097621989821665.


Trial 11 finished with avg NDCG@5: 0.3501 for params: {'n_estimators': 250, 'learning_rate': 0.18936480103618095, 'num_leaves': 150, 'max_depth': 3, 'min_child_samples': 99, 'subsample': 0.5676878704016968, 'colsample_bytree': 0.5039312049704797, 'reg_alpha': 0.004342778532384315, 'reg_lambda': 8.045625028445603}


[I 2025-05-14 14:36:05,868] Trial 12 finished with value: 0.34974115980416576 and parameters: {'n_estimators': 250, 'learning_rate': 0.10097812296423751, 'num_leaves': 149, 'max_depth': 3, 'min_child_samples': 83, 'subsample': 0.6279780895866396, 'colsample_bytree': 0.5079338671091401, 'reg_alpha': 0.012226775152515613, 'reg_lambda': 7.4086301913364485}. Best is trial 10 with value: 0.35097621989821665.


Trial 12 finished with avg NDCG@5: 0.3497 for params: {'n_estimators': 250, 'learning_rate': 0.10097812296423751, 'num_leaves': 149, 'max_depth': 3, 'min_child_samples': 83, 'subsample': 0.6279780895866396, 'colsample_bytree': 0.5079338671091401, 'reg_alpha': 0.012226775152515613, 'reg_lambda': 7.4086301913364485}


[I 2025-05-14 14:36:12,124] Trial 13 finished with value: 0.34525103317223094 and parameters: {'n_estimators': 500, 'learning_rate': 0.1240600518337226, 'num_leaves': 131, 'max_depth': 6, 'min_child_samples': 83, 'subsample': 0.6861765626698988, 'colsample_bytree': 0.6189955340483645, 'reg_alpha': 0.1607115771839014, 'reg_lambda': 1.133130334323549}. Best is trial 10 with value: 0.35097621989821665.


Trial 13 finished with avg NDCG@5: 0.3453 for params: {'n_estimators': 500, 'learning_rate': 0.1240600518337226, 'num_leaves': 131, 'max_depth': 6, 'min_child_samples': 83, 'subsample': 0.6861765626698988, 'colsample_bytree': 0.6189955340483645, 'reg_alpha': 0.1607115771839014, 'reg_lambda': 1.133130334323549}


[I 2025-05-14 14:36:16,504] Trial 14 finished with value: 0.34284177868202637 and parameters: {'n_estimators': 300, 'learning_rate': 0.13326234853568378, 'num_leaves': 133, 'max_depth': 6, 'min_child_samples': 85, 'subsample': 0.80994357822961, 'colsample_bytree': 0.5688778167925024, 'reg_alpha': 0.0012361045529986699, 'reg_lambda': 1.0114461760154043}. Best is trial 10 with value: 0.35097621989821665.


Trial 14 finished with avg NDCG@5: 0.3428 for params: {'n_estimators': 300, 'learning_rate': 0.13326234853568378, 'num_leaves': 133, 'max_depth': 6, 'min_child_samples': 85, 'subsample': 0.80994357822961, 'colsample_bytree': 0.5688778167925024, 'reg_alpha': 0.0012361045529986699, 'reg_lambda': 1.0114461760154043}


[I 2025-05-14 14:36:19,034] Trial 15 finished with value: 0.3471746349948053 and parameters: {'n_estimators': 200, 'learning_rate': 0.07274464976639444, 'num_leaves': 86, 'max_depth': 3, 'min_child_samples': 63, 'subsample': 0.6204563314152839, 'colsample_bytree': 0.6754314569540577, 'reg_alpha': 0.007994509655092024, 'reg_lambda': 3.138200250800137}. Best is trial 10 with value: 0.35097621989821665.


Trial 15 finished with avg NDCG@5: 0.3472 for params: {'n_estimators': 200, 'learning_rate': 0.07274464976639444, 'num_leaves': 86, 'max_depth': 3, 'min_child_samples': 63, 'subsample': 0.6204563314152839, 'colsample_bytree': 0.6754314569540577, 'reg_alpha': 0.007994509655092024, 'reg_lambda': 3.138200250800137}


[I 2025-05-14 14:36:21,688] Trial 16 finished with value: 0.34688387396004594 and parameters: {'n_estimators': 550, 'learning_rate': 0.035278452493391226, 'num_leaves': 134, 'max_depth': 5, 'min_child_samples': 100, 'subsample': 0.6652624381620467, 'colsample_bytree': 0.7060326985391507, 'reg_alpha': 0.09736880164662497, 'reg_lambda': 0.21259500328747996}. Best is trial 10 with value: 0.35097621989821665.


Trial 16 finished with avg NDCG@5: 0.3469 for params: {'n_estimators': 550, 'learning_rate': 0.035278452493391226, 'num_leaves': 134, 'max_depth': 5, 'min_child_samples': 100, 'subsample': 0.6652624381620467, 'colsample_bytree': 0.7060326985391507, 'reg_alpha': 0.09736880164662497, 'reg_lambda': 0.21259500328747996}


[I 2025-05-14 14:36:25,949] Trial 17 finished with value: 0.3369179974978604 and parameters: {'n_estimators': 300, 'learning_rate': 0.14478424379017577, 'num_leaves': 106, 'max_depth': 7, 'min_child_samples': 74, 'subsample': 0.7648492368159003, 'colsample_bytree': 0.5464300555743251, 'reg_alpha': 0.017800257736990985, 'reg_lambda': 0.03496888757423233}. Best is trial 10 with value: 0.35097621989821665.


Trial 17 finished with avg NDCG@5: 0.3369 for params: {'n_estimators': 300, 'learning_rate': 0.14478424379017577, 'num_leaves': 106, 'max_depth': 7, 'min_child_samples': 74, 'subsample': 0.7648492368159003, 'colsample_bytree': 0.5464300555743251, 'reg_alpha': 0.017800257736990985, 'reg_lambda': 0.03496888757423233}


[I 2025-05-14 14:36:29,667] Trial 18 finished with value: 0.35061364106352766 and parameters: {'n_estimators': 400, 'learning_rate': 0.09453914199038851, 'num_leaves': 120, 'max_depth': 4, 'min_child_samples': 33, 'subsample': 0.5236709081241935, 'colsample_bytree': 0.5620903563546057, 'reg_alpha': 0.0035082659900084233, 'reg_lambda': 0.4218739754022182}. Best is trial 10 with value: 0.35097621989821665.


Trial 18 finished with avg NDCG@5: 0.3506 for params: {'n_estimators': 400, 'learning_rate': 0.09453914199038851, 'num_leaves': 120, 'max_depth': 4, 'min_child_samples': 33, 'subsample': 0.5236709081241935, 'colsample_bytree': 0.5620903563546057, 'reg_alpha': 0.0035082659900084233, 'reg_lambda': 0.4218739754022182}


[I 2025-05-14 14:36:32,484] Trial 19 finished with value: 0.3488388570225307 and parameters: {'n_estimators': 350, 'learning_rate': 0.07442784086487549, 'num_leaves': 117, 'max_depth': 4, 'min_child_samples': 32, 'subsample': 0.5153028502978984, 'colsample_bytree': 0.7090245334690404, 'reg_alpha': 0.003506378672987832, 'reg_lambda': 0.44821600478665585}. Best is trial 10 with value: 0.35097621989821665.


Trial 19 finished with avg NDCG@5: 0.3488 for params: {'n_estimators': 350, 'learning_rate': 0.07442784086487549, 'num_leaves': 117, 'max_depth': 4, 'min_child_samples': 32, 'subsample': 0.5153028502978984, 'colsample_bytree': 0.7090245334690404, 'reg_alpha': 0.003506378672987832, 'reg_lambda': 0.44821600478665585}

Optuna study statistics:
  Number of finished trials: 20
  Best trial value (NDCG@5): 0.3510
  Best parameters found by Optuna:
    n_estimators: 250
    learning_rate: 0.19678085320205846
    num_leaves: 144
    max_depth: 3
    min_child_samples: 96
    subsample: 0.6005514711808235
    colsample_bytree: 0.5002926412539057
    reg_alpha: 0.003205944770438764
    reg_lambda: 5.229994576420763


In [19]:

# --- 7. Detailed Evaluation of Best Tuned Model ---
if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None and best_params_from_tuning:
    print("\n--- 7. Detailed Evaluation of Best Tuned Model (using GroupKFold) ---")

    final_gkf = GroupKFold(n_splits=N_FOLDS)
    final_fold_ndcg_scores = []
    final_all_feature_importances = pd.DataFrame()
    
    # Ensure unique_group_ids_for_splitting is available
    if 'unique_group_ids_for_splitting' not in locals():
        unique_group_ids_for_splitting = df_sample['srch_id']


    final_ranker_params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'label_gain': [0, 1, 5],
        'eval_at': [5],
        'importance_type': 'gain',
        'random_state': RANDOM_STATE,
        'n_jobs': -1,
    }
    # Update with tuned parameters, ensuring they are valid
    for key, value in best_params_from_tuning.items():
        final_ranker_params[key] = value
    
    # Ensure n_estimators is present if not tuned or set to a low value by tuning
    if 'n_estimators' not in final_ranker_params or final_ranker_params['n_estimators'] < 50:
         final_ranker_params['n_estimators'] = 300 # Default if not well-tuned by a short search

    print("\nFinal model parameters for evaluation:")
    print(final_ranker_params)

    for fold, (train_idx, val_idx) in enumerate(final_gkf.split(X, y, groups=unique_group_ids_for_splitting)):
        print(f"\n--- Final Model Evaluation: Fold {fold+1}/{N_FOLDS} ---")
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_groups = df_sample.iloc[train_idx].groupby('srch_id').size().to_numpy()
        val_groups = df_sample.iloc[val_idx].groupby('srch_id').size().to_numpy()

        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, Num train groups: {len(train_groups)}")
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}, Num val groups: {len(val_groups)}")

        if len(train_groups) == 0 or len(val_groups) == 0 or X_train.empty or X_val.empty:
            print("Skipping fold due to empty train or validation groups/data.")
            continue
            
        final_ranker = lgb.LGBMRanker(**final_ranker_params)

        print(f"Training final tuned LGBMRanker for fold {fold+1}...")
        final_ranker.fit(
            X_train,
            y_train,
            group=train_groups,
            eval_set=[(X_val, y_val)],
            eval_group=[val_groups],
            eval_metric='ndcg',
            callbacks=[lgb.early_stopping(10, verbose=1)]
        )
        
        if final_ranker.evals_result_ and 'valid_0' in final_ranker.evals_result_ and 'ndcg@5' in final_ranker.evals_result_['valid_0']:
            final_ndcg_at_5 = final_ranker.evals_result_['valid_0']['ndcg@5'][-1]
            final_fold_ndcg_scores.append(final_ndcg_at_5)
            print(f"Fold {fold+1} (Tuned Model) NDCG@5: {final_ndcg_at_5:.4f}")

            fold_importances = pd.DataFrame({
                'feature': X_train.columns,
                'importance': final_ranker.feature_importances_,
                'fold': fold + 1
            })
            final_all_feature_importances = pd.concat([final_all_feature_importances, fold_importances], ignore_index=True)
        else:
            print(f"Could not retrieve NDCG@5 for fold {fold+1} of the tuned model.")


    if final_fold_ndcg_scores:
        print(f"\nMean NDCG@5 for Tuned Model across {len(final_fold_ndcg_scores)} successfully evaluated folds: {np.mean(final_fold_ndcg_scores):.4f} +/- {np.std(final_fold_ndcg_scores):.4f}")
        
        if not final_all_feature_importances.empty:
            final_mean_feature_importances = final_all_feature_importances.groupby('feature')['importance'].mean().sort_values(ascending=False)
            print("\nAverage Feature Importances (Tuned Model):")
            with pd.option_context('display.max_rows', 30):
                display(final_mean_feature_importances.head(20))
    else:
        print("No folds were successfully processed for the final tuned model evaluation.")
        
else:
    print("\nSkipping final model evaluation due to earlier issues or no tuned parameters found.")

# --- End of Notebook ---
# Next steps would involve:
# 1. More sophisticated feature engineering and selection.
# 2. Training the best model on the full (sampled) data or even a larger fraction.
# 3. Preparing the test data similarly.
# 4. Generating predictions for the test set.
# 5. Creating the Kaggle submission file.


--- 7. Detailed Evaluation of Best Tuned Model (using GroupKFold) ---

Final model parameters for evaluation:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'n_estimators': 250, 'learning_rate': 0.19678085320205846, 'num_leaves': 144, 'max_depth': 3, 'min_child_samples': 96, 'subsample': 0.6005514711808235, 'colsample_bytree': 0.5002926412539057, 'reg_alpha': 0.003205944770438764, 'reg_lambda': 5.229994576420763}

--- Final Model Evaluation: Fold 1/5 ---
X_train shape: (396968, 11), y_train shape: (396968,), Num train groups: 15983
X_val shape: (99244, 11), y_val shape: (99244,), Num val groups: 3996
Training final tuned LGBMRanker for fold 1...
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's ndcg@5: 0.351708
Fold 1 (Tuned Model) NDCG@5: 0.3492

--- Final Model Evaluation: Fold 2/5 ---
X_train shape: (396973, 11), y_train s

feature
prop_location_score2           8815.952521
price_usd                      5964.114531
prop_starrating                2783.595308
prop_location_score1           1967.285754
prop_review_score              1584.426287
prop_log_historical_price      1452.657886
promotion_flag                 1415.325307
orig_destination_distance       298.491524
prop_country_id                 152.657004
prop_brand_bool                  92.884254
visitor_location_country_id       0.000000
Name: importance, dtype: float64

In [20]:
# --- 8. Prepare Test Data and Generate Kaggle Submission ---

if 'df_sample' in locals() and df_sample is not None and \
   'X' in locals() and X is not None and \
   'best_params_from_tuning' in locals() and best_params_from_tuning:

    print("\n--- 8. Test Data Preparation and Kaggle Submission ---")
    DATA_DIR = '../data.nosync'

    # --- 8a. Load Test Data ---
    TEST_FILE = os.path.join(DATA_DIR, 'test.csv')
    print("Loading test data...")
    try:

        df_test = pd.read_csv(TEST_FILE)
        print(f"Loaded test dataset with shape: {df_test.shape}")
    except FileNotFoundError:
        print(f"Error: Test file not found at {TEST_FILE}")
        df_test = None
    except Exception as e:
        print(f"An error occurred during test data loading: {e}")
        df_test = None

    if df_test is not None:
        # --- 8b. Preprocess Test Data ---
        print("\nPreprocessing test data...")
        # Use the same feature_columns as defined for training
        if 'feature_columns' not in locals() or not feature_columns:
            print("Error: feature_columns not defined. Cannot preprocess test data.")
            df_test_processed = None
        else:
            print(f"Using feature columns: {feature_columns}")
            X_test = df_test[feature_columns].copy() # Ensure it's a copy

            # Impute missing values using medians from the TRAINING sample (X)
            # This is crucial to prevent data leakage.
            print("Imputing missing values in test data using training set medians...")
            for col in X_test.columns:
                if X_test[col].isnull().any():
                    if pd.api.types.is_numeric_dtype(X_test[col]):
                        # Get median from the original X (training sample before it was split into folds)
                        train_median = X[col].median() # X should be the full sample used for training/tuning
                        X_test[col] = X_test[col].fillna(train_median)
                        # print(f"Imputed missing values in test column {col} with training median: {train_median}")
                    # else: # For categorical, use mode from training set
                    #     train_mode = X[col].mode()[0]
                    #     X_test[col] = X_test[col].fillna(train_mode)
            
            print("NaNs remaining in X_test after imputation:", X_test.isnull().sum().sum())
            df_test_processed = True


        if df_test_processed:
            # --- 8c. Train Final Model on Full Sampled Data (df_sample) ---
            print("\nTraining final model on the full sampled training data (df_sample)...")
            
            # Parameters for the final model
            final_model_params = {
                'objective': 'lambdarank',
                'metric': 'ndcg',
                'label_gain': [0, 1, 5], # If using remapped (0,1,2) relevance, gain still [0,1,5]
                'eval_at': [5],
                'importance_type': 'gain',
                'random_state': RANDOM_STATE,
                'n_jobs': -1,
            }
            final_model_params.update(best_params_from_tuning)
            if 'n_estimators' not in final_model_params or final_model_params['n_estimators'] < 50:
                final_model_params['n_estimators'] = 300 # A reasonable default for early stopping
            
            print("Final model parameters for prediction model:")
            print(final_model_params)

            # Data for final model training
            X_full_sample = X # This is df_sample[feature_columns] with imputations
            y_full_sample = y # This is df_sample['relevance'] (remapped to 0,1,2 if you implemented that)
            groups_full_sample = df_sample.groupby('srch_id').size().to_numpy()

            final_model = lgb.LGBMRanker(**final_model_params)
            
            # For the final model, we can use a small portion of df_sample as an eval set for early stopping
            # This is better than no early stopping.
            temp_df_for_final_split = pd.DataFrame({
                'srch_id': df_sample['srch_id'],
                'index_orig': df_sample.index
            }).drop_duplicates(subset=['srch_id'])

            final_train_srch_ids, final_val_srch_ids = np.split(
                temp_df_for_final_split['srch_id'].sample(frac=1, random_state=RANDOM_STATE),
                [int(0.9 * len(temp_df_for_final_split))] # 90/10 split for final model's early stopping
            )
            
            final_train_indices = df_sample[df_sample['srch_id'].isin(final_train_srch_ids)].index
            final_val_indices = df_sample[df_sample['srch_id'].isin(final_val_srch_ids)].index

            X_final_train, X_final_val = X_full_sample.loc[final_train_indices], X_full_sample.loc[final_val_indices]
            y_final_train, y_final_val = y_full_sample.loc[final_train_indices], y_full_sample.loc[final_val_indices]
            
            groups_final_train = df_sample.loc[final_train_indices].groupby('srch_id').size().to_numpy()
            groups_final_val = df_sample.loc[final_val_indices].groupby('srch_id').size().to_numpy()

            if not X_final_val.empty and len(groups_final_val) > 0:
                 print(f"Fitting final model on {len(X_final_train)} samples, validating on {len(X_final_val)} samples.")
                 final_model.fit(
                    X_final_train, y_final_train, group=groups_final_train,
                    eval_set=[(X_final_val, y_final_val)],
                    eval_group=[groups_final_val],
                    eval_metric='ndcg',
                    callbacks=[lgb.early_stopping(10, verbose=1)]
                )
            else: # Fallback if validation set is too small or problematic
                print("Validation set for final model is empty/problematic, fitting on all sampled data without early stopping.")
                final_model_params.pop('eval_set', None) # Remove eval params if not using
                final_model_params.pop('eval_group', None)
                final_model_params.pop('eval_metric', None)
                final_model_params.pop('callbacks', None) # No early stopping
                # Ensure n_estimators is set to a fixed number if no early stopping
                final_model_params['n_estimators'] = best_params_from_tuning.get('n_estimators', 300) # Use tuned or default
                final_model = lgb.LGBMRanker(**final_model_params)
                final_model.fit(X_full_sample, y_full_sample, group=groups_full_sample)

            print("Final model training completed.")

            # --- 8d. Make Predictions on Test Data ---
            print("\nMaking predictions on the test set...")
            test_predictions = final_model.predict(X_test)
            df_test['predicted_score'] = test_predictions

            # --- 8e. Format Predictions for Submission ---
            print("\nFormatting predictions for submission...")
            submission_list = []
            # Group by srch_id and sort by predicted_score
            for srch_id, group_df in df_test.groupby('srch_id'):
                # Sort properties within each search by the predicted score in descending order
                ranked_properties = group_df.sort_values('predicted_score', ascending=False)
                for _, row in ranked_properties.iterrows():
                    submission_list.append({'srch_id': int(row['srch_id']), 'prop_id': int(row['prop_id'])})
            
            df_submission = pd.DataFrame(submission_list)

            # --- 8f. Create Submission File ---
            SUBMISSION_FILE = 'submission.csv'
            df_submission.to_csv(SUBMISSION_FILE, index=False)
            print(f"\nSubmission file '{SUBMISSION_FILE}' created successfully.")
            print(df_submission.head())

        else:
            print("Skipping prediction and submission due to test data processing issues.")
    else:
        print("Skipping submission generation due to test data loading issues.")
else:
    print("\nSkipping Kaggle submission part: Prerequisite data (df_sample, X, best_params_from_tuning) not available.")



--- 8. Test Data Preparation and Kaggle Submission ---
Loading test data...
Loaded test dataset with shape: (4959183, 50)

Preprocessing test data...
Using feature columns: ['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'orig_destination_distance']
Imputing missing values in test data using training set medians...
NaNs remaining in X_test after imputation: 0

Training final model on the full sampled training data (df_sample)...
Final model parameters for prediction model:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'n_estimators': 250, 'learning_rate': 0.19678085320205846, 'num_leaves': 144, 'max_depth': 3, 'min_child_samples': 96, 'subsample': 0.6005514711808235, 'colsample_bytree': 0.5002926412539057, 'reg_alpha'

  return bound(*args, **kwds)


Fitting final model on 446495 samples, validating on 49717 samples.
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[63]	valid_0's ndcg@5: 0.345282
Final model training completed.

Making predictions on the test set...

Formatting predictions for submission...

Submission file 'submission.csv' created successfully.
   srch_id  prop_id
0        1    99484
1        1    54937
2        1    61934
3        1    28181
4        1    24194


In [None]:
'''
# --- 6. Hyperparameter Tuning with RandomizedSearchCV ---
if 'X' in locals() and 'y' in locals() and df_sample is not None and X is not None:
    print("\n--- 6. Hyperparameter Tuning with RandomizedSearchCV ---")
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform

    param_dist = {
        'n_estimators': sp_randint(100, 500),
        'learning_rate': sp_uniform(0.01, 0.19), # Upper bound <0.2 for uniform
        'num_leaves': sp_randint(20, 100),      
        'max_depth': sp_randint(3, 12),        
        'min_child_samples': sp_randint(5, 50), 
        'subsample': sp_uniform(0.6, 0.4),      # Sum of loc + scale should be <= 1.0. Here, 0.6 + 0.4 = 1.0
        'colsample_bytree': sp_uniform(0.6, 0.4), 
        'reg_alpha': sp_uniform(0, 1),          
        'reg_lambda': sp_uniform(0, 1),         
    }

    base_ranker = lgb.LGBMRanker(
        objective='lambdarank',
        metric='ndcg', # LightGBM will use this for its internal evaluation
        label_gain=[0, 1, 5],
        eval_at=[5],
        importance_type='gain',
        random_state=RANDOM_STATE,
        n_jobs=-1 # Be cautious with -1 for n_jobs in RandomizedSearchCV if memory is an issue
    )

    gkf_for_tuning = GroupKFold(n_splits=3) # Using 3 splits for tuning to speed it up

    # RandomizedSearchCV setup
    random_search = RandomizedSearchCV(
        estimator=base_ranker,
        param_distributions=param_dist,
        n_iter=10,  # Number of parameter settings sampled. Increase for more thorough search.
                    # Set to a small number like 5-10 for quick test, 25-50 for better search.
        cv=list(gkf_for_tuning.split(X, y, groups=df_sample['srch_id'])), # Pass the list of splits
        random_state=RANDOM_STATE,
        n_jobs=1, # Start with 1 to avoid potential memory issues, then try increasing.
        verbose=2,
        # scoring: If None, estimator's score method is used. LGBMRanker's score method should work.
        # It calculates NDCG@eval_at based on its parameters.
        refit=True # Refits the best estimator on the whole dataset (X,y) passed to fit.
                   # For ranking, this full dataset refit will also need group info.
    )

    print("Starting RandomizedSearchCV for hyperparameter tuning...")
    # Pass `groups` to `fit`. This will be used by GroupKFold inside RandomizedSearchCV.
    # And `LGBMRanker.fit` will also receive this `groups` argument for each fold.
    
    # For early stopping inside RandomizedSearchCV, you'd typically pass fit_params.
    # This is more complex because eval_set/eval_group change per fold.
    # For now, n_estimators is part of the search space.
    
    best_params_from_tuning = {}
    try:
        # RandomizedSearchCV will use the `groups` for splitting via the `cv` object
        # And `LGBMRanker.fit` will receive the `group` parameter for each fold.
        random_search.fit(X, y, groups=df_sample['srch_id']) 
        
        print("\nBest parameters found by RandomizedSearchCV:")
        print(random_search.best_params_)
        # The best_score_ will be based on the internal scoring of LGBMRanker (NDCG@5 here)
        print(f"Best score from RandomizedSearchCV (NDCG@5): {random_search.best_score_:.4f}")
        best_params_from_tuning = random_search.best_params_

    except Exception as e:
        print(f"Error during RandomizedSearchCV: {e}")
        print("Falling back to default parameters for the final model evaluation.")
        # Re-initialize with default in case of error
        best_params_from_tuning = { 
            'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31, 
            'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
            'colsample_bytree':1.0, 'reg_alpha':0.0, 'reg_lambda':0.0
        }


else:
    print("\nSkipping hyperparameter tuning due to earlier data processing issues.")
    best_params_from_tuning = {
        'n_estimators': 100, 'learning_rate': 0.1, 'num_leaves': 31, 
        'max_depth': -1, 'min_child_samples': 20, 'subsample': 1.0,
        'colsample_bytree':1.0, 'reg_alpha':0.0, 'reg_lambda':0.0
    }
'''