In [25]:
"""
Main Experiment Notebook for Expedia Hotel Booking Prediction

Assignment 2: Data Mining Techniques, Vrije Universiteit Amsterdam
"""

import pandas as pd
import numpy as np
import lightgbm as lgb # Keep for type hints if needed, but direct use will be less
from sklearn.model_selection import GroupKFold
import os

# Import the modularized model functions
import lightgbm_ranker_model as lgbm_model
import warnings # For managing warnings from Optuna/LightGBM if needed

# --- Configuration ---

# Every input/output path is derived from DATA_DIR so that relocating the data
# directory only requires changing this one constant.
DATA_DIR = '../data/'
TRAIN_FILE = os.path.join(DATA_DIR, 'training_set_VU_DM_feature_engin.csv')
TEST_FILE = os.path.join(DATA_DIR, 'test_set_VU_DM_feature_engin.csv')
SUBMISSION_FILENAME = os.path.join(DATA_DIR, 'submission_modular.csv')  # Output path of the submission CSV

SAMPLE_FRACTION = 0.1 # Use 10% of the data for faster runs during development
N_FOLDS_CV = 5         # Number of folds for general cross-validation
N_FOLDS_TUNING = 3     # Number of folds for Optuna trials (can be smaller for speed)
N_OPTUNA_TRIALS = 20   # Number of Optuna trials
RANDOM_STATE = 42      # Seed shared by the notebook's stochastic steps

# --- 1. Load Data ---
print("Loading training data...")
# Start from "not loaded": the frame is only bound when the read succeeds, so any
# failure below leaves df_train_full as None for the downstream `is not None` guards.
df_train_full = None
try:
    df_train_full = pd.read_csv(TRAIN_FILE)
except FileNotFoundError:
    print(f"ERROR: Training file not found at {TRAIN_FILE}")
except Exception as load_error:
    # Best-effort: report the problem and let the later cells skip themselves.
    print(f"Error loading training data: {load_error}")

# --- 2. Create Relevance Score ---
if df_train_full is None:
    print("Skipping relevance score creation as df_train_full is None.")
else:
    print("\nCreating relevance score...")
    # Graded label: 0 = no interaction, 1 = clicked, 2 = booked.
    # Grades are applied in ascending order so a booking overrides a click on the same row.
    df_train_full['relevance'] = 0
    for flag_column, grade in (('click_bool', 1), ('booking_bool', 2)):
        df_train_full.loc[df_train_full[flag_column] == 1, 'relevance'] = grade
    print("Relevance score distribution:")
    print(df_train_full['relevance'].value_counts())

# --- 3. Data Sampling (Group-aware) ---
# Sample whole searches (srch_id groups) rather than individual rows so that every
# sampled query keeps all of its candidate hotels.
df_sample = None
if df_train_full is not None:
    print(f"\nSampling {SAMPLE_FRACTION*100}% of the data based on srch_id...")
    unique_srch_ids = df_train_full['srch_id'].unique()
    if len(unique_srch_ids) > 0:
        sampled_srch_ids_count = int(len(unique_srch_ids) * SAMPLE_FRACTION)
        if sampled_srch_ids_count > 0:
            # Seeded generator: without it the sample (and every CV/tuning score
            # computed from it) would change on each Restart & Run All.
            rng = np.random.default_rng(RANDOM_STATE)
            sampled_srch_ids = rng.choice(unique_srch_ids, size=sampled_srch_ids_count, replace=False)
            df_sample = df_train_full[df_train_full['srch_id'].isin(sampled_srch_ids)].copy()
            print(f"Sampled data shape: {df_sample.shape}")
        else:
            print("Sample fraction resulted in zero search IDs. Check SAMPLE_FRACTION or dataset size.")
            df_sample = df_train_full.copy() # Fallback to full if sample is too small
            print(f"Using full dataset instead. Shape: {df_sample.shape}")
    else:
        # df_sample stays None, so the later cells skip themselves.
        print("No unique search IDs found in the training data.")
    del df_train_full # Optional: free up memory
else:
    print("Skipping sampling as df_train_full is None.")

Loading training data...

Creating relevance score...
Relevance score distribution:
relevance
0    4736468
2     138390
1      83489
Name: count, dtype: int64

Sampling 10.0% of the data based on srch_id...
Sampled data shape: (496233, 150)


In [26]:
# Inspect the group-sampled training frame produced by the sampling step above.
df_sample

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,pca_component_11,pca_component_12,pca_component_13,pca_component_14,pca_component_15,pca_component_16,pca_component_17,pca_component_18,pca_component_19,pca_component_20
0,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,893,3,3.5,...,0.981932,-0.516538,-1.190681,-1.023253,0.182242,1.970431,7.004859,5.290167,3.927173,-2.013700
1,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,10404,4,4.0,...,0.988707,-0.425684,-1.321125,-1.039236,-0.338496,1.947880,7.236945,5.279294,3.560058,-1.476805
2,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,21315,3,4.5,...,0.850245,-0.955603,-0.539876,-0.977969,0.416192,2.070974,6.729867,5.022959,4.760550,-2.141155
3,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,27348,2,4.0,...,0.310608,-2.978943,2.130284,-0.935462,3.075464,4.726963,4.654473,6.934904,5.937608,0.821386
4,1,2013-04-04 08:32:15,12,187,0.0,0.0,219,29604,4,3.5,...,0.992220,-0.409590,-1.355921,-1.051223,-0.059442,2.176942,7.228963,5.356038,3.300087,-1.728475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958284,332781,2013-03-03 15:12:59,5,219,0.0,0.0,219,101770,2,2.5,...,-1.045460,-0.107420,0.072370,2.339506,0.356839,1.133789,0.236386,-0.097847,-0.967310,-0.802455
4958285,332781,2013-03-03 15:12:59,5,219,0.0,0.0,219,115831,2,3.0,...,-0.951428,-0.502378,0.505545,2.459365,0.630931,-0.582768,-0.137827,0.068793,0.329227,-0.510961
4958286,332781,2013-03-03 15:12:59,5,219,0.0,0.0,219,120379,2,4.5,...,-0.381578,0.634069,-0.552459,2.511653,-0.651917,-1.308794,0.194617,-0.747472,1.426646,-0.125837
4958287,332781,2013-03-03 15:12:59,5,219,0.0,0.0,219,132031,3,3.0,...,-0.546721,1.071254,-1.382976,2.340486,0.169470,-0.973796,0.200937,0.455543,-0.453089,0.159024


In [27]:
# --- 4. Initial Feature Selection & Preparation ---
# Builds the model inputs from df_sample:
#   X                    - feature matrix restricted to `feature_columns`
#   y                    - the graded 'relevance' label
#   groups_for_splitting - the 'srch_id' column, used for group-aware CV splits
X = None
y = None
groups_for_splitting = None # This will be df_sample['srch_id'] for GroupKFold.split
feature_columns = [] # Initialize

if df_sample is not None:
    print("\nDefining initial feature set and preparing X, y, groups...")
    feature_columns = [
        # Base features
        'visitor_location_country_id', 'prop_country_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price',
        'price_usd', 'promotion_flag', 'orig_destination_distance',

        # Property features
        'hotel_quality', 'star_review_gap', 'combined_location_score',
        'location_quality', 'distance_category', 'prop_popularity',
        'prop_country_rank_pct',

        # Price features
        'price_per_night', 'value_for_money_normalized', 'price_normalized',
        'price_rank_pct', 'price_tier', 'has_promotion', 'price_discount',

        # Competitive features
        'competitors_count', 'better_price_count', 'price_comp_ratio', 'comp_advantage',

        # User features
        'is_domestic_search', 'total_travelers', 'is_family',
        'is_short_stay', 'is_long_stay', 'is_last_minute', 'is_early_booking',

        # Time features
        'is_weekend', 'is_holiday_season',

        # Interaction features
        'user_prop_country_match', 'star_rating_for_price', 'review_for_price',
        'location_for_price', 'quality_price_ratio'
    ]

    # Keep only the requested features that are actually present in df_sample
    existing_feature_columns = [col for col in feature_columns if col in df_sample.columns]
    if len(existing_feature_columns) != len(feature_columns):
        print(f"Warning: Some feature columns not found. Using: {existing_feature_columns}")
    feature_columns = existing_feature_columns

    if not feature_columns:
        print("Error: No feature columns selected or available. Stopping.")
    else:
        X = df_sample[feature_columns].copy()
        y = df_sample['relevance'].copy()
        groups_for_splitting = df_sample['srch_id'] # Used by GroupKFold for splitting

        # Basic imputation on the *sampled* data (X); the test-set cell later reuses
        # medians computed from this X. Non-numeric columns are left untouched.
        print("Performing basic median imputation for numerical features in X...")
        for col in X.columns:
            if X[col].isnull().any() and pd.api.types.is_numeric_dtype(X[col]):
                # Assign the filled column back instead of `X[col].fillna(..., inplace=True)`:
                # the inplace form is chained assignment, which warns on pandas 2.x and
                # does not modify X at all once Copy-on-Write is enabled.
                X[col] = X[col].fillna(X[col].median())

        print(f"Selected {len(feature_columns)} features: {feature_columns}")
        print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
        print(f"NaNs remaining in X after imputation: {X.isnull().sum().sum()}")
        if groups_for_splitting is not None:
            print(f"Number of unique groups for splitting: {groups_for_splitting.nunique()}")
else:
    print("Skipping feature selection as df_sample is None.")

# Display X's head to verify
if X is not None:
    display(X.head())


Defining initial feature set and preparing X, y, groups...
Performing basic median imputation for numerical features in X...
Selected 43 features: ['visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'orig_destination_distance', 'hotel_quality', 'star_review_gap', 'combined_location_score', 'location_quality', 'distance_category', 'prop_popularity', 'prop_country_rank_pct', 'price_per_night', 'value_for_money_normalized', 'price_normalized', 'price_rank_pct', 'price_tier', 'has_promotion', 'price_discount', 'competitors_count', 'better_price_count', 'price_comp_ratio', 'comp_advantage', 'is_domestic_search', 'total_travelers', 'is_family', 'is_short_stay', 'is_long_stay', 'is_last_minute', 'is_early_booking', 'is_weekend', 'is_holiday_season', 'user_prop_country_match', 'star_rating_for_price', 'review_for_price', 'locatio

Unnamed: 0,visitor_location_country_id,prop_country_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,price_usd,promotion_flag,...,is_long_stay,is_last_minute,is_early_booking,is_weekend,is_holiday_season,user_prop_country_match,star_rating_for_price,review_for_price,location_for_price,quality_price_ratio
0,187,219,3,3.5,1,2.83,0.0438,4.95,104.77,0,...,0,1,0,0,0,0,0.643602,0.750869,0.063706,17.335567
1,187,219,4,4.0,1,2.2,0.0149,5.03,170.74,0,...,0,1,0,0,0,0,0.777306,0.777306,0.043743,4.848542
2,187,219,3,4.5,1,2.2,0.0245,4.92,179.8,0,...,0,1,0,0,0,0,0.577213,0.865819,0.043864,4.109393
3,187,219,2,4.0,1,2.83,0.0125,4.39,602.77,0,...,0,1,0,0,0,0,0.312344,0.624688,0.044909,0.6
4,187,219,4,3.5,1,2.64,0.1241,4.93,143.58,0,...,0,1,0,0,0,0,0.804209,0.703683,0.060715,6.66638


In [28]:
# --- 5. Cross-Validation (using modular function) ---
import warnings
warnings.filterwarnings('ignore', message="Found 'eval_at' in params.*")

# Placeholder results that remain in place when the CV step is skipped.
mean_cv_ndcg, std_cv_ndcg = 0, 0
cv_feature_importances = pd.Series(dtype=float)

cv_inputs = (X, y, groups_for_splitting, df_sample)
if any(cv_input is None for cv_input in cv_inputs):
    print("\nSkipping Cross-Validation due to missing X, y, groups_for_splitting, or df_sample.")
else:
    # Only a few baseline LightGBM settings are given here; the remaining ones
    # (objective, metric, ...) are presumably supplied by the module's defaults.
    initial_lgbm_params = dict(
        n_estimators=100,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
    )

    print(f"\n--- Performing {N_FOLDS_CV}-Fold Cross-Validation using modular function ---")
    cv_result = lgbm_model.perform_cross_validation(
        X, y,
        groups_for_splitting=groups_for_splitting,  # srch_id per row, drives the group-aware split
        df_full_for_group_counts=df_sample,         # frame carrying 'srch_id' for group counts
        n_folds=N_FOLDS_CV,
        lgbm_params=initial_lgbm_params
    )
    mean_cv_ndcg, std_cv_ndcg, cv_feature_importances = cv_result

    print(f"\nCross-Validation Mean NDCG@5: {mean_cv_ndcg:.4f} +/- {std_cv_ndcg:.4f}")
    if not cv_feature_importances.empty:
        print("\nAverage Feature Importances from CV:")
        # Show the first 20 entries; max_rows=30 keeps pandas from truncating them.
        with pd.option_context('display.max_rows', 30):
            display(cv_feature_importances.head(20))


--- Performing 5-Fold Cross-Validation using modular function ---
\n--- Performing 5-Fold Cross-Validation ---
--- Fold 1/5 ---
Fold 1 NDCG@5: 0.3592
--- Fold 2/5 ---
Fold 2 NDCG@5: 0.3499
--- Fold 3/5 ---
Fold 3 NDCG@5: 0.3449
--- Fold 4/5 ---
Fold 4 NDCG@5: 0.3526
--- Fold 5/5 ---
Fold 5 NDCG@5: 0.3569
Mean NDCG@5 across 5 folds: 0.3527 +/- 0.0051

Cross-Validation Mean NDCG@5: 0.3527 +/- 0.0051

Average Feature Importances from CV:


feature
prop_location_score2         17166.577920
price_discount                6278.243007
price_rank_pct                5211.863006
prop_location_score1          5116.179572
prop_country_rank_pct         4820.261888
star_rating_for_price         4236.158121
quality_price_ratio           3092.458729
promotion_flag                2815.662189
prop_review_score             2704.097573
price_usd                     2187.234822
prop_popularity               1845.780967
combined_location_score       1449.318458
price_normalized              1437.484595
review_for_price              1391.433720
prop_log_historical_price     1164.376938
location_for_price            1046.999037
price_per_night                956.641599
orig_destination_distance      922.797661
comp_advantage                 475.334661
prop_starrating                444.431342
Name: importance, dtype: float64

In [29]:
# --- 6. Hyperparameter Tuning with Optuna (using modular function) ---
# Produces `best_params_from_tuning`: the parameters returned by the Optuna search,
# or the fallback defaults below when tuning is skipped or returns nothing.

# Single definition of the fallback so both "skipped" and "empty result" paths agree.
fallback_lgbm_params = {
    'n_estimators': 200, 'learning_rate': 0.05, 'num_leaves': 31,
    'max_depth': 7, 'min_child_samples': 20, 'subsample': 0.8,
    'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1
    # Add other necessary LGBM parameters if not covered by the module's defaults
}

best_params_from_tuning = {} # Initialize

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None:
    print(f"\n--- Tuning Hyperparameters with Optuna ({N_OPTUNA_TRIALS} trials, {N_FOLDS_TUNING} CV folds each) using modular function ---")

    # Scope the LightGBM warning filters to the tuning call only. catch_warnings()
    # restores the previous filter state on exit, whereas warnings.resetwarnings()
    # would discard *every* filter in the process, including ones set up elsewhere.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='Found \'eval_at\' in params.*') # Suppress LightGBM's specific warning
        warnings.filterwarnings('ignore', message='Overriding the init_model argument.*') # Another potential LightGBM warning

        best_params_from_tuning = lgbm_model.tune_hyperparameters_optuna(
            X,
            y,
            groups_for_splitting=groups_for_splitting, # This is df_sample['srch_id']
            df_full_for_group_counts=df_sample, # df_sample for calculating group sizes within folds
            n_trials=N_OPTUNA_TRIALS,
            n_cv_folds=N_FOLDS_TUNING
        )

    if best_params_from_tuning:
        print("\nBest parameters found by Optuna:")
        for key, value in best_params_from_tuning.items():
            print(f"    {key}: {value}")
    else:
        print("\nOptuna tuning did not return parameters. Using default parameters for the final model evaluation.")
        # Copy so later mutation of the result cannot alter the fallback definition.
        best_params_from_tuning = dict(fallback_lgbm_params)
else:
    print("\nSkipping Hyperparameter Tuning due to missing X, y, groups_for_splitting, or df_sample.")
    best_params_from_tuning = dict(fallback_lgbm_params)

[I 2025-05-18 23:06:21,824] A new study created in memory with name: lgbm_ranker_tuning



--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) using modular function ---
\n--- Tuning Hyperparameters with Optuna (20 trials, 3 CV folds each) ---


[I 2025-05-18 23:06:28,625] Trial 0 finished with value: 0.3485648708465034 and parameters: {'n_estimators': 600, 'learning_rate': 0.0383392091560084, 'num_leaves': 128, 'max_depth': 3, 'min_child_samples': 74, 'subsample': 0.7453794190423586, 'colsample_bytree': 0.8731285867855868, 'reg_alpha': 0.09468407076718369, 'reg_lambda': 1.1011701308126212}. Best is trial 0 with value: 0.3485648708465034.
[I 2025-05-18 23:06:35,843] Trial 1 finished with value: 0.3473504476712286 and parameters: {'n_estimators': 100, 'learning_rate': 0.022596638359231994, 'num_leaves': 92, 'max_depth': 3, 'min_child_samples': 5, 'subsample': 0.9691376826312474, 'colsample_bytree': 0.8593038865950324, 'reg_alpha': 1.5904990786068438, 'reg_lambda': 3.075669681051407}. Best is trial 0 with value: 0.3485648708465034.
[I 2025-05-18 23:06:43,393] Trial 2 finished with value: 0.3542603381842788 and parameters: {'n_estimators': 600, 'learning_rate': 0.09678351755889147, 'num_leaves': 61, 'max_depth': 4, 'min_child_sam

Optuna study finished. Best trial NDCG@5: 0.3543
Best parameters: {'n_estimators': 600, 'learning_rate': 0.09678351755889147, 'num_leaves': 61, 'max_depth': 4, 'min_child_samples': 82, 'subsample': 0.9300089804461755, 'colsample_bytree': 0.9366796867622699, 'reg_alpha': 0.008756302386909708, 'reg_lambda': 0.0034301129197774425}

Best parameters found by Optuna:
    n_estimators: 600
    learning_rate: 0.09678351755889147
    num_leaves: 61
    max_depth: 4
    min_child_samples: 82
    subsample: 0.9300089804461755
    colsample_bytree: 0.9366796867622699
    reg_alpha: 0.008756302386909708
    reg_lambda: 0.0034301129197774425


In [30]:
# --- 7. Train Final Model on Full Sampled Data (using modular function) ---
# Fits one model on all of X/y with the tuned (or fallback) parameters and stores it
# in `final_trained_model`; the variable stays None when training is skipped or fails.
final_trained_model = None

if X is not None and y is not None and groups_for_splitting is not None and df_sample is not None and best_params_from_tuning:
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\n--- Training Final Model with Best Tuned Parameters ---")

    # Per-query group sizes for the whole of X (X is df_sample[feature_columns]).
    # sort=False keeps the sizes in order of first appearance, i.e. aligned with the
    # row order of X as long as each srch_id's rows are contiguous; the default
    # sorted order would only line up if df_sample happened to be sorted by srch_id.
    groups_train_full = df_sample.groupby('srch_id', sort=False).size().to_numpy()

    if len(groups_train_full) > 0:
        final_trained_model = lgbm_model.train_final_model(
            X_train_full=X,  # This is the full X from df_sample
            y_train_full=y,  # This is the full y from df_sample
            groups_train_full=groups_train_full,
            df_full_for_group_counts=df_sample, # df_sample contains 'srch_id' for early stopping split
            best_params=best_params_from_tuning
        )
        if final_trained_model:
            print("Final model successfully trained.")
        else:
            print("Final model training failed or returned None.")
    else:
        print("Cannot train final model: No groups found in the training data.")
else:
    print("\nSkipping final model training due to missing data, groups, or best_params_from_tuning.")


\n--- Training Final Model with Best Tuned Parameters ---
\n--- Training Final Model ---
Final model parameters for training:
{'objective': 'lambdarank', 'metric': 'ndcg', 'label_gain': [0, 1, 5], 'eval_at': [5], 'importance_type': 'gain', 'random_state': 42, 'n_jobs': -1, 'verbosity': -1, 'n_estimators': 600, 'learning_rate': 0.09678351755889147, 'num_leaves': 61, 'max_depth': 4, 'min_child_samples': 82, 'subsample': 0.9300089804461755, 'colsample_bytree': 0.9366796867622699, 'reg_alpha': 0.008756302386909708, 'reg_lambda': 0.0034301129197774425}


  return bound(*args, **kwds)


Fitting final model with early stopping on 90/10 split of training data.




Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[69]	valid_0's ndcg@5: 0.364245
Final model training completed.
Final model successfully trained.


In [33]:
# --- 8. Prepare Test Data and Generate Kaggle Submission (using modular function) ---
# Loads the test CSV, builds X_test with the same feature columns as training,
# imputes numeric NaNs with training medians, and hands off to the module to
# predict and write the submission file.

if final_trained_model is not None and X is not None and 'feature_columns' in locals() and feature_columns:
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
    print("\n--- Preparing Test Data and Generating Kaggle Submission ---")

    # --- 8a. Load Test Data ---
    print(f"Loading test data from: {TEST_FILE}...")
    try:
        df_test_raw = pd.read_csv(TEST_FILE)
        print(f"Loaded test dataset with shape: {df_test_raw.shape}")
    except FileNotFoundError:
        print(f"ERROR: Test file not found at {TEST_FILE}")
        df_test_raw = None
    except Exception as e:
        print(f"Error loading test data: {e}")
        df_test_raw = None

    if df_test_raw is not None:
        # --- 8b. Preprocess Test Data ---
        print("\nPreprocessing test data...")

        # Any training feature absent from the test file is created as an all-NaN
        # column so the feature matrix keeps the exact training column layout.
        for col in feature_columns:
            if col not in df_test_raw.columns:
                print(f"Warning: Feature column '{col}' not found in test data. Creating it with NaNs.")
                df_test_raw[col] = np.nan

        X_test = df_test_raw[feature_columns].copy()

        # Impute missing values in X_test using medians from the TRAINING features (X),
        # never from the test data itself. X was already median-imputed; inserting a
        # column's own median does not move that median, so X[col].median() still
        # equals the pre-imputation training median.
        print("Imputing missing values in test data using training set medians...")
        for col in X_test.columns:
            needs_fill = X_test[col].isnull().any() and pd.api.types.is_numeric_dtype(X_test[col])
            if not needs_fill:
                # Complete columns and non-numeric columns are left untouched.
                continue
            if col in X.columns:
                # Assign back rather than `fillna(..., inplace=True)` on X_test[col]:
                # the inplace form is chained assignment, which warns on pandas 2.x
                # and leaves X_test unchanged once Copy-on-Write is enabled.
                X_test[col] = X_test[col].fillna(X[col].median())
            else:
                print(f"Warning: Column '{col}' for median imputation not found in training X. Test NaNs may remain.")

        remaining_nans_per_column = X_test.isnull().sum()
        nan_counts_after_imputation = remaining_nans_per_column.sum()
        print(f"NaNs remaining in X_test after imputation: {nan_counts_after_imputation}")
        if nan_counts_after_imputation > 0:
            print("Warning: Some NaNs remain in test features after imputation. Review missing columns or imputation logic.")
            print(remaining_nans_per_column[remaining_nans_per_column > 0])


        # --- 8c. Generate Submission File ---
        # The df_test_raw contains 'srch_id' and 'prop_id' needed by the submission function
        lgbm_model.predict_and_format_submission(
            model=final_trained_model,
            X_test=X_test,
            df_test_original_ids=df_test_raw, # Pass the raw test df for srch_id and prop_id
            submission_filename=SUBMISSION_FILENAME
        )
    else:
        print("Skipping submission generation as test data could not be loaded.")
else:
    print("\nSkipping Kaggle submission: final_trained_model, X, or feature_columns not available.")


\n--- Preparing Test Data and Generating Kaggle Submission ---
Loading test data from: ../data/test_set_VU_DM_feature_engin.csv...
Loaded test dataset with shape: (4959183, 146)
\nPreprocessing test data...
Imputing missing values in test data using training set medians...
NaNs remaining in X_test after imputation: 0
\n--- Predicting on Test Data and Formatting Submission ---




Submission file 'submission_modular.csv' created. Top 5 rows:
   srch_id  prop_id
0        1    99484
1        1    61934
2        1    54937
3        1    24194
4        1    28181
