In [17]:
# Import required libraries
import pandas as pd
import numpy as np

import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score

import lightgbm as lgb

# Configure visualizations
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('data/clean_train.csv')

## Convert date to numeric format

In [20]:
df['date_time'] = pd.to_datetime(df['date_time'])
df['month'] = df['date_time'].dt.month
df = df.drop(columns=['date_time'])

## Splitting 

In [21]:
train_data = df.copy()

In [22]:
# Features
X = train_data.drop(columns=['booking_bool'])  # Drop target 

# Target
y = train_data['booking_bool']  # Target variable (booking_bool)

In [23]:
# Split the data (70% train, 30% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

## Setting up model

In [13]:
# Initialize the model
lgbm_model = lgb.LGBMClassifier(objective='binary', metric='binary_error')

## Hyperparameter Tuning

In [None]:
# Hyperparameter grid
param_grid = {
    'num_leaves': [12, 20, 30],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [1000, 2000, 3000]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Update model with best hyperparameters
lgbm_best = grid_search.best_estimator_

In [None]:
lgbm_model = lgbm_best

## Training

In [24]:
# Train the best model on the training data
lgbm_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 96873, number of negative: 3373969
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.151078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3948
[LightGBM] [Info] Number of data points in the train set: 3470842, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027911 -> initscore=-3.550444
[LightGBM] [Info] Start training from score -3.550444


## Evaluation

In [25]:
# Make predictions on the validation set
y_val_pred = lgbm_model.predict(X_val)

# Calculate accuracy and AUC score
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score: {roc_auc}")

Accuracy: 0.9938749785715006
ROC AUC Score: 0.9694774672957566


## Generate Predictions for the Test Set

In [None]:
# Assuming test_data has the same structure as train_data
test_data = pd.read_csv('test.csv')

# Preprocess the test data in the same way as train data
# (e.g., fill missing values, feature engineering)

# Predict probabilities of booking for each property
y_test_pred_prob = lgbm_best.predict_proba(test_data)[:, 1]

In [None]:
# Add predicted probabilities to test data
test_data['booking_probability'] = y_test_pred_prob

# Sort by srch_id and booking_probability
sorted_predictions = test_data[['srch_id', 'prop_id', 'booking_probability']] \
    .sort_values(by=['srch_id', 'booking_probability'], ascending=[True, False])

# Extract only srch_id and prop_id for output
output = sorted_predictions[['srch_id', 'prop_id']]

# Save to CSV
output.to_csv('submission.csv', index=False)