# Train on the entire dataset (without sampling)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import time

In [2]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all" 

In [3]:
# Load cleaned sample
df = pd.read_csv('yellowcab30w.csv')
feature_cols = [
    'is_peak_hour', 'is_night_hour', 'PU_is_midtown', 'PU_is_uptown',
    'PU_is_downtown', 'DO_is_midtown', 'DO_is_uptown', 'DO_is_downtown',
    'vendor_2', 'is_weekend', 'trip_distance',
    'fare_amount', 'extra', 'tolls_amount',
    'congestion_surcharge', 'airport_fee', 
]

target_col = 'tip_amount'

df_new = df[df["tip_amount"]>0]
q = df_new[target_col].quantile(0.98)
df_new = df_new[df_new[target_col] <= q]

# Assuming df_new has already been created: (filtered with tip_amount > 0 and <= 99th percentile)
X = df_new[feature_cols]
y = df_new[target_col]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Hyperparater space
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 800]

In [6]:
import time
import random
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import itertools

# Define hyperparameter search space
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 500]

# Define objectives to try
objective_list = ['reg:squarederror', 'reg:squaredlogerror']

# Settings
n_random_trials = 20  # Number of random trials per objective
early_stopping_rounds = 20  # Early stopping patience

# Precompute all hyperparameter combinations
param_grid = list(itertools.product(max_depth_list, learning_rate_list, n_estimators_list))

# Dictionary to store best results for each objective
random_search_results = {}

# ====== Start total timing ======
start_time_total = time.time()

for objective_function in objective_list:
    print(f"\n===== Searching for Objective: {objective_function} =====")

    # Randomly sample n_random_trials unique hyperparameter combinations
    sampled_combinations = random.sample(param_grid, n_random_trials)

    best_r2 = -np.inf
    best_rmse = np.inf
    best_params = None

    for trial_idx, (max_depth, learning_rate, n_estimators) in enumerate(sampled_combinations):
        start_time_trial = time.time()

        model = xgb.XGBRegressor(
            objective=objective_function,
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            n_jobs=-1,
            tree_method='hist',
            subsample=0.8,
            colsample_bytree=0.8,
            early_stopping_rounds=early_stopping_rounds
        )

        # Train the model
        _ = model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Predict
        y_pred = model.predict(X_test)

        # Evaluate
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)

        end_time_trial = time.time()
        loop_time = end_time_trial - start_time_trial

        # Output trial result
        print(f"Objective={objective_function} | Trial {trial_idx+1}/{n_random_trials} | max_depth={max_depth}, lr={learning_rate}, n_estimators={n_estimators} | RMSE={rmse:.4f}, R²={r2:.4f} | Loop time={loop_time:.2f} seconds")

        # Update best result if improved
        if r2 > best_r2:
            best_r2 = r2
            best_rmse = rmse
            best_params = (max_depth, learning_rate, n_estimators)

    # Save best result for this objective
    random_search_results[objective_function] = (best_r2, best_rmse, best_params)

    print(f"\n>>> Best Parameters for {objective_function}: max_depth={best_params[0]}, learning_rate={best_params[1]}, n_estimators={best_params[2]}")
    print(f">>> Best Avg R² for {objective_function}: {best_r2:.4f}")

# ====== End total timing ======
end_time_total = time.time()
total_time = end_time_total - start_time_total

print(f"\n===== Total Random Search Time: {total_time:.2f} seconds =====")


===== Searching for Objective: reg:squarederror =====
Objective=reg:squarederror | Trial 1/20 | max_depth=8, lr=0.2, n_estimators=500 | RMSE=1.5224, R²=0.7208 | Loop time=0.70 seconds
Objective=reg:squarederror | Trial 2/20 | max_depth=5, lr=0.05, n_estimators=100 | RMSE=1.5192, R²=0.7220 | Loop time=1.44 seconds
Objective=reg:squarederror | Trial 3/20 | max_depth=7, lr=0.05, n_estimators=200 | RMSE=1.5165, R²=0.7230 | Loop time=2.04 seconds
Objective=reg:squarederror | Trial 4/20 | max_depth=8, lr=0.05, n_estimators=100 | RMSE=1.5182, R²=0.7223 | Loop time=1.93 seconds
Objective=reg:squarederror | Trial 5/20 | max_depth=7, lr=0.2, n_estimators=400 | RMSE=1.5174, R²=0.7226 | Loop time=0.74 seconds
Objective=reg:squarederror | Trial 6/20 | max_depth=7, lr=0.1, n_estimators=500 | RMSE=1.5163, R²=0.7230 | Loop time=1.20 seconds
Objective=reg:squarederror | Trial 7/20 | max_depth=5, lr=0.2, n_estimators=500 | RMSE=1.5195, R²=0.7219 | Loop time=0.82 seconds
Objective=reg:squarederror | Tri