In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import time

In [3]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all" 

In [4]:
# Load cleaned sample
df = pd.read_csv('yellowcab3w.csv')
feature_cols = [
    'is_peak_hour', 'is_night_hour', 'PU_is_midtown', 'PU_is_uptown',
    'PU_is_downtown', 'DO_is_midtown', 'DO_is_uptown', 'DO_is_downtown',
    'vendor_2', 'is_weekend', 'trip_distance',
    'fare_amount', 'extra', 'tolls_amount',
    'congestion_surcharge', 'airport_fee', 
]

target_col = 'tip_amount'

df_new = df[df["tip_amount"]>0]
q = df_new[target_col].quantile(0.98)
df_new = df_new[df_new[target_col] <= q]

# Assuming df_new has already been created: (filtered with tip_amount > 0 and <= 99th percentile)
X = df_new[feature_cols]
y = df_new[target_col]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Hyperparater space
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 800]

## Selected baseline model (built in) for comparison

In [7]:
# Objectives names (built in)
objective_list = [
    'reg:squarederror', 
    'reg:squaredlogerror', 
]

# Start profiling
start_time_total = time.time()
objective_time_summary = {}

# Outer loop: over different objectives
for objective_function in objective_list:
    print(f"\nSearching with Objective: {objective_function}")

    best_r2 = -np.inf
    best_params = None

    total_time_obj = 0
    total_runs = 0

    for max_depth in max_depth_list:
        for learning_rate in learning_rate_list:
            for n_estimators in n_estimators_list:
                start_time_loop = time.time()

                model = xgb.XGBRegressor(
                    objective=objective_function,
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    learning_rate=learning_rate,
                    random_state=42
                )
                _ = model.fit(X_train, y_train)
                
                y_pred = model.predict(X_test)

                rmse = mean_squared_error(y_test, y_pred, squared=False)
                r2 = r2_score(y_test, y_pred)

                duration = time.time() - start_time_loop
                total_time_obj += duration
                total_runs += 1

                print(f"Objective={objective_function} | max_depth={max_depth}, lr={learning_rate}, n_estimators={n_estimators} | RMSE={rmse:.4f}, R²={r2:.4f} | Loop time={duration:.2f} seconds")

                if r2 > best_r2:
                    best_r2 = r2
                    best_params = (max_depth, learning_rate, n_estimators)

    avg_time = total_time_obj / total_runs
    objective_time_summary[objective_function] = avg_time

    # print(f"\nBest Parameters for {objective_function}: max_depth={best_params[0]}, learning_rate={best_params[1]}, n_estimators={best_params[2]}")
    # print(f"Best R² for {objective_function}: {best_r2:.4f}")
    print(f"Total training time for {objective_function}: {total_time_obj:.2f} seconds")
    # print(f"Average training time for {objective_function}: {avg_time:.2f} seconds")

end_time_total = time.time()
print(f"\nTotal search time for all objectives: {end_time_total - start_time_total:.2f} seconds")



Searching with Objective: reg:squarederror
Objective=reg:squarederror | max_depth=4, lr=0.01, n_estimators=100 | RMSE=1.8245, R²=0.5989 | Loop time=0.17 seconds
Objective=reg:squarederror | max_depth=4, lr=0.01, n_estimators=200 | RMSE=1.6315, R²=0.6793 | Loop time=0.30 seconds
Objective=reg:squarederror | max_depth=4, lr=0.01, n_estimators=300 | RMSE=1.6041, R²=0.6899 | Loop time=0.40 seconds
Objective=reg:squarederror | max_depth=4, lr=0.01, n_estimators=400 | RMSE=1.6034, R²=0.6902 | Loop time=0.73 seconds
Objective=reg:squarederror | max_depth=4, lr=0.01, n_estimators=800 | RMSE=1.6080, R²=0.6884 | Loop time=0.85 seconds
Objective=reg:squarederror | max_depth=4, lr=0.02, n_estimators=100 | RMSE=1.6310, R²=0.6794 | Loop time=0.14 seconds
Objective=reg:squarederror | max_depth=4, lr=0.02, n_estimators=200 | RMSE=1.6028, R²=0.6904 | Loop time=0.26 seconds
Objective=reg:squarederror | max_depth=4, lr=0.02, n_estimators=300 | RMSE=1.6055, R²=0.6894 | Loop time=0.35 seconds
Objective=re

## Implement random search of hyperparameters and conduct early stopping to accelerate loop

In [9]:
import time
import random
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# ====== Hyperparameters search space ======
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 500]

n_random_trials = 20        # Number of random trials per objective

# List of objective functions to try
objective_list = ['reg:squarederror', 
                  'reg:squaredlogerror']

# Dictionary to store best results per objective
random_search_results = {}

# ====== Start total timing ======
start_time_total = time.time()

for objective_function in objective_list:
    print(f"\n===== Searching for Objective: {objective_function} =====")
    
    best_r2 = -np.inf
    best_rmse = np.inf
    best_params = None

    total_time_obj = 0

    for trial in range(n_random_trials):
        start_time_trial = time.time()

        # Randomly sample hyperparameters
        max_depth = random.choice(max_depth_list)
        learning_rate = random.choice(learning_rate_list)
        n_estimators = random.choice(n_estimators_list)

        model = xgb.XGBRegressor(
            objective=objective_function,
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            n_jobs=-1
            # No early_stopping_rounds here
        )

        # Fit the model (no early stopping)
        _ = model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Predict
        y_pred = model.predict(X_test)

        # Evaluate
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)

        end_time_trial = time.time()
        loop_time = end_time_trial - start_time_trial

        total_time_obj += loop_time

        # Output trial result
        # print(f"Objective={objective_function} | Trial {trial+1}/{n_random_trials} | max_depth={max_depth}, lr={learning_rate}, n_estimators={n_estimators} | RMSE={rmse:.4f}, R²={r2:.4f} | Loop time={loop_time:.2f} seconds")

        # Update best result if improved
        if r2 > best_r2:
            best_r2 = r2
            best_rmse = rmse
            best_params = (max_depth, learning_rate, n_estimators)

    # Save the best for this objective
    random_search_results[objective_function] = (best_r2, best_rmse, best_params)
    print(f"Total training time for {objective_function}: {total_time_obj:.2f} seconds")
    # print(f"\n>>> Best Parameters for {objective_function}: max_depth={best_params[0]}, learning_rate={best_params[1]}, n_estimators={best_params[2]}")
    # print(f">>> Best Avg R² for {objective_function}: {best_r2:.4f}")

# ====== End total timing ======
end_time_total = time.time()
total_time = end_time_total - start_time_total

print(f"\n===== Total Random Search Time: {total_time:.2f} seconds =====")



===== Searching for Objective: reg:squarederror =====
Total training time for reg:squarederror: 13.43 seconds

===== Searching for Objective: reg:squaredlogerror =====
Total training time for reg:squaredlogerror: 17.44 seconds

===== Total Random Search Time: 30.87 seconds =====


## Use hist method

In [11]:
import time
import random
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# ====== Hyperparameters search space ======
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 500]

n_random_trials = 20  # Number of random trials per objective

# List of objective functions to try
objective_list = ['reg:squarederror', 'reg:squaredlogerror']

# Dictionary to store best results per objective
random_search_results = {}

# ====== Start total timing ======
start_time_total = time.time()

for objective_function in objective_list:
    print(f"\n===== Searching for Objective: {objective_function} =====")
    
    best_r2 = -np.inf
    best_rmse = np.inf
    best_params = None

    total_time_obj = 0  # <-- 初始化每个objective的计时器

    for trial in range(n_random_trials):
        start_time_trial = time.time()

        # Randomly sample hyperparameters
        max_depth = random.choice(max_depth_list)
        learning_rate = random.choice(learning_rate_list)
        n_estimators = random.choice(n_estimators_list)

        model = xgb.XGBRegressor(
            objective=objective_function,
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            n_jobs=-1,
            tree_method='hist',
            early_stopping_rounds=30
        )

        # Fit the model
        _ = model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Predict
        y_pred = model.predict(X_test)

        # Evaluate
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)

        end_time_trial = time.time()
        loop_time = end_time_trial - start_time_trial

        total_time_obj += loop_time  # <-- 每次加上loop时间

        # Output trial result
        # print(f"Objective={objective_function} | Trial {trial+1}/{n_random_trials} | max_depth={max_depth}, lr={learning_rate}, n_estimators={n_estimators} | RMSE={rmse:.4f}, R²={r2:.4f} | Loop time={loop_time:.2f} seconds")

        # Update best result if improved
        if r2 > best_r2:
            best_r2 = r2
            best_rmse = rmse
            best_params = (max_depth, learning_rate, n_estimators)

    # Save the best for this objective
    random_search_results[objective_function] = (best_r2, best_rmse, best_params)

    # 打印这个 objective 的总耗时
    print(f"\n>>> Total training time for {objective_function}: {total_time_obj:.2f} seconds")
    print(f">>> Best Parameters: max_depth={best_params[0]}, learning_rate={best_params[1]}, n_estimators={best_params[2]}")
    print(f">>> Best Avg R²: {best_r2:.4f}")

# ====== End total timing ======
end_time_total = time.time()
total_time = end_time_total - start_time_total

print(f"\n===== Total Random Search Time for all objectives: {total_time:.2f} seconds =====")


===== Searching for Objective: reg:squarederror =====

>>> Total training time for reg:squarederror: 8.82 seconds
>>> Best Parameters: max_depth=4, learning_rate=0.2, n_estimators=200
>>> Best Avg R²: 0.6919

===== Searching for Objective: reg:squaredlogerror =====

>>> Total training time for reg:squaredlogerror: 11.52 seconds
>>> Best Parameters: max_depth=5, learning_rate=0.02, n_estimators=500
>>> Best Avg R²: 0.6747

===== Total Random Search Time for all objectives: 20.34 seconds =====


## Use itertools

In [15]:
import time
import random
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import itertools

# Define hyperparameter search space
max_depth_list = [4, 5, 6, 7, 8]
learning_rate_list = [0.01, 0.02, 0.05, 0.1, 0.2]
n_estimators_list = [100, 200, 300, 400, 500]

# Define objectives to try
objective_list = ['reg:squarederror', 'reg:squaredlogerror']

# Settings
n_random_trials = 20  # Number of random trials per objective
early_stopping_rounds = 20  # Early stopping patience

# Precompute all hyperparameter combinations
param_grid = list(itertools.product(max_depth_list, learning_rate_list, n_estimators_list))

# Dictionary to store best results for each objective
random_search_results = {}

# ====== Start total timing ======
start_time_total = time.time()

for objective_function in objective_list:
    print(f"\n===== Searching for Objective: {objective_function} =====")

    # Randomly sample n_random_trials unique hyperparameter combinations
    sampled_combinations = random.sample(param_grid, n_random_trials)

    best_r2 = -np.inf
    best_rmse = np.inf
    best_params = None

    total_time_obj = 0  # <<< 加上每个objective单独的时间累计

    for trial_idx, (max_depth, learning_rate, n_estimators) in enumerate(sampled_combinations):
        start_time_trial = time.time()

        model = xgb.XGBRegressor(
            objective=objective_function,
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42,
            n_jobs=-1,
            tree_method='hist',
            subsample=0.8,
            colsample_bytree=0.8,
            early_stopping_rounds=early_stopping_rounds
        )

        # Train the model
        _ = model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

        # Predict
        y_pred = model.predict(X_test)

        # Evaluate
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)

        end_time_trial = time.time()
        loop_time = end_time_trial - start_time_trial

        total_time_obj += loop_time  # <<< 每次加上loop_time到total_time_obj

        # Output trial result
        # print(f"Objective={objective_function} | Trial {trial_idx+1}/{n_random_trials} | max_depth={max_depth}, lr={learning_rate}, n_estimators={n_estimators} | RMSE={rmse:.4f}, R²={r2:.4f} | Loop time={loop_time:.2f} seconds")

        # Update best result if improved
        if r2 > best_r2:
            best_r2 = r2
            best_rmse = rmse
            best_params = (max_depth, learning_rate, n_estimators)

    # Save best result for this objective
    random_search_results[objective_function] = (best_r2, best_rmse, best_params)

    # 输出每个 objective 的总耗时
    print(f"\n>>> Total training time for {objective_function}: {total_time_obj:.2f} seconds")
    print(f">>> Best Parameters: max_depth={best_params[0]}, learning_rate={best_params[1]}, n_estimators={best_params[2]}")
    print(f">>> Best Avg R²: {best_r2:.4f}")

# ====== End total timing ======
end_time_total = time.time()
total_time = end_time_total - start_time_total

print(f"\n===== Total Random Search Time for all objectives: {total_time:.2f} seconds =====")


===== Searching for Objective: reg:squarederror =====

>>> Total training time for reg:squarederror: 7.08 seconds
>>> Best Parameters: max_depth=4, learning_rate=0.05, n_estimators=400
>>> Best Avg R²: 0.6921

===== Searching for Objective: reg:squaredlogerror =====

>>> Total training time for reg:squaredlogerror: 9.85 seconds
>>> Best Parameters: max_depth=4, learning_rate=0.2, n_estimators=100
>>> Best Avg R²: 0.6776

===== Total Random Search Time for all objectives: 16.93 seconds =====


## We realized decreasing running time from 113 seconds to 11.23 seconds. Now we will finally apply our model on unsampledd dataset.