In [5]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# Load your preprocessed training data
X_train = pd.read_csv('X_train.csv')

# Load the original training data to get the target variable 'SalePrice'
dataset = pd.read_csv('train.csv')
y_train = dataset['SalePrice']

# Split data into training and testing sets for evaluation
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
    X_train, y_train, test_size=0.1, random_state=0)

print("Shape of X_train_split:", X_train_split.shape)
print("Shape of X_test_split:", X_test_split.shape)

## --- Step 1: Initialize and Train the XGBoost Regressor ---

# Create an XGBoost Regressor object
regressor = xgboost.XGBRegressor()

# Train the model on the split training data
regressor.fit(X_train_split, y_train_split)

# Make predictions on the test set
y_pred = regressor.predict(X_test_split)

## --- Step 2: Evaluate the Initial Model ---

print("\n--- Initial Model Evaluation ---")
# Calculate R-squared score
r2 = r2_score(y_test_split, y_pred)
print(f"R-squared: {r2:.4f}")

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test_split, y_pred))
print(f"RMSE: {rmse:.4f}")


## --- Step 3: Hyperparameter Tuning with RandomizedSearchCV ---

# Define the hyperparameter grid to search
# These are some of the most important parameters for XGBoost
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

# Set up RandomizedSearchCV
# n_iter controls how many different combinations to try.
# cv is the number of cross-validation folds.
random_search = RandomizedSearchCV(
    regressor,
    param_distributions=params,
    n_iter=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1, # Use all available CPU cores
    cv=5,
    verbose=3
)

# Fit the random search model to find the best hyperparameters
random_search.fit(X_train, y_train)

# Print the best combination of hyperparameters found
print("\n--- Hyperparameter Tuning Results ---")
print("Best Estimator:", random_search.best_estimator_)
print("Best Parameters:", random_search.best_params_)


## --- Step 4: Train and Evaluate the Final Model ---

# The best estimator is already trained on the full data during the search,
# but you can explicitly create and train it if you wish.
final_model = random_search.best_estimator_
final_model.fit(X_train_split, y_train_split) # Re-fit on the split for evaluation

# Make predictions with the tuned model
final_y_pred = final_model.predict(X_test_split)

print("\n--- Final Tuned Model Evaluation ---")
# Evaluate the tuned model
final_r2 = r2_score(y_test_split, final_y_pred)
print(f"Final R-squared: {final_r2:.4f}")

final_rmse = np.sqrt(mean_squared_error(y_test_split, final_y_pred))
print(f"Final RMSE: {final_rmse:.4f}")



Shape of X_train_split: (1314, 84)
Shape of X_test_split: (146, 84)

--- Initial Model Evaluation ---
R-squared: 0.9713
RMSE: 14032.1431
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=5;, score=-12223.858 total time=   0.1s
[CV 2/5] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=5;, score=-15387.705 total time=   0.1s
[CV 3/5] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=5;, score=-14691.950 total time=   0.1s
[CV 5/5] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=5;, score=-17456.111 total time=   0.1s
[CV 4/5] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=5;, score=-10126.572 total time=   0.1s
[CV 3/5] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.2, max_depth=5, min_child_weight=3;, score=-15749.717 total time= 

In [10]:
# --- Calculate the mean of the actual house prices ---
mean_price = y_true.mean()
print(f"Average House Price: ${mean_price:,.2f}")

# --- Calculate the Normalized RMSE (as a percentage) ---
nrmse = (rmse / mean_price) * 100

print(f"Your RMSE of ${rmse:,.2f} is {nrmse:.2f}% of the average house price.")

Average House Price: $180,921.20
Your RMSE of $12,388.04 is 6.85% of the average house price.


Exception ignored in: <function ResourceTracker.__del__ at 0x107939bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x10831dbc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x110fa5bc0>
Traceback (most recent call last