# Gradient Boosting Example

**Source (all credit to):** *Hands-On Machine Learning - Aurélien Géron*

# 1. Import Libraries

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor #pip3 install xgboost
from sklearn.metrics import mean_absolute_error

# 2. Prepare Data

In [19]:
# Load the housing data from the CSV file (path is relative to this notebook).
data = pd.read_csv('../datasets/melb_data.csv')

# Select the subset of columns used as predictors.
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select the target column. Explicit indexing is used instead of attribute
# access so the lookup can never be shadowed by a DataFrame attribute.
y = data['Price']

# Separate data into training and validation sets.
# random_state is pinned so that "Restart Kernel -> Run All" always produces
# the same split, and therefore comparable error figures in the cells below.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

# 3. Modeling

In [20]:
# Baseline: gradient-boosted regressor with the library's default settings.
my_model = XGBRegressor()
my_model.fit(X_train, y_train)

# Score the baseline on the held-out validation set.
predictions = my_model.predict(X_valid)
# scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, but keeping the documented order avoids silent mistakes if this
# line is later adapted to an asymmetric metric.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 243861.17887748528


# 4. Parameter Tuning

In [21]:
# Each step below rebuilds `my_model` with one more tuning parameter added;
# only the last model is used for the final predictions.
#
# NOTE: early_stopping_rounds is passed to the XGBRegressor constructor.
# Passing it to fit() was deprecated in XGBoost 1.6 and removed in 2.0, where
# it raises a TypeError. This form requires XGBoost >= 1.6.

# n_estimators = 500 (number of boosting rounds)
my_model = XGBRegressor(n_estimators=500)
my_model.fit(X_train, y_train)

# early_stopping_rounds = 5 (find the optimal time to stop iterating:
# training stops once the eval_set score has not improved for 5 rounds)
my_model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

# learning_rate = 0.05 (smaller steps per round, paired with more estimators;
# early stopping decides how many rounds are actually used)
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

# n_jobs = 4 (use parallelism - use ~equal to the number of cores on your machine)
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

# NOTE(review): the validation set is used both to pick the stopping round and
# to report the error below, so this MAE is an optimistic estimate. Consider a
# separate hold-out set for the final score.
predictions = my_model.predict(X_valid)
# scikit-learn metrics take (y_true, y_pred) in that order.
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 241365.6726159794
