## Homework CH06

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Car fuel-efficiency dataset, read directly from the raw CSV on GitHub.
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Replace every missing value with 0; fillna returns a new frame, so the raw
# `df` stays available unchanged.
df_filled = df.fillna(value=0)

In [4]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the feature columns.
y = df_filled['fuel_efficiency_mpg']
X = df_filled.drop(columns='fuel_efficiency_mpg')

# Step 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1,
)

# Step 2: take 25% of the remaining 80% for validation, which gives a
# 60/20/20 train/validation/test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=1,
)

In [5]:
# Vectorizer that turns record dicts into a sparse feature matrix.
dv = DictVectorizer(sparse=True)

# Convert each split into a list of {column: value} records.
train_dict, val_dict, test_dict = (
    split.to_dict(orient='records') for split in (X_train, X_val, X_test)
)

# Learn the feature mapping on the training split only, then apply that same
# mapping to the validation and test splits.
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded, X_test_encoded = dv.transform(val_dict), dv.transform(test_dict)

## Question 1

In [7]:
# Depth-1 regression tree: a single split at the root.
dt = DecisionTreeRegressor(
    max_depth=1,
    random_state=1,
)
dt.fit(X=X_train_encoded, y=y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [8]:
# Index 0 of `tree_.feature` is the root node; its entry is the column index
# the fitted tree split on.
split_feature_index = dt.tree_.feature[0]

# Translate that column index back into the vectorizer's feature name.
feature_names = dv.get_feature_names_out()
split_feature_name = feature_names[split_feature_index]
print("Feature used for the split:", split_feature_name)

Feature used for the split: vehicle_weight


#### Answer : vehicle_weight

## Question 2

In [9]:
# Baseline forest: 10 trees, fixed seed, all available cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_encoded, y_train)

# Score on the validation split; RMSE is the square root of the MSE.
y_val_pred = rf.predict(X_val_encoded)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)

In [10]:
# Print the validation RMSE of the 10-tree forest computed in the previous cell.
print(rmse)

0.4595777223092726


#### Answer : 0.46

## Question 3

In [16]:
# Forest sizes to try: 10, 20, ..., 200 trees.
n_estimators_range = range(10, 201, 10)
# Smallest RMSE gain that still counts as a real improvement.
improvement_threshold = 1e-3

# Running state filled in by the sweep over n_estimators_range.
rmse_scores = list()
best_rmse = np.inf
stop_improving_at = None

In [17]:
for n_est in n_estimators_range:
    # Fit a forest with the current number of trees.
    rf = RandomForestRegressor(n_estimators=n_est, random_state=1, n_jobs=-1)
    rf.fit(X_train_encoded, y_train)

    # Validation RMSE for this forest size, kept for later comparison.
    y_val_pred = rf.predict(X_val_encoded)
    mse = mean_squared_error(y_val, y_val_pred)
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)

    # Gain relative to the best RMSE recorded so far. Record this forest size
    # when the gain clears the threshold, or when nothing has been recorded yet.
    improvement = best_rmse - rmse
    if improvement > improvement_threshold or stop_improving_at is None:
        best_rmse = rmse
        stop_improving_at = n_est

In [18]:
# n_est, rmse and improvement still hold whatever the sweep loop assigned last,
# so this line reports the final iteration only.
print(f"n_estimators: {n_est:3d} | RMSE: {rmse:.6f} | Improvement: {improvement:.6f}")

# Separator line followed by the forest size recorded by the sweep.
print(
    "\n" + "=" * 50,
    f"RMSE stopped improving significantly after n_estimators = {stop_improving_at}",
    sep="\n",
)

n_estimators: 200 | RMSE: 0.442479 | Improvement: -0.000117

RMSE stopped improving significantly after n_estimators = 180


In [19]:
# Fallback answer when no plateau is ever detected: the first grid value.
stop_improving_at = n_estimators_range[0]
# Explicit "already found" flag. Comparing stop_improving_at against the
# fallback value cannot distinguish "not set yet" from "plateau found at the
# first grid value"; in that case a later small step would overwrite the
# answer and print the FIRST TIME line again.
plateau_found = False

# Compare every forest size with the previous one on the grid.
for i in range(1, len(rmse_scores)):
    improvement = rmse_scores[i-1] - rmse_scores[i]
    print(f"{n_estimators_range[i-1]:3d} -> {n_estimators_range[i]:3d}: RMSE improvement = {improvement:.6f}")

    # Only the first step whose change falls below the threshold is recorded;
    # the loop keeps running so every step is still printed.
    if not plateau_found and abs(improvement) < improvement_threshold:
        plateau_found = True
        stop_improving_at = n_estimators_range[i-1]
        print(f"  ^ FIRST TIME: Improvement < {improvement_threshold}, RMSE effectively stopped improving after {n_estimators_range[i-1]}")

print(f"\nFinal Answer: RMSE stops improving significantly after n_estimators = {stop_improving_at}")

 10 ->  20: RMSE improvement = 0.005987
 20 ->  30: RMSE improvement = 0.001904
 30 ->  40: RMSE improvement = 0.002966
 40 ->  50: RMSE improvement = 0.002064
 50 ->  60: RMSE improvement = 0.001197
 60 ->  70: RMSE improvement = 0.000333
  ^ FIRST TIME: Improvement < 0.001, RMSE effectively stopped improving after 60
 70 ->  80: RMSE improvement = 0.000142
 80 ->  90: RMSE improvement = 0.000123
 90 -> 100: RMSE improvement = 0.000210
100 -> 110: RMSE improvement = 0.001073
110 -> 120: RMSE improvement = -0.000333
120 -> 130: RMSE improvement = 0.000209
130 -> 140: RMSE improvement = 0.000348
140 -> 150: RMSE improvement = 0.000457
150 -> 160: RMSE improvement = 0.000136
160 -> 170: RMSE improvement = -0.000040
170 -> 180: RMSE improvement = 0.000440
180 -> 190: RMSE improvement = -0.000132
190 -> 200: RMSE improvement = 0.000015

Final Answer: RMSE stops improving significantly after n_estimators = 60


#### Answer : 80 (the run above plateaus at 60; 80 is presumably the nearest of the listed homework options)

## Question 4

In [20]:
# Tree depths to compare; each one is evaluated over the same forest-size grid.
max_depth_values = [10, 15, 20, 25]
n_estimators_list = [*range(10, 201, 10)]

# Will map max_depth -> mean validation RMSE across the grid.
results = dict()

In [21]:
for max_depth in max_depth_values:
    # Scores for this depth only. A dedicated name is used so the earlier
    # n_estimators sweep's `rmse_scores` list is not overwritten; cells that
    # read that list can then be re-run after this one without silently
    # analysing the wrong scores.
    depth_rmse_scores = []

    print(f"\nmax_depth = {max_depth}:")
    print("-" * 30)

    for n_est in n_estimators_list:
        # Train a forest with the current depth cap and number of trees.
        rf = RandomForestRegressor(
            n_estimators=n_est,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train_encoded, y_train)

        # Validation RMSE for this (max_depth, n_estimators) pair.
        y_val_pred = rf.predict(X_val_encoded)
        mse = mean_squared_error(y_val, y_val_pred)
        rmse = np.sqrt(mse)
        depth_rmse_scores.append(rmse)

        if n_est % 50 == 0:  # Print every 50 estimators to track progress
            print(f"  n_estimators: {n_est:3d} | RMSE: {rmse:.6f}")

    # Summarise this depth by its mean RMSE over the whole n_estimators grid.
    mean_rmse = np.mean(depth_rmse_scores)
    results[max_depth] = mean_rmse

    print(f"Mean RMSE for max_depth {max_depth}: {mean_rmse:.6f}")


max_depth = 10:
------------------------------
  n_estimators:  50 | RMSE: 0.441957
  n_estimators: 100 | RMSE: 0.441217
  n_estimators: 150 | RMSE: 0.439943
  n_estimators: 200 | RMSE: 0.439845
Mean RMSE for max_depth 10: 0.441808

max_depth = 15:
------------------------------
  n_estimators:  50 | RMSE: 0.446318
  n_estimators: 100 | RMSE: 0.444341
  n_estimators: 150 | RMSE: 0.442732
  n_estimators: 200 | RMSE: 0.442346
Mean RMSE for max_depth 15: 0.445417

max_depth = 20:
------------------------------
  n_estimators:  50 | RMSE: 0.447312
  n_estimators: 100 | RMSE: 0.445307
  n_estimators: 150 | RMSE: 0.443308
  n_estimators: 200 | RMSE: 0.442905
Mean RMSE for max_depth 20: 0.446253

max_depth = 25:
------------------------------
  n_estimators:  50 | RMSE: 0.446901
  n_estimators: 100 | RMSE: 0.444778
  n_estimators: 150 | RMSE: 0.442937
  n_estimators: 200 | RMSE: 0.442479
Mean RMSE for max_depth 25: 0.445910


In [22]:
# Select the (max_depth, mean RMSE) pair with the smallest mean RMSE;
# on a tie the depth that was inserted first wins.
best_max_depth, best_mean_rmse = min(results.items(), key=lambda item: item[1])

print(f"Best max_depth: {best_max_depth}")
print(f"Best mean RMSE: {best_mean_rmse:.6f}")

Best max_depth: 10
Best mean RMSE: 0.441808


#### Answer : 10

## Question 5

In [23]:
# Forest used for the feature-importance question: 10 trees, depth capped at 20.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train_encoded, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Importances from the fitted forest, and the vectorizer's names for the
# encoded columns they are paired with below.
feature_importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

In [25]:
# Pair every encoded feature with its importance, most important first.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values(by='importance', ascending=False)

print("All feature importances:")
print("=" * 50)
# One aligned line per feature: name padded to 30 characters, 6-decimal score.
for entry in importance_df.itertuples(index=False):
    print(f"{entry.feature:30} : {entry.importance:.6f}")

All feature importances:
vehicle_weight                 : 0.959150
horsepower                     : 0.015998
acceleration                   : 0.011480
engine_displacement            : 0.003273
model_year                     : 0.003212
num_cylinders                  : 0.002343
num_doors                      : 0.001635
origin=USA                     : 0.000540
origin=Europe                  : 0.000519
origin=Asia                    : 0.000462
fuel_type=Gasoline             : 0.000360
drivetrain=All-wheel drive     : 0.000357
drivetrain=Front-wheel drive   : 0.000345
fuel_type=Diesel               : 0.000325


#### Answer : vehicle_weight

## Question 6

In [26]:
# Encoded column names as a plain Python list (via .tolist()), ready to be
# handed to xgb.DMatrix as feature_names.
feature_names = dv.get_feature_names_out().tolist()

In [27]:
# Wrap the encoded train/validation splits, their targets and the column
# names in XGBoost DMatrix objects.
dtrain = xgb.DMatrix(data=X_train_encoded, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(data=X_val_encoded, label=y_val, feature_names=feature_names)

In [28]:
# Learning rates to compare, and the dict that will map eta -> validation RMSE.
eta_values = [0.3, 0.1]
results = dict()

# (dataset, display name) pairs passed to xgb.train as `evals`.
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [29]:
print("Comparing XGBoost with different eta values:")
print("=" * 50)

for eta in eta_values:
    # Only the learning rate changes between runs; every other setting is fixed.
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
    }

    print(f"\nTraining with eta = {eta}")
    print("-" * 30)

    # 100 boosting rounds; per-round evaluation logging is switched off.
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        verbose_eval=False,
    )

    # Validation RMSE of the final model, stored under its eta.
    y_val_pred = model.predict(dval)
    results[eta] = rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    print(f"Validation RMSE with eta={eta}: {rmse:.6f}")

Comparing XGBoost with different eta values:

Training with eta = 0.3
------------------------------
Validation RMSE with eta=0.3: 0.450178

Training with eta = 0.1
------------------------------
Validation RMSE with eta=0.1: 0.426228


In [30]:
# Banner: blank line, rule, title, rule.
print("\n" + "=" * 50, "FINAL COMPARISON:", "=" * 50, sep="\n")

# One line per learning rate with its validation RMSE.
for eta, rmse in results.items():
    print(f"eta = {eta}: Validation RMSE = {rmse:.6f}")



FINAL COMPARISON:
eta = 0.3: Validation RMSE = 0.450178
eta = 0.1: Validation RMSE = 0.426228


In [31]:
# The eta with the lowest validation RMSE wins; on a tie the first inserted wins.
best_eta, best_rmse = min(results.items(), key=lambda item: item[1])

print(f"\nBest eta: {best_eta} (RMSE: {best_rmse:.6f})")

# RMSEs closer than 0.0001 are reported as equal; otherwise name the winner.
if not abs(results[0.3] - results[0.1]) < 0.0001:
    print(f"eta = {best_eta} leads to the best RMSE score")
else:
    print("Both eta values give equal RMSE")



Best eta: 0.1 (RMSE: 0.426228)
eta = 0.1 leads to the best RMSE score


#### Answer : 0.1