# 6. Decision Trees and Ensemble Learning

In [56]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [57]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

In [58]:
!wget $data

--2025-11-04 03:34:02--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.2’


2025-11-04 03:34:02 (5.08 MB/s) - ‘car_fuel_efficiency.csv.2’ saved [874188/874188]



In [59]:
!head car_fuel_efficiency.csv

engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
170,3,159,3413.433758606219,17.7,2003,Europe,Gasoline,All-wheel drive,0,13.231728906241411
130,5,97,3149.6649342200353,17.8,2007,USA,Gasoline,Front-wheel drive,0,13.688217435463793
170,,78,3079.03899736884,15.1,2018,Europe,Gasoline,Front-wheel drive,0,14.246340998160866
220,4,,2542.392401828378,20.2,2009,USA,Diesel,All-wheel drive,2,16.91273559598635
210,1,140,3460.870989989018,14.4,2009,Europe,Gasoline,All-wheel drive,2,12.488369121964562
190,3,,2484.883986036068,14.7,2008,Europe,Gasoline,All-wheel drive,-1,17.271818372724237
240,7,127,3006.5422872171457,22.2,2012,USA,Gasoline,Front-wheel drive,1,13.210412112385608
150,4,239,3638.6577802809,17.3,2020,USA,Diesel,All-wheel drive,1,12.848883861524026
250,1,174,2714.219309645285,10.3,2016,Asia,Diesel,Front-wheel drive,-1,16.823553726916543


##Preparation:

* Fill missing values with zeros.
* Do train/validation/test split with 60%/20%/20% distribution.
* Use the train_test_split function and set the random_state parameter to 1.
* Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [60]:
df = pd.read_csv(data)

In [61]:
df['fuel_efficiency_mpg'].head()

Unnamed: 0,fuel_efficiency_mpg
0,13.231729
1,13.688217
2,14.246341
3,16.912736
4,12.488369


In [62]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)


In [63]:
categorical_columns

['origin', 'fuel_type', 'drivetrain']

In [64]:
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    print(df[c].value_counts())

origin
europe    3254
asia      3247
usa       3203
Name: count, dtype: int64
fuel_type
gasoline    4898
diesel      4806
Name: count, dtype: int64
drivetrain
all-wheel_drive      4876
front-wheel_drive    4828
Name: count, dtype: int64


In [65]:
numerical_columns

['engine_displacement',
 'num_cylinders',
 'horsepower',
 'vehicle_weight',
 'acceleration',
 'model_year',
 'num_doors',
 'fuel_efficiency_mpg']

In [66]:
for c in numerical_columns:
    df[c] = df[c].fillna(0.0)
df.isnull().sum()


Unnamed: 0,0
engine_displacement,0
num_cylinders,0
horsepower,0
vehicle_weight,0
acceleration,0
model_year,0
origin,0
fuel_type,0
drivetrain,0
num_doors,0


In [67]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [68]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [69]:
y_train =  df_train.fuel_efficiency_mpg.values
y_val =  df_val.fuel_efficiency_mpg.values
y_test =  df_test.fuel_efficiency_mpg.values

In [70]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [71]:

from sklearn.feature_extraction import DictVectorizer

In [72]:
train_dicts = df_train.fillna(0).to_dict(orient='records')
val_dicts = df_val.fillna(0).to_dict(orient='records')
test_dicts = df_test.fillna(0).to_dict(orient='records')

In [73]:
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

## Q1 Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

Train a model with max_depth=1.

Which feature is used for splitting the data?

* 'vehicle_weight'
* 'model_year'
* 'origin'
* 'fuel_type'

In [74]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text

In [75]:
tree = DecisionTreeRegressor(max_depth=1, random_state=1)
tree.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [76]:
feature_names = dv.get_feature_names_out()
print(export_text(tree, feature_names=list(feature_names), max_depth=1))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



##Q2
Train a random forest regressor with these parameters:

n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [78]:
rf = RandomForestRegressor(random_state=1, n_estimators=10, n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [79]:
val_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
rmse

np.float64(0.45957772230927263)

# Question 3
Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

In [80]:
rmse_list = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(random_state=1, n_estimators=n, n_jobs=-1)
    rf.fit(X_train, y_train)
    val_pred = rf.predict(X_val)
    rmse = round(np.sqrt(mean_squared_error(y_val, val_pred)),3)
    rmse_list.append((n, round(rmse, 3)))
    print(n, rmse)
    if n > 10 and rmse_list[-1][1] >= rmse_list[-2][1]:
        break

10 0.46
20 0.454
30 0.452
40 0.449
50 0.447
60 0.445
70 0.445


# Question 4
Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1
What's the best max_depth, using the mean RMSE?



In [81]:
mean_rmse = {}
for d in [10, 15, 20, 25]:
  rmses = []
  for n in range(10, 201, 10):
    rf = RandomForestRegressor(random_state=1, n_estimators=n, n_jobs=-1, max_depth=d)
    rf.fit(X_train, y_train)
    val_pred = rf.predict(X_val)
    rmse = round(np.sqrt(mean_squared_error(y_val, val_pred)),3)
    rmses.append(rmse)
    if n > 10 and rmses[-1] >= rmses[-2]:
      break

  mean_rmse[d] = np.mean(rmses)

for d, rmse in mean_rmse.items():
  print(d, rmse)

print(min(mean_rmse, key=mean_rmse.get))


10 0.4448333333333334
15 0.44957142857142857
20 0.44975
25 0.449625
10


# Question 5
We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

vehicle_weight
horsepower
acceleration
engine_displacement

In [82]:
rf = RandomForestRegressor(random_state=1, n_estimators=10, n_jobs=-1, max_depth=20)
rf.fit(X_train, y_train)

feature_names = dv.get_feature_names_out()
feature_importances = pd.Series(rf.feature_importances_, index=feature_names)
feature_importances.sort_values(ascending=False)

Unnamed: 0,0
vehicle_weight,0.95915
horsepower,0.015998
acceleration,0.01148
engine_displacement,0.003273
model_year,0.003212
num_cylinders,0.002343
num_doors,0.001635
origin=usa,0.00054
origin=europe,0.000519
origin=asia,0.000462


# Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:
~~~
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
~~~
Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?


In [83]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [84]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [85]:
def run_xgb(eta):

  xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
  }
  watchlist = [(dtrain, "train"), (dval, "val")]
  evals_result = {}
  booster = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=False,
  )

  y_pred = booster.predict(dval)
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))

  val_hist = evals_result["val"]["rmse"]
  best_rmse = float(min(val_hist))
  best_iter = int(np.argmin(val_hist) + 1)
  return booster, rmse, best_rmse, best_iter

In [86]:
boostr_03, rmse_03, best_03, best_iter03 = run_xgb(0.3)
print(f"eta=0.3, RMSE={rmse_03}, best_rmse={best_03}, best_iter={best_iter03}")

eta=0.3, RMSE=0.45017755678087246, best_rmse=0.4334861295405598, best_iter=15


In [87]:
boostr_01, rmse_01, best_01, best_iter01 = run_xgb(0.1)
print(f"eta=0.1, RMSE={rmse_01}, best_rmse={best_01}, best_iter={best_iter01}")

eta=0.1, RMSE=0.45017755678087246, best_rmse=0.4334861295405598, best_iter=15
