In [24]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [25]:
# Car fuel-efficiency CSV, read straight from its raw GitHub URL
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)

In [26]:
# Replace every missing value with 0, then preview the first five rows
df = df.fillna(value=0)
df.head(5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [27]:
# Q1 - Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

# Input columns for the models, and the column being predicted
features = ['vehicle_weight', 'model_year', 'origin', 'fuel_type']
target = 'fuel_efficiency_mpg'

# Feature frame and target series, both sliced from df
X, y = df[features], df[target]

In [28]:
# Split data into train/validation/test (60/20/20):
# hold out 40% of the rows first, then cut that hold-out in half.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, random_state=1, test_size=0.4
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, random_state=1, test_size=0.5
)

In [29]:
# Convert to dictionaries and use DictVectorizer.
#
# The vectorizer is fitted on the training rows only and then applied to the
# validation and test rows.
#
# The raw validation/test rows are re-selected from X through the index that
# y_val / y_test carry, rather than read from X_val / X_test: this cell rebinds
# those two names to encoded matrices, so reading them here would make a second
# run of the cell fail (an encoded matrix has no .to_dict).
# This relies on the row labels of X being unique — true for the default
# integer index that pd.read_csv assigns.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train_full.to_dict(orient='records'))
X_val = dv.transform(X.loc[y_val.index].to_dict(orient='records'))
X_test = dv.transform(X.loc[y_test.index].to_dict(orient='records'))

In [30]:
# Train Decision Tree Regressor with max_depth=1
tree = DecisionTreeRegressor(random_state=1, max_depth=1).fit(X_train, y_train_full)
tree  # last expression, so the fitted estimator is displayed

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [31]:
# Print the feature used for splitting:
# look up entry 0 of the fitted tree's split-feature array and translate that
# column index into the name DictVectorizer gave the column.
root_feature_idx = tree.tree_.feature[0]
encoded_columns = dv.get_feature_names_out()
feature_name = encoded_columns[root_feature_idx]
print("Feature used for splitting:", feature_name)

Feature used for splitting: vehicle_weight


In [32]:
# Q2 - What's the RMSE of this model on the validation data?

# Train Random Forest Regressor
rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=10)
rf.fit(X_train, y_train_full)

# Predict on validation data
y_pred = rf.predict(X_val)

# Compute RMSE as the square root of the mean squared error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE on validation data:", rmse)

RMSE on validation data: 0.620275759263619


In [None]:
# Q3 - Experiment with different n_estimators
# Forest sizes 10, 20, ..., 200; every size gets a freshly fitted forest
# that is scored on the validation set.
n_estimators_grid = range(10, 201, 10)
for n in n_estimators_grid:
    rf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=n)
    rf.fit(X_train, y_train_full)
    y_pred = rf.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print(f"n_estimators={n:3d} --> RMSE={rmse:.3f}")

In [33]:
# Q4 - The max_depth values
depth_values = [10, 15, 20, 25]
results = {}  # max_depth -> mean validation RMSE over all forest sizes

for depth in depth_values:
    # Validation RMSE for each forest size (10, 20, ..., 200) at this depth
    rmses = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            max_depth=depth,
            n_estimators=n,
            n_jobs=-1,
            random_state=1,
        ).fit(X_train, y_train_full)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmses.append(rmse)
    mean_rmse = results[depth] = np.mean(rmses)
    print(f"max_depth={depth:2d} --> mean RMSE={mean_rmse:.3f}")

# Find the best max_depth: the key whose mean RMSE is smallest
best_depth = min(results.items(), key=lambda item: item[1])[0]
print("\nBest max_depth:", best_depth)

max_depth=10 --> mean RMSE=0.573
max_depth=15 --> mean RMSE=0.597
max_depth=20 --> mean RMSE=0.602
max_depth=25 --> mean RMSE=0.602

Best max_depth: 10


In [34]:
# Q5 - The most important feature

# Train Random Forest
rf = RandomForestRegressor(max_depth=20, n_estimators=10, n_jobs=-1, random_state=1)
rf.fit(X_train, y_train_full)

# Get feature importances, aligned position-by-position with the encoded column names
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_

for idx, name in enumerate(feature_names):
    importance = importances[idx]
    print(f"{name:25s} {importance:.4f}")

# Find the most important feature: the column with the largest importance
top_idx = np.argmax(importances)
most_important = feature_names[top_idx]
print("\nMost important feature:", most_important)

fuel_type=Diesel          0.0014
fuel_type=Gasoline        0.0012
model_year                0.0128
origin=Asia               0.0015
origin=Europe             0.0016
origin=USA                0.0016
vehicle_weight            0.9799

Most important feature: vehicle_weight


In [37]:
# Q6 - XGBoost eta

!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.1.1-py3-none-manylinux_2_28_x86_64.whl (115.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.28.7-py3-none-manylinux_2_18_x86_64.whl (296.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.8/296.8 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [xgboost]━━━[0m [32m1/2[0m [xgboost]
[1A[2KSuccessfully installed nvidia-nccl-cu12-2.28.7 xgboost-3.1.1

[1m[[0

In [44]:
import xgboost as xgb

feature_names = list(dv.get_feature_names_out())

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train_full, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)

# Define watchlist
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Settings shared by every run; only eta changes between the two models
xgb_base_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 0
}


def fit_xgb_for_eta(eta, num_boost_round=100):
    """Train a booster on dtrain with the given eta and score it on dval.

    Parameters
    ----------
    eta : float
        Value stored under the 'eta' key of the training parameters.
    num_boost_round : int, default 100
        Number of boosting rounds passed to xgb.train.

    Returns
    -------
    tuple
        (params, model, y_pred, rmse): the parameter dict used, the trained
        booster, its predictions on dval, and the RMSE of those predictions
        against y_val.
    """
    params = {'eta': eta, **xgb_base_params}
    model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=watchlist, verbose_eval=False)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return params, model, y_pred, rmse


# Train model with eta = 0.3
xgb_params_03, model_03, y_pred_03, rmse_03 = fit_xgb_for_eta(0.3)

# Train model with eta = 0.1
xgb_params_01, model_01, y_pred_01, rmse_01 = fit_xgb_for_eta(0.1)

print(f"RMSE (eta=0.3): {rmse_03:.3f}")
print(f"RMSE (eta=0.1): {rmse_01:.3f}")

RMSE (eta=0.3): 0.597
RMSE (eta=0.1): 0.566
