# Step 1: Import Libraries

In [47]:
import numpy as np
import pandas as pd


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb

# Optional: suppress warnings.
# NOTE: called without a category or module filter, this ignores every warning
# raised from here on, including deprecation notices from the libraries imported
# above. Comment it out when debugging unexpected results.
import warnings
warnings.filterwarnings("ignore")



## Step 2: Data Preparation

In [48]:
# Load the dataset straight from GitHub and replace every missing value with 0.
df = pd.read_csv(
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
).fillna(0)

# Separate the regression target from the feature columns.
y = df['fuel_efficiency_mpg']
X = df.drop('fuel_efficiency_mpg', axis=1)

# 60/20/20 split: hold out 40% first, then cut that hold-out in half to get
# the validation and test sets (both splits use the same seed).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# DictVectorizer consumes one dict per row.
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# Learn the feature mapping from the training rows only, then apply the same
# mapping to the validation rows.
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_dicts)
X_val_dv = dv.transform(val_dicts)

print(
    "Data prepared. Train shape:", X_train_dv.shape,
    "Validation shape:", X_val_dv.shape,
)


Data prepared. Train shape: (5822, 14) Validation shape: (1941, 14)


## Step 3: Train a Baseline Random Forest Model

In [49]:
# Fit a 25-tree random forest with a fixed seed, using all CPU cores.
rf = RandomForestRegressor(
    n_estimators=25,
    random_state=1,
    n_jobs=-1,
).fit(X_train_dv, y_train)

# Score on the validation split; RMSE is the square root of the MSE.
y_pred = rf.predict(X_val_dv)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE on validation: {rmse:.3f}")


RMSE on validation: 0.442


## Step 4: Train Random Forest and Track RMSE

In [51]:
# Track validation RMSE as the forest grows from 10 to 200 trees in steps of 10.
#
# warm_start=True makes each fit() call keep the trees that already exist and
# train only the additional ones, so 200 trees are trained in total instead of
# 10 + 20 + ... + 200 = 2100. With a fixed integer random_state, scikit-learn
# assigns the i-th tree the same seed it would receive in a fresh fit, so each
# intermediate forest matches the one obtained by refitting from scratch.
scores = []
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, warm_start=True)

for n in range(10, 201, 10):
    # Raise the target size; the next fit() adds just the missing trees.
    rf.set_params(n_estimators=n)
    rf.fit(X_train_dv, y_train)
    y_pred = rf.predict(X_val_dv)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

scores_df = pd.DataFrame(scores, columns=['n_estimators', 'RMSE'])
display(scores_df)


Unnamed: 0,n_estimators,RMSE
0,10,0.460282
1,20,0.446157
2,30,0.439778
3,40,0.438394
4,50,0.43717
5,60,0.435591
6,70,0.436112
7,80,0.436055
8,90,0.43541
9,100,0.435277


## Identifying the best depth value

In [52]:
# For each candidate max_depth, average the validation RMSE over forests of
# 10, 20, ..., 200 trees and report one mean per depth.
#
# One warm-started forest is grown per depth: with warm_start=True each fit()
# keeps the existing trees and trains only the extra ones, so each depth costs
# 200 trees instead of 10 + 20 + ... + 200 = 2100. With a fixed integer
# random_state, scikit-learn gives the i-th tree the same seed as in a fresh
# fit, so the intermediate forests match those from refitting from scratch.
depth_values = [10, 15, 20, 25]
depth_scores = []

for d in depth_values:
    rmse_list = []
    # A fresh estimator per depth so trees of different depths never mix.
    rf = RandomForestRegressor(
        max_depth=d, n_estimators=10, random_state=1, n_jobs=-1, warm_start=True
    )
    for n in range(10, 201, 10):
        # Raise the target size; the next fit() adds just the missing trees.
        rf.set_params(n_estimators=n)
        rf.fit(X_train_dv, y_train)
        y_pred = rf.predict(X_val_dv)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    mean_rmse = np.mean(rmse_list)
    depth_scores.append((d, mean_rmse))

depth_df = pd.DataFrame(depth_scores, columns=['max_depth', 'mean_RMSE'])
display(depth_df)


Unnamed: 0,max_depth,mean_RMSE
0,10,0.436247
1,15,0.437825
2,20,0.437693
3,25,0.437653


## Step 5: Feature Importance and Model Evaluation

In [None]:
# Refit a small forest (10 trees, depth capped at 20) to inspect which
# features it relies on.
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
rf.fit(X_train_dv, y_train)

# Label each importance with its DictVectorizer feature name and rank them,
# largest first; show the ten highest.
importance = pd.Series(
    data=rf.feature_importances_,
    index=dv.feature_names_,
).sort_values(ascending=False)
display(importance.head(10))


vehicle_weight         0.959878
horsepower             0.015933
acceleration           0.011442
engine_displacement    0.003159
model_year             0.003066
num_cylinders          0.002323
num_doors              0.001576
origin=USA             0.000496
origin=Asia            0.000431
origin=Europe          0.000419
dtype: float64

## Step 6: Train XGBoost Model

In [53]:
# Train two XGBoost regressors that differ only in learning rate (eta) and
# compare their validation RMSE.
features = X_train.columns.tolist()

# One-hot encode the categorical columns.
categorical_columns = ['origin', 'fuel_type', 'drivetrain']
X_train_encoded = pd.get_dummies(X_train, columns=categorical_columns)
X_val_encoded = pd.get_dummies(X_val, columns=categorical_columns)

# get_dummies derives its columns from the values present in each frame, so a
# category missing from one split would give the two frames different columns.
# Align validation to the training layout: dummy columns missing from
# validation are added as all-zero, and columns never seen in training are
# dropped.
X_val_encoded = X_val_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

dtrain = xgb.DMatrix(X_train_encoded, label=y_train)
dval = xgb.DMatrix(X_val_encoded, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params_03 = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Same configuration with a smaller learning rate.
xgb_params_01 = xgb_params_03.copy()
xgb_params_01['eta'] = 0.1

print("Training with eta=0.3")
model_03 = xgb.train(xgb_params_03, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

print("Training with eta=0.1")
model_01 = xgb.train(xgb_params_01, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

y_pred_03 = model_03.predict(dval)
y_pred_01 = model_01.predict(dval)

rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))

print(f"RMSE eta=0.3: {rmse_03:.3f}")
print(f"RMSE eta=0.1: {rmse_01:.3f}")


Training with eta=0.3
Training with eta=0.1
RMSE eta=0.3: 0.443
RMSE eta=0.1: 0.417
