In [1]:
import urllib.request
urllib.request.urlretrieve('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv', 'car_fuel_efficiency.csv')

('car_fuel_efficiency.csv', <http.client.HTTPMessage at 0x127d5d0d890>)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [3]:
df = pd.read_csv("car_fuel_efficiency.csv")
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
df = df.fillna(0)

In [5]:
num_cols = df.select_dtypes(include=['number']).columns.difference(['fuel_efficiency_mpg']).tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
target = 'fuel_efficiency_mpg'

In [6]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [7]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[cat_cols + num_cols].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[cat_cols + num_cols].to_dict(orient='records')
X_val = dv.transform(val_dict)
test_dict = df_test[cat_cols + num_cols].to_dict(orient='records')
X_test = dv.transform(test_dict)

## Q1

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
dt = DecisionTreeRegressor(max_depth=1)

dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [9]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



## Q2

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
rf_pred = rf.predict(X_val)
print(root_mean_squared_error(y_val, rf_pred))

0.4599777557336149


## Q3

In [15]:
n_estimators_range  = np.arange(10, 201, 10)
rmse_scores = {}
for n in n_estimators_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = round(root_mean_squared_error(y_val, y_pred), 3)
    rmse_scores[n] = rmse

print("\n--- RMSE Scores (rounded to 3 decimals) ---")
for n, rmse in rmse_scores.items():
    print(f"n_estimators: {n:3}, RMSE: {rmse:.3f}")

# 5. Find the minimum RMSE
min_rmse = min(rmse_scores.values())
best_n = [n for n, rmse in rmse_scores.items() if rmse == min_rmse]

print(f"\nMinimum RMSE found: {min_rmse:.3f}")
print(f"Achieved at n_estimators: {best_n}")


--- RMSE Scores (rounded to 3 decimals) ---
n_estimators:  10, RMSE: 0.460
n_estimators:  20, RMSE: 0.454
n_estimators:  30, RMSE: 0.451
n_estimators:  40, RMSE: 0.448
n_estimators:  50, RMSE: 0.446
n_estimators:  60, RMSE: 0.445
n_estimators:  70, RMSE: 0.445
n_estimators:  80, RMSE: 0.445
n_estimators:  90, RMSE: 0.445
n_estimators: 100, RMSE: 0.444
n_estimators: 110, RMSE: 0.443
n_estimators: 120, RMSE: 0.444
n_estimators: 130, RMSE: 0.443
n_estimators: 140, RMSE: 0.443
n_estimators: 150, RMSE: 0.443
n_estimators: 160, RMSE: 0.443
n_estimators: 170, RMSE: 0.443
n_estimators: 180, RMSE: 0.442
n_estimators: 190, RMSE: 0.443
n_estimators: 200, RMSE: 0.443

Minimum RMSE found: 0.442
Achieved at n_estimators: [np.int64(180)]


the RMSE stopped improving after `80`

## Q4

In [17]:
depth = [10, 15, 20, 25]
mean_rmse_scores = {}

for d in depth:
    depth_score = []
    for n in n_estimators_range:
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        depth_score.append(rmse)
    mean_rmse = np.mean(depth_score)
    mean_rmse_scores[d] = mean_rmse

print("--- Final Results (Mean RMSE) ---")
for md, mean_rmse in mean_rmse_scores.items():
    print(f"max_depth: {md}, Mean RMSE: {mean_rmse:.4f}")

# Find the max_depth with the minimum mean RMSE
best_depth = min(mean_rmse_scores, key=mean_rmse_scores.get)

print(f"\n🏆 Best max_depth: {best_depth}")

--- Final Results (Mean RMSE) ---
max_depth: 10, Mean RMSE: 0.4423
max_depth: 15, Mean RMSE: 0.4451
max_depth: 20, Mean RMSE: 0.4456
max_depth: 25, Mean RMSE: 0.4457

🏆 Best max_depth: 10


In [19]:
model = RandomForestRegressor(n_estimators=10, 
                              max_depth=20,
                              random_state=1,
                              n_jobs=-1)

model.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
feature_names = dv.get_feature_names_out()
importances = model.feature_importances_

fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
fi_df = fi_df.sort_values('importance', ascending=False).reset_index(drop=True)

# print all features sorted by importance
print(fi_df.to_string(index=False))

                     feature  importance
              vehicle_weight    0.959162
                  horsepower    0.016040
                acceleration    0.011471
         engine_displacement    0.003269
                  model_year    0.003182
               num_cylinders    0.002359
                   num_doors    0.001591
                  origin=USA    0.000555
               origin=Europe    0.000520
                 origin=Asia    0.000476
  drivetrain=All-wheel drive    0.000382
            fuel_type=Diesel    0.000344
          fuel_type=Gasoline    0.000337
drivetrain=Front-wheel drive    0.000312


`vehicle_weight`

## Q6

In [23]:
import xgboost as xgb

features = list(dv.get_feature_names_out())

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [25]:
y_perd = model.predict(dval)
print(root_mean_squared_error(y_val, y_pred))

0.44266033570440483


In [26]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [27]:
y_perd = model.predict(dval)
print(root_mean_squared_error(y_val, y_pred))

0.44266033570440483
