In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the car fuel-efficiency CSV straight from its GitHub raw URL.
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv",
)

In [3]:
# Peek at the first row to confirm the columns were parsed.
data.head(n=1)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729


In [4]:
# Number of missing values in each column.
data.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [5]:
# Replace every missing value with 0.
# Rebinding the name instead of using inplace=True avoids mutating the loaded
# frame behind the back of other cells; the resulting `data` is the same.
data = data.fillna(0)

In [7]:
# 60/20/20 split done in two stages, both seeded with random_state=1.
# Stage 1: hold out 40% of the rows; what remains is the training set.
train_data, temp_data = train_test_split(
    data,
    test_size=0.4,
    random_state=1,
)

# Stage 2: cut the held-out 40% in half -> validation and test.
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=1,
)

# Report how many rows landed in each split.
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Train: 5822, Val: 1941, Test: 1941


In [8]:
# Feature frames: every split with the target column dropped.
X_train, X_val, X_test = (
    split.drop(columns=['fuel_efficiency_mpg'])
    for split in (train_data, val_data, test_data)
)

# Target series: the 'fuel_efficiency_mpg' column of every split.
y_train, y_val, y_test = (
    split['fuel_efficiency_mpg']
    for split in (train_data, val_data, test_data)
)

In [10]:
from sklearn.feature_extraction import DictVectorizer

In [11]:
# Vectorizer that turns row dictionaries into a sparse feature matrix.
dv = DictVectorizer(sparse=True)

# One list of {column: value} records per split.
train_dict, val_dict, test_dict = (
    frame.to_dict(orient='records')
    for frame in (X_train, X_val, X_test)
)

# Fit on the training records only, then encode them.
X_train_matrix = dv.fit_transform(train_dict)

# Validation and test records are encoded with the already-fitted vectorizer.
X_val_matrix, X_test_matrix = (
    dv.transform(records)
    for records in (val_dict, test_dict)
)

# All three matrices should have the same number of columns.
print(f"Training matrix shape: {X_train_matrix.shape}")
print(f"Validation matrix shape: {X_val_matrix.shape}")
print(f"Test matrix shape: {X_test_matrix.shape}")

Training matrix shape: (5822, 14)
Validation matrix shape: (1941, 14)
Test matrix shape: (1941, 14)


### Q1

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
# Depth-limited (max_depth=1) regression tree with a fixed seed.
dt_reg = DecisionTreeRegressor(
    max_depth=1,
    random_state=1,
)

In [14]:
# Train the tree on the encoded training features and targets.
dt_reg.fit(X=X_train_matrix, y=y_train)

In [15]:
# Column names of the encoded matrix, in the vectorizer's order.
feature_names = dv.get_feature_names_out()

# Importance score of each encoded column according to the fitted tree.
importances = dt_reg.feature_importances_

# Map feature name -> importance score.
feature_importance_dict = {
    name: score for name, score in zip(feature_names, importances)
}

# (name, score) pairs ordered from most to least important.
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report only the features with a positive importance score.
for feature, importance in sorted_features:
    if not importance > 0:
        continue
    print(f"Feature: {feature}, Importance: {importance:.4f}")

Feature: vehicle_weight, Importance: 1.0000


### Q2

Train a random forest regressor with these parameters:

* `n_estimators=10`
* `random_state=1`
* `n_jobs=-1` (optional, to make training faster)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
# Random forest with 10 trees, a fixed seed, and n_jobs=-1.
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)

In [19]:
# Train the forest on the encoded training features and targets.
rf.fit(X=X_train_matrix, y=y_train)

In [32]:
from sklearn.metrics import root_mean_squared_error

In [28]:
# Predictions of the fitted forest for the validation matrix.
y_val_pred = rf.predict(X=X_val_matrix)

In [33]:
# Validation RMSE: true targets versus the forest's predictions.
rmse = root_mean_squared_error(
    y_true=y_val,
    y_pred=y_val_pred,
)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.4603


### Q3

In [35]:
import numpy as np

In [37]:
# Candidate forest sizes for the sweep: 10, 20, ..., 200 (stop is exclusive).
n_estimators = np.arange(start=10, stop=200 + 1, step=10)
n_estimators

array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190, 200])

In [38]:
# Retrain the forest for every candidate size and report its validation RMSE.
for val in n_estimators:
    # fit() returns the estimator itself, so `rf` is the trained forest.
    rf = RandomForestRegressor(
        n_estimators=val,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train_matrix, y_train)

    y_val_pred = rf.predict(X_val_matrix)
    rmse = root_mean_squared_error(y_val, y_val_pred)
    print(f"n_estimators: {val}, RMSE: {rmse:.4f}")

n_estimators: 10, RMSE: 0.4603
n_estimators: 20, RMSE: 0.4462
n_estimators: 30, RMSE: 0.4398
n_estimators: 40, RMSE: 0.4384
n_estimators: 50, RMSE: 0.4372
n_estimators: 60, RMSE: 0.4356
n_estimators: 70, RMSE: 0.4361
n_estimators: 80, RMSE: 0.4361
n_estimators: 90, RMSE: 0.4354
n_estimators: 100, RMSE: 0.4353
n_estimators: 110, RMSE: 0.4349
n_estimators: 120, RMSE: 0.4355
n_estimators: 130, RMSE: 0.4349
n_estimators: 140, RMSE: 0.4351
n_estimators: 150, RMSE: 0.4352
n_estimators: 160, RMSE: 0.4352
n_estimators: 170, RMSE: 0.4352
n_estimators: 180, RMSE: 0.4352
n_estimators: 190, RMSE: 0.4354
n_estimators: 200, RMSE: 0.4350


### Q4

Let's select the best max_depth:

* Try different values of `max_depth`: `[10, 15, 20, 25]`
* For each of these values,
  * try different values of `n_estimators` from 10 to 200 (with step 10)
  * calculate the mean RMSE
* Fix the random seed: `random_state=1`

What's the best `max_depth`, using the mean RMSE?

10
15
20
25

In [39]:
# Candidate tree depths for the grid search: 10, 15, 20, 25 (stop is exclusive).
max_depth = np.arange(start=10, stop=25 + 1, step=5)
max_depth

array([10, 15, 20, 25])

In [None]:
# Grid search over max_depth.
# For each depth, sweep every n_estimators value, collect the validation RMSE
# of each forest, and average them so depths can be compared on one number.
mean_rmse_by_depth = {}  # max_depth -> mean validation RMSE over the sweep

for val_depth in max_depth:
    rmses = []
    for val_est in n_estimators:
        rf = RandomForestRegressor(n_estimators=val_est, max_depth=val_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train_matrix, y_train)
        y_val_pred = rf.predict(X_val_matrix)
        rmse = root_mean_squared_error(y_val, y_val_pred)
        rmses.append(rmse)
        print(f"max_depth: {val_depth}, n_estimators: {val_est}, RMSE: {rmse:.4f}")

    mean_rmse = np.mean(rmses)
    # Plain int keys so the summary below prints cleanly.
    mean_rmse_by_depth[int(val_depth)] = mean_rmse
    print(f"max_depth: {val_depth}, MEAN RMSEs: {mean_rmse:.4f}")

# Answer the question directly instead of leaving the reader to scan the log:
# the best depth is the one with the lowest mean RMSE (compared at full
# precision, not at the 4 decimals shown above).
best_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
print(f"Best max_depth: {best_depth} (mean RMSE: {mean_rmse_by_depth[best_depth]:.4f})")

max_depth: 10, n_estimators: 10, RMSE: 0.4515
max_depth: 10, n_estimators: 20, RMSE: 0.4426
max_depth: 10, n_estimators: 30, RMSE: 0.4384
max_depth: 10, n_estimators: 40, RMSE: 0.4372
max_depth: 10, n_estimators: 50, RMSE: 0.4363
max_depth: 10, n_estimators: 60, RMSE: 0.4352
max_depth: 10, n_estimators: 70, RMSE: 0.4351
max_depth: 10, n_estimators: 80, RMSE: 0.4351
max_depth: 10, n_estimators: 90, RMSE: 0.4349
max_depth: 10, n_estimators: 100, RMSE: 0.4346
max_depth: 10, n_estimators: 110, RMSE: 0.4342
max_depth: 10, n_estimators: 120, RMSE: 0.4344
max_depth: 10, n_estimators: 130, RMSE: 0.4343
max_depth: 10, n_estimators: 140, RMSE: 0.4344
max_depth: 10, n_estimators: 150, RMSE: 0.4346
max_depth: 10, n_estimators: 160, RMSE: 0.4346
max_depth: 10, n_estimators: 170, RMSE: 0.4345
max_depth: 10, n_estimators: 180, RMSE: 0.4345
max_depth: 10, n_estimators: 190, RMSE: 0.4344
max_depth: 10, n_estimators: 200, RMSE: 0.4342
max_depth: 10, MEAN RMSEs: 0.4362
max_depth: 15, n_estimators: 10, RM