In [54]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [55]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

!wget $data

In [56]:
# Load the dataset into a DataFrame; `data` is the CSV URL string.
df = pd.read_csv(data)

In [57]:
# Show the column names exactly as they were read from the CSV.
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [58]:
# Normalise column names to lower case. The names displayed by the previous
# cell are already lower case, so for this file the call changes nothing.
df.columns = df.columns.str.lower()

In [59]:
# Preview the first rows; the sample shows missing values in num_cylinders
# and horsepower.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [60]:
# Column groups by type. `numerical` also contains the prediction target,
# fuel_efficiency_mpg.
categorical = [
    'origin',
    'fuel_type',
    'drivetrain',
]
numerical = [
    'engine_displacement',
    'num_cylinders',
    'horsepower',
    'vehicle_weight',
    'acceleration',
    'model_year',
    'num_doors',
    'fuel_efficiency_mpg',
]

In [61]:
# Replace missing values with 0.0 in every column listed in `numerical`,
# handled as one block assignment instead of column by column.
df[numerical] = df[numerical].fillna(0.0)

In [62]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [63]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [64]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [66]:
# Pull the target column out of each split as a Series.
y_train, y_val, y_test = (
    split['fuel_efficiency_mpg'] for split in (df_train, df_val, df_test)
)

In [67]:
# Remove the target from the feature frames so it cannot be used as an input.
# This mutates the frames in place: re-running the cell raises KeyError.
for _frame in (df_train, df_val, df_test):
    del _frame['fuel_efficiency_mpg']

In [68]:
# Convert the training frame to a list of per-row dicts ({column: value}),
# the input format fed to DictVectorizer in a later cell.
train_dicts = df_train.to_dict(orient='records')

In [None]:
# Question 1
# Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

# Train a model with max_depth=1.
# Which feature is used for splitting the data?

In [69]:
# Fit the vectorizer on the training rows only and encode them. The resulting
# feature columns are the ones listed by dv.get_feature_names_out() further
# down (string fields appear as `name=value` columns).
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [70]:
# Encode the validation rows with the already-fitted vectorizer: transform()
# rather than fit_transform(), so the feature set comes from training data.
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [71]:
# Question 1: a depth-1 regression tree (a single split), so the one feature
# it splits on is the answer.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [72]:
# Importance score per feature from the fitted tree, plus the matching feature
# names from the vectorizer (both are paired up positionally in a later cell).
feature_importances = dt.feature_importances_
feature_names = list(dv.get_feature_names_out())

In [73]:
# Display the encoded feature names.
feature_names

['acceleration',
 'drivetrain=All-wheel drive',
 'drivetrain=Front-wheel drive',
 'engine_displacement',
 'fuel_type=Diesel',
 'fuel_type=Gasoline',
 'horsepower',
 'model_year',
 'num_cylinders',
 'num_doors',
 'origin=Asia',
 'origin=Europe',
 'origin=USA',
 'vehicle_weight']

In [74]:
# Pair each feature name with its importance and list the most important
# first; with a single split, only the splitting feature is non-zero.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(importance_df)

                         Feature  Importance
13                vehicle_weight         1.0
0                   acceleration         0.0
2   drivetrain=Front-wheel drive         0.0
3            engine_displacement         0.0
4               fuel_type=Diesel         0.0
1     drivetrain=All-wheel drive         0.0
5             fuel_type=Gasoline         0.0
6                     horsepower         0.0
8                  num_cylinders         0.0
7                     model_year         0.0
9                      num_doors         0.0
10                   origin=Asia         0.0
11                 origin=Europe         0.0
12                    origin=USA         0.0


In [None]:
# Question 2
# Train a random forest regressor with these parameters:

# n_estimators=10
# random_state=1
# n_jobs=-1 (optional - to make training faster)

In [80]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [76]:
# Question 2: random forest with 10 trees; random_state fixes the seed and
# n_jobs=-1 is passed to speed up training (per the question text).
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_regressor.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [78]:
# Predict the target for the validation rows with the fitted forest.
y_pred = rf_regressor.predict(X_val)

In [81]:
# Validation RMSE: square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f"RMSE for Random Forest Regressor: {rmse}")

RMSE for Random Forest Regressor: 0.45957772230927263


In [None]:
# Question 3
# Now let's experiment with the n_estimators parameter

# Try different values of this parameter from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate the model on the validation dataset.
# After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

In [83]:
# Question 3: sweep n_estimators over 10, 20, ..., 200 and report the
# validation RMSE rounded to 3 decimals for each setting.
# Each iteration trains a fresh forest and rebinds rf_regressor, y_pred and
# rmse, so after the loop they hold the n_estimators=200 results rather than
# the Question 2 values.
for n in range(10, 201, 10):
    rf_regressor = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train, y_train)  # fit() returns the estimator itself
    y_pred = rf_regressor.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rounded_rmse = np.round(rmse, 3)
    print(f"n_estimator={n}, RMSE for Random Forest Regressor: {rounded_rmse}")

n_estimator=10, RMSE for Random Forest Regressor: 0.46
n_estimator=20, RMSE for Random Forest Regressor: 0.454
n_estimator=30, RMSE for Random Forest Regressor: 0.452
n_estimator=40, RMSE for Random Forest Regressor: 0.449
n_estimator=50, RMSE for Random Forest Regressor: 0.447
n_estimator=60, RMSE for Random Forest Regressor: 0.445
n_estimator=70, RMSE for Random Forest Regressor: 0.445
n_estimator=80, RMSE for Random Forest Regressor: 0.445
n_estimator=90, RMSE for Random Forest Regressor: 0.445
n_estimator=100, RMSE for Random Forest Regressor: 0.445
n_estimator=110, RMSE for Random Forest Regressor: 0.444
n_estimator=120, RMSE for Random Forest Regressor: 0.444
n_estimator=130, RMSE for Random Forest Regressor: 0.444
n_estimator=140, RMSE for Random Forest Regressor: 0.443
n_estimator=150, RMSE for Random Forest Regressor: 0.443
n_estimator=160, RMSE for Random Forest Regressor: 0.443
n_estimator=170, RMSE for Random Forest Regressor: 0.443
n_estimator=180, RMSE for Random Forest R