In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [72]:
data = pd.read_csv(r"C:\Users\USER\Documents\ml zoomcamp\car_fuel_efficiency.csv")

In [7]:
data.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [73]:
data.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [74]:
df = data.fillna(0)

In [75]:
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [76]:
from sklearn.model_selection import train_test_split

In [102]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [103]:
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values

In [104]:
del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [105]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

In [106]:
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)


In [107]:
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)


In [108]:
from sklearn.tree import DecisionTreeRegressor, export_text 

In [109]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [110]:
from sklearn.tree import export_text  
tree_text = export_text(dt, feature_names=dv.feature_names_)  
print(tree_text)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



It splitted at `vehicle_weight`

In [115]:
from sklearn.ensemble import RandomForestRegressor

In [118]:
rt = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)

In [119]:
model_rt = rt.fit(X_train, y_train)

In [120]:
y_pred = model_rt.predict(X_val)

In [121]:
from sklearn.metrics import mean_squared_error

In [125]:
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {round(rmse, 4)}")

RMSE: 0.4596


In [126]:
for n in range(10, 201, 10):
    rt = RandomForestRegressor(n_estimators=n,random_state=1,n_jobs=-1)
    model_rt = rt.fit(X_train, y_train)
    y_pred = model_rt.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f" {n} -> {round(rmse, 3)}")

 10 -> 0.46
 20 -> 0.454
 30 -> 0.452
 40 -> 0.449
 50 -> 0.447
 60 -> 0.445
 70 -> 0.445
 80 -> 0.445
 90 -> 0.445
 100 -> 0.445
 110 -> 0.444
 120 -> 0.444
 130 -> 0.444
 140 -> 0.443
 150 -> 0.443
 160 -> 0.443
 170 -> 0.443
 180 -> 0.442
 190 -> 0.442
 200 -> 0.442


In [128]:
scores = []
for m in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rt = RandomForestRegressor(max_depth=m, n_estimators=n,random_state=1,n_jobs=-1)
        model_rt = rt.fit(X_train, y_train)
        y_pred = model_rt.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        scores.append((m, n, round(rmse, 3)))
        print(f" {m}, {n} -> {round(rmse, 3)}")


columns = ['max_depth', 'n_estimators', 'rmse']        
df_scores = pd.DataFrame(scores, columns=columns)


 10, 10 -> 0.45
 10, 20 -> 0.447
 10, 30 -> 0.445
 10, 40 -> 0.443
 10, 50 -> 0.442
 10, 60 -> 0.442
 10, 70 -> 0.441
 10, 80 -> 0.441
 10, 90 -> 0.442
 10, 100 -> 0.441
 10, 110 -> 0.441
 10, 120 -> 0.441
 10, 130 -> 0.441
 10, 140 -> 0.44
 10, 150 -> 0.44
 10, 160 -> 0.44
 10, 170 -> 0.44
 10, 180 -> 0.44
 10, 190 -> 0.44
 10, 200 -> 0.44
 15, 10 -> 0.458
 15, 20 -> 0.453
 15, 30 -> 0.451
 15, 40 -> 0.449
 15, 50 -> 0.446
 15, 60 -> 0.445
 15, 70 -> 0.445
 15, 80 -> 0.445
 15, 90 -> 0.445
 15, 100 -> 0.444
 15, 110 -> 0.443
 15, 120 -> 0.444
 15, 130 -> 0.444
 15, 140 -> 0.443
 15, 150 -> 0.443
 15, 160 -> 0.443
 15, 170 -> 0.443
 15, 180 -> 0.442
 15, 190 -> 0.442
 15, 200 -> 0.442
 20, 10 -> 0.459
 20, 20 -> 0.454
 20, 30 -> 0.452
 20, 40 -> 0.449
 20, 50 -> 0.447
 20, 60 -> 0.446
 20, 70 -> 0.445
 20, 80 -> 0.446
 20, 90 -> 0.446
 20, 100 -> 0.445
 20, 110 -> 0.444
 20, 120 -> 0.444
 20, 130 -> 0.444
 20, 140 -> 0.444
 20, 150 -> 0.443
 20, 160 -> 0.443
 20, 170 -> 0.443
 20, 180 

In [130]:
df_scores.sort_values(by='rmse',ascending=True)

Unnamed: 0,max_depth,n_estimators,rmse
17,10,180,0.440
14,10,150,0.440
13,10,140,0.440
15,10,160,0.440
18,10,190,0.440
...,...,...,...
61,25,20,0.454
41,20,20,0.454
20,15,10,0.458
60,25,10,0.459


THe best depth is @ 10

In [131]:
rt = RandomForestRegressor(n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1)

model = rt.fit(X_train, y_train)


In [None]:
feats = {}
for feature, importance in zip(df_train.columns, model.feature_importances_):
    feats[feature] = importance 

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance', ascending=False)
# .plot(kind='bar')


Unnamed: 0,Gini-importance
origin,0.015998
engine_displacement,0.01148
vehicle_weight,0.003273
fuel_type,0.003212
drivetrain,0.002343
num_doors,0.001635
model_year,0.00036
num_cylinders,0.000357
horsepower,0.000345
acceleration,0.000325


- vehicle_weight
- horsepower
- acceleration
- engine_displacement ---- ans

In [145]:
import xgboost as xgb

In [147]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.feature_names_)

In [149]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [151]:
y_pred = model_xgb.predict(dval)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"RMSE: {rmse}")

RMSE: 0.45017755678087246


In [152]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [153]:
y_pred = model_xgb.predict(dval)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"RMSE: {rmse}")

RMSE: 0.42622800553359225


`eta` : 1 gave the best `rmse` value; `0.4262`