# Car Fuel Efficiency Prediction - Homework

Predicting fuel efficiency (MPG) using regression models

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error

## Data Preparation

In [None]:
# Load the dataset and replace every missing value with zero in one step.
df = pd.read_csv('car_fuel_efficiency.csv').fillna(0)

# Separate the predictors from the regression target.
X = df.drop(columns='fuel_efficiency_mpg')
y = df['fuel_efficiency_mpg'].values

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining
# 80% (i.e. 20% of the whole) for validation. The seed is fixed so the
# partitions are reproducible.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=1
)

# Fit the vectorizer on the training rows only; validation and test rows
# are transformed with the vocabulary learned from train.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val, X_test = (
    dv.transform(part.to_dict(orient='records')) for part in (X_val, X_test)
)

## Q1: Decision Tree - Split Feature

In [None]:
# With max_depth=1 the tree contains a single split, so the feature used
# at node 0 (the root) is the answer.
model = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# tree_.feature holds a column index per node; map it back to a name
# through the vectorizer's feature list.
root_feature_index = model.tree_.feature[0]
print(f'Split feature: {dv.feature_names_[root_feature_index]}')

## Q2: Random Forest - RMSE with n_estimators=10

In [None]:
# Baseline forest: 10 trees with a fixed seed.
forest_settings = {'n_estimators': 10, 'random_state': 1, 'n_jobs': -1}
model = RandomForestRegressor(**forest_settings).fit(X_train, y_train)

# Score on the validation split; RMSE is the square root of the MSE.
y_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)
print(f'RMSE: {rmse:.2f}')

## Q3: n_estimators - When RMSE stops improving

In [None]:
# Grow the forest in steps of 10 trees (10..200) and stop at the first size
# whose validation RMSE, rounded to 3 decimals, is no better than the
# RMSE of the previous size.
best_rmse = float('inf')
for n in range(10, 201, 10):
    model = RandomForestRegressor(
        n_estimators=n, random_state=1, n_jobs=-1
    ).fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'n_estimators={n}: RMSE={rmse:.3f}')

    # Compare at 3-decimal precision so tiny fluctuations do not count
    # as an improvement.
    plateaued = round(rmse, 3) >= round(best_rmse, 3)
    if plateaued:
        print(f'RMSE stopped improving at n_estimators={n}')
        break

    # Still improving: remember this score for the next comparison.
    best_rmse = rmse

## Q4: Best max_depth

In [None]:
# For every candidate depth, average the validation RMSE over forests of
# 10..200 trees (step 10) and keep the depth with the lowest average.
best_depth = None
best_mean_rmse = float('inf')
tree_counts = range(10, 210, 10)

for depth in (10, 15, 20, 25):
    depth_rmses = []
    for n in tree_counts:
        model = RandomForestRegressor(
            n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1
        ).fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        depth_rmses.append(rmse)

    depth_mean = np.mean(depth_rmses)
    print(f'max_depth={depth}: mean RMSE={depth_mean:.3f}')

    # Only a strictly lower mean replaces the current best, so on a tie
    # the earlier (shallower) depth wins.
    if not depth_mean < best_mean_rmse:
        continue
    best_mean_rmse = depth_mean
    best_depth = depth

print(f'\nBest max_depth: {best_depth}')

## Q5: Feature Importance

In [None]:
# Fit a 10-tree forest capped at depth 20 and report the importance the
# forest assigns to each feature of interest.
model = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train, y_train)

features = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']
for feature in features:
    # Importances are ordered like the vectorizer's feature list, so the
    # name's position there is also its position in feature_importances_.
    position = dv.feature_names_.index(feature)
    importance = model.feature_importances_[position]
    print(f'{feature}: {importance:.3f}')

## Q6: XGBoost - Best eta

In [None]:
# Wrap the train/validation matrices in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Track the winning learning rate so the cell actually answers "best eta",
# the same way the max_depth sweep reports its best setting.
best_eta = None
best_eta_rmse = float('inf')

for eta in [0.3, 0.1]:
    # Only eta varies between runs; every other parameter is held fixed so
    # the two validation scores are directly comparable.
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

    # Score the 100-round booster on the validation split.
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print(f'eta={eta}: RMSE={rmse:.4f}')

    # Strict comparison: on an exact tie the first eta tried is kept.
    if rmse < best_eta_rmse:
        best_eta_rmse = rmse
        best_eta = eta

print(f'\nBest eta: {best_eta}')