In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# optional: suppress warnings
import warnings
warnings.filterwarnings("ignore")



: 

In [None]:
# Load the car fuel-efficiency dataset directly from its public GitHub URL.
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")

# Fill missing values with zeros
df = df.fillna(0)

# Define target
y = df.fuel_efficiency_mpg

# Define features (everything except the target column)
X = df.drop(columns=['fuel_efficiency_mpg'])

# Split 60% / 20% / 20%: first hold out 40% of the rows, then cut that
# hold-out in half to obtain the validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Sanity check: the three partitions must cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(df), "split sizes do not add up"

# Convert to dicts for DictVectorizer
train_dicts = X_train.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# Vectorize: fit the feature mapping on the training records only, then reuse
# that same mapping for validation so both matrices share a column layout.
dv = DictVectorizer(sparse=True)
X_train_dv = dv.fit_transform(train_dicts)
X_val_dv = dv.transform(val_dicts)

# The original message contained a mis-decoded (mojibake) check-mark emoji.
print("✅ Data prepared. Train shape:", X_train_dv.shape)


In [None]:
# Train a depth-1 tree (a single split) to see which feature it splits on.
# random_state is fixed, like every other estimator in this notebook, so that
# ties between equally good splits are broken the same way on every run.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_dv, y_train)

# tree_.feature[0] is the column index used at the root node; map it back to
# the DictVectorizer feature name.
feature = dv.feature_names_[dt.tree_.feature[0]]
# The original message contained a mis-decoded (mojibake) tree emoji.
print("🌳 Feature used for first split:", feature)


In [None]:
# Baseline random forest: 10 trees, fixed seed, all CPU cores.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_dv, y_train)

y_pred = rf.predict(X_val_dv)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit square root works on both old and new versions.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
# The original message contained a mis-decoded (mojibake) target emoji.
print(f"🎯 RMSE on validation: {rmse:.3f}")


In [None]:
# Sweep the number of trees from 10 to 200 (step 10) and record the
# validation RMSE for each forest size.
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train_dv, y_train)
    y_pred = rf.predict(X_val_dv)
    # sqrt(MSE) instead of `squared=False`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

# One row per forest size.
scores_df = pd.DataFrame(scores, columns=['n_estimators', 'RMSE'])
display(scores_df)


In [None]:
# For each candidate max_depth, train forests of 10..200 trees (step 10) and
# average the validation RMSE over all forest sizes.
depth_values = [10, 15, 20, 25]
depth_scores = []

for d in depth_values:
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(max_depth=d, n_estimators=n, random_state=1, n_jobs=-1)
        rf.fit(X_train_dv, y_train)
        y_pred = rf.predict(X_val_dv)
        # sqrt(MSE) instead of `squared=False`, which was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    # Mean RMSE across every n_estimators value tried for this depth.
    mean_rmse = np.mean(rmse_list)
    depth_scores.append((d, mean_rmse))

# One row per depth.
depth_df = pd.DataFrame(depth_scores, columns=['max_depth', 'mean_RMSE'])
display(depth_df)


In [None]:
# Fit a small forest (10 trees, depth capped at 20) and rank the encoded
# features by the importance scores the fitted model exposes.
rf = RandomForestRegressor(
    max_depth=20,
    n_estimators=10,
    n_jobs=-1,
    random_state=1,
).fit(X_train_dv, y_train)

# Label each importance score with its DictVectorizer feature name and order
# the series from most to least important.
importance = (
    pd.Series(rf.feature_importances_, index=dv.feature_names_)
    .sort_values(ascending=False)
)

# Show the ten highest-ranked features.
display(importance.head(10))


In [None]:
# Raw (pre-encoding) column names of the training frame.
features = X_train.columns.tolist()

# Build the DMatrix objects from the DictVectorizer-encoded matrices, the same
# inputs every other model in this notebook is trained on. The original code
# passed the raw DataFrames; XGBoost rejects object-dtype columns unless
# categorical support is explicitly enabled.
# NOTE(review): XGBoost forbids '[', ']' and '<' in feature names; this assumes
# no encoded feature name contains them. Confirm against the dataset.
encoded_feature_names = list(dv.feature_names_)
dtrain = xgb.DMatrix(X_train_dv, label=y_train, feature_names=encoded_feature_names)
dval = xgb.DMatrix(X_val_dv, label=y_val, feature_names=encoded_feature_names)
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Shared hyper-parameters; only the learning rate (eta) differs between runs.
xgb_params_03 = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Copy so that changing eta does not mutate the eta=0.3 configuration.
xgb_params_01 = xgb_params_03.copy()
xgb_params_01['eta'] = 0.1

print("Training with eta=0.3")
model_03 = xgb.train(xgb_params_03, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

print("Training with eta=0.1")
model_01 = xgb.train(xgb_params_01, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)

y_pred_03 = model_03.predict(dval)
y_pred_01 = model_01.predict(dval)

# sqrt(MSE) instead of `squared=False`, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))

print(f"RMSE eta=0.3: {rmse_03:.3f}")
print(f"RMSE eta=0.1: {rmse_01:.3f}")
