In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

###Question 1

In [2]:
source_data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'

df = pd.read_csv(source_data)
df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
7,-122.25,37.84,52.0,3104.0,687.0,1157.0,647.0,3.12,241400.0,NEAR BAY
8,-122.26,37.84,42.0,2555.0,665.0,1206.0,595.0,2.0804,226700.0,NEAR BAY
9,-122.25,37.84,52.0,3549.0,707.0,1551.0,714.0,3.6912,261100.0,NEAR BAY


In [3]:
def data_preparation(df):
    df_housing = df[(df["ocean_proximity"] == "<1H OCEAN") | (df["ocean_proximity"] == "<1H OCEAN")].copy()

    df_housing["median_house_value"] = np.log1p(df_housing["median_house_value"])

    df_housing.fillna(0, inplace=True)

    X = df_housing.drop(columns=['median_house_value'])
    y = df_housing['median_house_value']

    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=1)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = data_preparation(df)

In [4]:
dv = DictVectorizer(sparse=True)

X_train_dv = dv.fit_transform(X_train.to_dict(orient='records'))
X_test_dv = dv.fit_transform(X_test.to_dict(orient='records'))
X_val_dv = dv.fit_transform(X_val.to_dict(orient='records'))


In [5]:
feature_names = [i for i in dv.get_feature_names_out()]

regressor = DecisionTreeRegressor(max_depth=1)
regressor.fit(X_train_dv, y_train)

feature_index = regressor.tree_.feature[0]

feature_name = feature_names[feature_index]

print("Feature used for splitting the data:", feature_name)

Feature used for splitting the data: median_income


In [10]:
export_text(regressor, feature_names=[i for i in dv.get_feature_names_out()])

'|--- median_income <= 5.04\n|   |--- value: [12.15]\n|--- median_income >  5.04\n|   |--- value: [12.68]\n'

### Question 2

In [13]:
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train_dv, y_train)

y_val_pred = rf_model.predict(X_val_dv)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("RMSE on validation data:", rmse)

RMSE on validation data: 0.21282105475177593


### Question 3

In [17]:
n_estimators_range = range(10, 201, 10)

best_rmse = float('inf')
best_n_estimators = 0

for n_estimators in n_estimators_range:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=1)
    rf_model.fit(X_train_dv, y_train)

    y_val_pred = rf_model.predict(X_val_dv)

    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    print(f"n_estimators={n_estimators}: RMSE on validation data = {rmse:.5f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_n_estimators = n_estimators
    else:
        break

print(f"The best n_estimators is {best_n_estimators} with RMSE = {best_rmse:.5f}")

n_estimators=10: RMSE on validation data = 0.21282
n_estimators=20: RMSE on validation data = 0.20844
n_estimators=30: RMSE on validation data = 0.20766
n_estimators=40: RMSE on validation data = 0.20668
n_estimators=50: RMSE on validation data = 0.20674
The best n_estimators is 40 with RMSE = 0.20668


### Question 4

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

max_depth_range = [10, 15, 20, 25]
n_estimators_range = range(10, 201, 10)

best_max_depth = None
best_n_estimators = None
best_rmse = float('inf')

for max_depth in max_depth_range:
    for n_estimators in n_estimators_range:
        rf_model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, random_state=1)
        rf_model.fit(X_train_dv, y_train)

        y_val_pred = rf_model.predict(X_val_dv)

        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        print(f"max_depth={max_depth}, n_estimators={n_estimators}: RMSE on validation data = {rmse:.5f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_max_depth = max_depth
            best_n_estimators = n_estimators

print(f"The best max_depth is {best_max_depth} with n_estimators={best_n_estimators} and RMSE = {best_rmse:.5f}")


max_depth=10, n_estimators=10: RMSE on validation data = 0.21633
max_depth=10, n_estimators=20: RMSE on validation data = 0.21487
max_depth=10, n_estimators=30: RMSE on validation data = 0.21385
max_depth=10, n_estimators=40: RMSE on validation data = 0.21180
max_depth=10, n_estimators=50: RMSE on validation data = 0.21215
max_depth=10, n_estimators=60: RMSE on validation data = 0.21229
max_depth=10, n_estimators=70: RMSE on validation data = 0.21245
max_depth=10, n_estimators=80: RMSE on validation data = 0.21178
max_depth=10, n_estimators=90: RMSE on validation data = 0.21183
max_depth=10, n_estimators=100: RMSE on validation data = 0.21186
max_depth=10, n_estimators=110: RMSE on validation data = 0.21157
max_depth=10, n_estimators=120: RMSE on validation data = 0.21155
max_depth=10, n_estimators=130: RMSE on validation data = 0.21162
max_depth=10, n_estimators=140: RMSE on validation data = 0.21151
max_depth=10, n_estimators=150: RMSE on validation data = 0.21148
max_depth=10, n_est

### Question 5

In [20]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train_dv, y_train)

feature_importances = rf_model.feature_importances_

feature_importance_dict = {feature_name: importance for feature_name, importance in zip(feature_names, feature_importances)}

most_important_feature = max(feature_importance_dict, key=feature_importance_dict.get)

print("Most important feature:", most_important_feature)


Most important feature: median_income


### Question 6

In [24]:
dtrain = xgb.DMatrix(X_train_dv, label=y_train)
dval = xgb.DMatrix(X_val_dv, label=y_val)

watchlist = [(dval, 'validation')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

num_round = 100
model = xgb.train(xgb_params, dtrain, num_round, watchlist)

y_val_pred = model.predict(dval)

initial_rmse = np.sqrt(np.mean((y_val - y_val_pred) ** 2))

xgb_params['eta'] = 0.1

model = xgb.train(xgb_params, dtrain, num_round, watchlist)

y_val_pred = model.predict(dval)

updated_rmse = np.sqrt(np.mean((y_val - y_val_pred) ** 2))

print(f"Initial eta (0.3) RMSE: {initial_rmse:.3f}")
print(f"Updated eta (0.1) RMSE: {updated_rmse:.3f}")

[0]	validation-rmse:0.34901
[1]	validation-rmse:0.30254
[2]	validation-rmse:0.27216
[3]	validation-rmse:0.25683




[4]	validation-rmse:0.24315
[5]	validation-rmse:0.23635
[6]	validation-rmse:0.23120
[7]	validation-rmse:0.22864
[8]	validation-rmse:0.22659
[9]	validation-rmse:0.22352
[10]	validation-rmse:0.22178
[11]	validation-rmse:0.22098
[12]	validation-rmse:0.21912
[13]	validation-rmse:0.21802
[14]	validation-rmse:0.21714
[15]	validation-rmse:0.21699
[16]	validation-rmse:0.21457
[17]	validation-rmse:0.21296
[18]	validation-rmse:0.21177
[19]	validation-rmse:0.21158
[20]	validation-rmse:0.21106
[21]	validation-rmse:0.21129
[22]	validation-rmse:0.21094
[23]	validation-rmse:0.21056
[24]	validation-rmse:0.21063
[25]	validation-rmse:0.21052
[26]	validation-rmse:0.21042
[27]	validation-rmse:0.20929
[28]	validation-rmse:0.20920
[29]	validation-rmse:0.20918
[30]	validation-rmse:0.20928
[31]	validation-rmse:0.20708
[32]	validation-rmse:0.20702
[33]	validation-rmse:0.20713
[34]	validation-rmse:0.20669
[35]	validation-rmse:0.20619
[36]	validation-rmse:0.20623
[37]	validation-rmse:0.20620
[38]	validation-rmse