In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
df = pd.read_csv(url)

# Keep only the records where ocean_proximity is either '<1H OCEAN' or 'INLAND'.
# An explicit copy makes the filtered frame independent of the raw one, so the
# column assignment below never writes through to (or warns about) a slice.
df = df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])].copy()

# Fill missing values with zeros (non-mutating form, so re-running the cell is safe)
df = df.fillna(0)

# Apply the log transform to median_house_value.
# Zeros must stay zero: swapping them for 1 before the log gives log(1) == 0,
# which matches the per-row rule "0 if x == 0 else log(x)" but is vectorised.
target = df['median_house_value']
df['median_house_value'] = np.log(target.where(target != 0, 1.0))

# Train/validation/test split with 60%/20%/20% distribution:
# first hold out 40%, then cut that hold-out part in half.
train, holdout = train_test_split(df, test_size=0.4, random_state=1)
val, test = train_test_split(holdout, test_size=0.5, random_state=1)

# Separate features and target variable
X_train, y_train = train.drop('median_house_value', axis=1), train['median_house_value']
X_val, y_val = val.drop('median_house_value', axis=1), val['median_house_value']
X_test, y_test = test.drop('median_house_value', axis=1), test['median_house_value']

# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
# The vectorizer is fitted on the training records only; validation and test
# records are transformed with the already-fitted mapping.
dv = DictVectorizer(sparse=True)
X_train_encoded = dv.fit_transform(X_train.to_dict(orient='records'))
X_val_encoded = dv.transform(X_val.to_dict(orient='records'))
X_test_encoded = dv.transform(X_test.to_dict(orient='records'))


<h1>Question 1</h1>

In [3]:
# Fit a depth-1 regression tree (a single split) on the encoded training data
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train_encoded, y_train)

# Node 0 is the root of the fitted tree; look up the name of the column it splits on
feature_names = dv.get_feature_names_out()
root_feature_index = dt.tree_.feature[0]
print(feature_names[root_feature_index])


ocean_proximity=INLAND


<h1>Question 2</h1>

In [4]:
# Train a random forest model
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train_encoded, y_train)

# Predict on validation set
y_val_pred = rf.predict(X_val_encoded)

# Calculate RMSE as the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated and later
# removed from scikit-learn, so taking the root explicitly keeps this cell
# runnable on both old and new releases.
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE on validation set:", rmse_val)


RMSE on validation set: 0.23485107722274248


<h1>Question 3</h1>

In [5]:
# Experiment with n_estimators parameter: grow the forest in steps of 10 and
# stop at the first step whose validation RMSE is not lower than the previous one.
rmse_prev = None  # no previous score yet, so the first iteration can never stop the loop
for n_estimators in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    rf.fit(X_train_encoded, y_train)
    y_val_pred = rf.predict(X_val_encoded)
    # RMSE = sqrt(MSE); avoids the `squared=False` keyword that newer
    # scikit-learn releases no longer accept.
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

    print(f"n_estimators: {n_estimators}, RMSE on validation set: {rmse_val}")
    if rmse_prev is not None and rmse_val >= rmse_prev:
        break  # Stop when RMSE stops improving
    rmse_prev = rmse_val


n_estimators: 10, RMSE on validation set: 0.23485107722274248
n_estimators: 20, RMSE on validation set: 0.22615301966217238
n_estimators: 30, RMSE on validation set: 0.22362689281185794
n_estimators: 40, RMSE on validation set: 0.22311566278809997
n_estimators: 50, RMSE on validation set: 0.2211582673155134
n_estimators: 60, RMSE on validation set: 0.22108969380899765
n_estimators: 70, RMSE on validation set: 0.22082077989874652
n_estimators: 80, RMSE on validation set: 0.2205334639808659
n_estimators: 90, RMSE on validation set: 0.22097831679278654


<h1>Question 4</h1>

In [6]:
# Grid search over max_depth and n_estimators, remembering the single
# (max_depth, n_estimators) pair with the lowest validation RMSE.
best_rmse = float('inf')
best_max_depth = None
best_n_estimators = None

for max_depth in [10, 15, 20, 25]:
    for n_estimators in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        rf.fit(X_train_encoded, y_train)
        y_val_pred = rf.predict(X_val_encoded)
        # RMSE = sqrt(MSE); avoids the `squared=False` keyword that newer
        # scikit-learn releases no longer accept.
        rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

        print(f"max_depth: {max_depth}, n_estimators: {n_estimators}, RMSE on validation set: {rmse_val}")

        # Strict '<' keeps the earliest combination when two scores tie
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_max_depth = max_depth
            best_n_estimators = n_estimators

print(f"Best max_depth: {best_max_depth}, Best n_estimators: {best_n_estimators}, Best RMSE: {best_rmse}")


max_depth: 10, n_estimators: 10, RMSE on validation set: 0.24133680565525462
max_depth: 10, n_estimators: 20, RMSE on validation set: 0.2368719550416235
max_depth: 10, n_estimators: 30, RMSE on validation set: 0.23434639388512057
max_depth: 10, n_estimators: 40, RMSE on validation set: 0.23394673037431934
max_depth: 10, n_estimators: 50, RMSE on validation set: 0.23283427818772456
max_depth: 10, n_estimators: 60, RMSE on validation set: 0.23317542397678728
max_depth: 10, n_estimators: 70, RMSE on validation set: 0.23287287981908927
max_depth: 10, n_estimators: 80, RMSE on validation set: 0.23239224280008214
max_depth: 10, n_estimators: 90, RMSE on validation set: 0.23239765537966434
max_depth: 10, n_estimators: 100, RMSE on validation set: 0.23221206605089234
max_depth: 10, n_estimators: 110, RMSE on validation set: 0.23174186245592757
max_depth: 10, n_estimators: 120, RMSE on validation set: 0.23177853273281301
max_depth: 10, n_estimators: 130, RMSE on validation set: 0.23164232126369

<h1>Question 5</h1>

In [7]:
# Fit a 10-tree forest capped at depth 20 on the encoded training data
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
).fit(X_train_encoded, y_train)

# Label each importance score with the feature name produced by the vectorizer
feature_importance = pd.Series(
    data=rf.feature_importances_,
    index=dv.get_feature_names_out(),
)

# Report the feature whose importance score is the largest
top_feature = feature_importance.idxmax()
print("Most Important Feature:", top_feature)


Most Important Feature: median_income


<h1>Question 6</h1>

In [8]:
# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train_encoded, label=y_train)
dval = xgb.DMatrix(X_val_encoded, label=y_val)

# Create a watchlist: both sets are evaluated after every boosting round
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Train a model with different values of eta
etas = [0.3, 0.1]

for eta in etas:
    xgb_params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
    }

    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)
    # NOTE(review): with early stopping the returned booster may include the
    # rounds trained after the best one; confirm whether predict should be
    # limited to model.best_iteration for the installed xgboost version.
    y_val_pred = model.predict(dval)
    # RMSE = sqrt(MSE); avoids the `squared=False` keyword that newer
    # scikit-learn releases no longer accept.
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

    print(f"eta: {eta}, RMSE on validation set: {rmse_val}")

# The best eta is the one that leads to the lowest RMSE on the validation set


eta: 0.3, RMSE on validation set: 0.2167682140766952
eta: 0.1, RMSE on validation set: 0.21898674893657916
