In [3]:
import numpy as np
import pandas as pd

### Dataset

In this homework, we will use the California Housing Prices from [Kaggle](https://www.kaggle.com/datasets/camnugent/california-housing-prices).

Here's a wget-able [link](https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv):

```bash
wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
```

The goal of this homework is to create a regression model for predicting housing prices (column `'median_house_value'`).


### Preparing the dataset 

For this homework, we only want to use a subset of data. This is the same subset we used in homework #2.

First, keep only the records where `ocean_proximity` is either `'<1H OCEAN'` or `'INLAND'`

Preparation:

* Fill missing values with zeros.
* Apply the log transform to `median_house_value`.
* Do train/validation/test split with 60%/20%/20% distribution. 
* Use the `train_test_split` function and set the `random_state` parameter to 1.
* Use `DictVectorizer(sparse=True)` to turn the dataframe into matrices.

In [4]:
# Load the raw housing CSV (path is relative to the notebook's working directory).
csv_path = "data/housing.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# How many rows fall into each ocean_proximity category in the raw data?
data['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [16]:
# Restrict the data to the two ocean_proximity categories used in this homework.
keep_mask = data['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
df = data.loc[keep_mask]

In [17]:
# Sanity check: only the two kept categories should remain.
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN    9136
INLAND       6551
Name: count, dtype: int64

In [19]:
# Count missing values per column before imputation.
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [20]:
# Replace every missing value with 0, as the assignment asks.
df = df.fillna(value=0)

In [21]:
# Confirm that no missing values remain after the fill.
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [22]:
# Target values before the log transform, for comparison with the cell after it.
df['median_house_value'].head()

701    431000.0
830    217000.0
859    247600.0
860    283500.0
861    216900.0
Name: median_house_value, dtype: float64

In [23]:
# Apply the log transform to median_house_value.
# The values are taken from the raw `data` frame (aligned on df's index) instead of
# from df itself, so re-running this cell cannot apply log1p twice.
# `assign` returns a new frame rather than writing into df in place.
df = df.assign(
    median_house_value=np.log1p(data.loc[df.index, 'median_house_value'])
)

In [24]:
# Target values after the log transform.
df['median_house_value'].head()

701    12.973866
830    12.287657
859    12.419574
860    12.554971
861    12.287196
Name: median_house_value, dtype: float64

In [25]:
# Do train/validation/test split with 60%/20%/20% distribution.
from sklearn.model_selection import train_test_split

In [28]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) for validation.
df_train_full, df_test = train_test_split(df, random_state=1, test_size=0.2)
df_train, df_val = train_test_split(df_train_full, random_state=1, test_size=0.25)


In [29]:
# Row counts of each split: (train_full, train, val, test).
tuple(len(part) for part in (df_train_full, df_train, df_val, df_test))

(12549, 9411, 3138, 3138)

In [30]:
# Use DictVectorizer(sparse=True) to turn the dataframe into matrices.
from sklearn.feature_extraction import DictVectorizer

In [32]:
def prepare_dfs(df, target='median_house_value', random_state=1):
    """Split ``df`` into train/validation/test parts and separate the target.

    The split is 60%/20%/20%: 20% of the rows are held out for test, then 25%
    of the remaining 80% is held out for validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the feature columns and the ``target`` column.
        It is copied first, so the caller's frame is left untouched.
    target : str, default 'median_house_value'
        Name of the column to predict. It is removed from every returned frame.
    random_state : int, default 1
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(df_train_full, df_train, df_val, df_test,
        y_train_full, y_train, y_val, y_test)`` where the ``df_*`` items are
        feature frames without ``target`` and the ``y_*`` items are arrays of
        the corresponding target values.
    """
    df = df.copy()

    df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=random_state)
    df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=random_state)

    frames = (df_train_full, df_train, df_val, df_test)
    # pop() hands back the target column and drops it from the frame in one step,
    # replacing the separate "read .values, then del" pair for each split.
    targets = tuple(frame.pop(target).values for frame in frames)

    return (*frames, *targets)

    



## Question 1

Let's train a decision tree regressor to predict the `median_house_value` variable. 

* Train a model with `max_depth=1`.


Which feature is used for splitting the data?

* `ocean_proximity`
* `total_rooms`
* `latitude`
* `population`


In [33]:
# Rebuild the splits, this time with the target separated from the features.
(
    df_train_full, df_train, df_val, df_test,
    y_train_full, y_train, y_val, y_test,
) = prepare_dfs(df)

In [38]:
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_text

In [44]:
# Turn the feature frames into matrices; the vectorizer is fitted on train only
# and then reused to transform validation.
# NOTE(review): the assignment text asks for DictVectorizer(sparse=True); the dense
# form is kept here unchanged.
dv = DictVectorizer(sparse=False)

dicts_train = df_train.to_dict(orient='records')
dicts_val = df_val.to_dict(orient='records')

X_train = dv.fit_transform(dicts_train)
X_val = dv.transform(dicts_val)

# Depth-1 tree: a single split, so the printed tree shows the chosen feature.
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_val)
# RMSE is computed as sqrt(MSE). The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.45168599736547216

In [45]:
# Render the fitted tree as text, labelled with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.feature_names_)
print(tree_rules)

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



answer: `ocean_proximity` (the tree splits on the one-hot column `ocean_proximity=<1H OCEAN`)

## Question 2

Train a random forest model with these parameters:

* `n_estimators=10`
* `random_state=1`
* `n_jobs=-1` (optional - to make training faster)


What's the RMSE of this model on validation?

* 0.045
* 0.245
* 0.545
* 0.845

In [46]:
from sklearn.ensemble import RandomForestRegressor

In [48]:
# Reuses X_train / X_val from the Question 1 cell, so the vectorizer is not refitted.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
# RMSE is computed as sqrt(MSE). The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

0.24472888684076877

answer: 0.245

## Question 3

Now let's experiment with the `n_estimators` parameter

* Try different values of this parameter from 10 to 200 with step 10.
* Set `random_state` to `1`.
* Evaluate the model on the validation dataset.


After which value of `n_estimators` does RMSE stop improving?

- 10
- 25
- 50
- 160


In [50]:
# Sweep the forest size from 10 to 200 trees (step 10) and report validation RMSE.
# X_train / X_val are reused from the Question 1 cell.
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    # sqrt(MSE) replaces `squared=False`, which was removed in scikit-learn 1.6.
    rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 5)

    print(f"n_estimators: {n} ---->  {rmse}")
    print()

n_estimators: 10 ---->  0.24473

n_estimators: 20 ---->  0.23825

n_estimators: 30 ---->  0.23623

n_estimators: 40 ---->  0.23458

n_estimators: 50 ---->  0.23449

n_estimators: 60 ---->  0.23415

n_estimators: 70 ---->  0.23415

n_estimators: 80 ---->  0.23431

n_estimators: 90 ---->  0.2343

n_estimators: 100 ---->  0.23416

n_estimators: 110 ---->  0.23413

n_estimators: 120 ---->  0.23386

n_estimators: 130 ---->  0.23377

n_estimators: 140 ---->  0.23362

n_estimators: 150 ---->  0.23351

n_estimators: 160 ---->  0.23332

n_estimators: 170 ---->  0.23332

n_estimators: 180 ---->  0.23352

n_estimators: 190 ---->  0.2338

n_estimators: 200 ---->  0.23374



answer: I am going to say 160, because the question didn't say anything about rounding the RMSE to 3 decimal places; with that rounding I would say 50.

## Question 4

Let's select the best `max_depth`:

* Try different values of `max_depth`: `[10, 15, 20, 25]`
* For each of these values, try different values of `n_estimators` from 10 to 200 (with step 10)
* Fix the random seed: `random_state=1`


What's the best `max_depth`:

* 10
* 15
* 20
* 25


In [54]:
# Grid over max_depth and n_estimators, scored by validation RMSE.
# For each depth, every RMSE and the depth's average are printed; the single best
# (depth, n_estimators, rmse) combination is tracked in `res`.
min_rmse = float('inf')

for d in [10, 15, 20, 25]:
    print(f"max_depth: {d}")
    print()
    rmses = []

    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        # sqrt(MSE) replaces `squared=False`, which was removed in scikit-learn 1.6.
        rmse = round(np.sqrt(mean_squared_error(y_val, y_pred)), 5)
        if rmse < min_rmse:
            min_rmse = rmse
            res = (d, n, rmse)
        rmses.append(rmse)

        print(f"    n_estimators: {n:<10} ---->  {rmse:.5f}")
    avg_rmse = np.mean(rmses)
    print(f"avg rmse {avg_rmse}")

best_md, best_n_esitmators, best_rmse = res
print()
print(f"The lowest rmse was {best_rmse} at a max_depth of {best_md}, and n_esitmators of {best_n_esitmators}")
    

max_depth: 10

    n_estimators: 10         ---->  0.25051
    n_estimators: 20         ---->  0.24726
    n_estimators: 30         ---->  0.24626
    n_estimators: 40         ---->  0.24509
    n_estimators: 50         ---->  0.24562
    n_estimators: 60         ---->  0.24547
    n_estimators: 70         ---->  0.24544
    n_estimators: 80         ---->  0.24561
    n_estimators: 90         ---->  0.24550
    n_estimators: 100        ---->  0.24538
    n_estimators: 110        ---->  0.24527
    n_estimators: 120        ---->  0.24500
    n_estimators: 130        ---->  0.24476
    n_estimators: 140        ---->  0.24459
    n_estimators: 150        ---->  0.24455
    n_estimators: 160        ---->  0.24445
    n_estimators: 170        ---->  0.24439
    n_estimators: 180        ---->  0.24449
    n_estimators: 190        ---->  0.24469
    n_estimators: 200        ---->  0.24473
avg rmse 0.245453
max_depth: 15

    n_estimators: 10         ---->  0.24553
    n_estimators: 20        