In [38]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor,export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2023-10-22 21:43:03--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1,4M) [text/plain]
Saving to: ‘housing.csv’


2023-10-22 21:43:05 (834 KB/s) - ‘housing.csv’ saved [1423529/1423529]



In [3]:
df = pd.read_csv("housing.csv")
df = df[df["ocean_proximity"].isin(['<1H OCEAN','INLAND'])]

In [28]:
df = df.fillna(0)
df["median_house_value"] = np.log1p(df["median_house_value"])

In [29]:
def train_test_split(df, seed=42):
    train_size = int(len(df)*0.6)
    val_size = int(len(df)*0.2)
    test_size = len(df) - train_size - val_size

    idx = np.arange(len(df))
    rng = np.random.default_rng(seed=seed)
    rng.shuffle(idx)

    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:train_size].copy()
    df_val = df_shuffled.iloc[train_size:train_size+val_size].copy()
    df_test = df_shuffled.iloc[train_size+val_size:].copy()

    return df_train, df_val, df_test

In [30]:
df_train, df_val, df_test = train_test_split(df, seed=1)

In [31]:
y_train = df_train["median_house_value"]
y_val = df_val["median_house_value"]
y_test = df_test["median_house_value"]

df_train = df_train.drop(columns=["median_house_value"])
df_val = df_val.drop(columns=["median_house_value"])
df_test = df_test.drop(columns=["median_house_value"])

In [32]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.fit_transform(df_val.to_dict(orient='records'))
X_test = dv.fit_transform(df_test.to_dict(orient='records'))

### Q1

In [33]:
model = DecisionTreeRegressor(max_depth=1)
model.fit(X_train, y_train)

In [34]:
export_text(model, feature_names=list(dv.get_feature_names_out()))

'|--- ocean_proximity=<1H OCEAN <= 0.50\n|   |--- value: [11.61]\n|--- ocean_proximity=<1H OCEAN >  0.50\n|   |--- value: [12.30]\n'

### Q2

In [37]:
model = RandomForestRegressor(n_estimators=10, n_jobs=-1,random_state=1)
model.fit(X_train, y_train)

In [44]:
mean_squared_error(y_val,model.predict(X_val), squared=False)

0.22976858158035496

### Q3

In [45]:
for n_est in range(10,201,10):
    model = RandomForestRegressor(n_estimators=n_est, n_jobs=-1,random_state=1)
    model.fit(X_train, y_train)
    print(n_est, mean_squared_error(y_val,model.predict(X_val), squared=False))

10 0.24261611305318878
20 0.23524056973299418
30 0.23326912290580457
40 0.2319530802786454
50 0.23073652240936687
60 0.23065529850640856
70 0.2305765917144621
80 0.23069788342344838
90 0.2302329482329426
100 0.23025904655921983
110 0.22995497870594855
120 0.2296259031345593
130 0.2294438044140307
140 0.22961378279133113
150 0.22953427483028388
160 0.22954843975404315
170 0.22955278631951934
180 0.22954760033749538
190 0.22969661022690582
200 0.22976858158035493


### Q4

In [46]:
for m_depth in [10, 15, 20, 25]:
    rmse = []
    for n_est in range(10,201,10):
        model = RandomForestRegressor(n_estimators=n_est,max_depth=m_depth, n_jobs=-1,random_state=1)
        model.fit(X_train, y_train)
        rmse.append(mean_squared_error(y_val,model.predict(X_val), squared=False))
    print(m_depth, np.mean(rmse))

10 0.24287294768715112
15 0.2327865918954113
20 0.23125852291051413
25 0.231249984251464


### Q5

In [54]:
model.feature_importances_

array([0.01742022, 0.03114647, 0.09310208, 0.08711062, 0.34415052,
       0.21465042, 0.14442931, 0.02992868, 0.01790362, 0.02015805])

In [53]:
model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
model.fit(X_train, y_train)

importances = {}
for feature, importance in zip(df_train.columns, model.feature_importances_):
    importances[feature] = importance
importances

{'longitude': 0.0174202219966759,
 'latitude': 0.03114647418587977,
 'housing_median_age': 0.0931020832212001,
 'total_rooms': 0.08711062440261289,
 'total_bedrooms': 0.3441505165270948,
 'population': 0.21465041655134534,
 'households': 0.14442930581347832,
 'median_income': 0.0299286802260822,
 'ocean_proximity': 0.017903624325647315}

### Q6

In [55]:
import xgboost as xgb

In [56]:
xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
for eta in [0.1, 0.3]:
    xgb_params["eta"] = eta
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_train, y_train)
    print(eta, mean_squared_error(y_val,model.predict(X_val), squared=False))


0.1 0.22712340881493367
0.3 0.22825158478359628
