# Week 6 Homework #
## Prep Data ##

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_text
from matplotlib import pyplot as plt

In [3]:
# read data into dataframe
data = "./data/housing.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
# remove all rows except where ocean_proximity = <1H OCEAN or INLAND
df.ocean_proximity.value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [6]:
df = df[(df['ocean_proximity'] == "<1H OCEAN") | (df['ocean_proximity'] == "INLAND")]

In [7]:
df.shape

(15687, 10)

In [8]:
# fill mizzing values with 0
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [9]:
df.total_bedrooms = df.total_bedrooms.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.total_bedrooms = df.total_bedrooms.fillna(0)


In [10]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [11]:
# apply log transfrom to median_house_value
df['median_house_value'] = np.log(df['median_house_value'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['median_house_value'] = np.log(df['median_house_value'])


In [12]:
df['median_house_value'].head()

701    12.973863
830    12.287653
859    12.419570
860    12.554967
861    12.287192
Name: median_house_value, dtype: float64

In [13]:
df['median_house_value'].head()

701    12.973863
830    12.287653
859    12.419570
860    12.554967
861    12.287192
Name: median_house_value, dtype: float64

In [14]:
# split data into train/validation/test split with 60%/20%/20% distribution
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [16]:
y_train = df_train.median_house_value
y_val = df_val.median_house_value
y_test = df_test.median_house_value

In [17]:
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

### 1. Most important split feature ##

In [None]:
# train model
train_dicts = df_train.to_dict(orient='records')

In [None]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [None]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [None]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [None]:
y_pred = dt.predict(X_val)
y_pred

In [None]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

## 2. RMSE for Random Forest ##

In [None]:
# train random forest model
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
y_pred

In [None]:
mse = mean_squared_error(y_val, y_pred)

rmse = np.sqrt(mse).round(3)

print("Root Mean Squared Error (RMSE):", rmse)

## 3. n_estimators ##

In [None]:
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred)).round(3)
    
    scores.append((n, rmse))

In [None]:
df_scores = pd.DataFrame(scores, columns=['n_estimators', 'rmse'])

In [None]:
plt.plot(df_scores.n_estimators, df_scores.rmse)

In [None]:
n_estimators = 160

## 4: max_depth ##

In [None]:
scores = []

for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n,
                                    max_depth=d,
                                    random_state=1,
                                    n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred)).round(3)

        scores.append((d, n, rmse))

In [None]:
columns = ['max_depth', 'n_estimators', 'rmse']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
for d in [10, 15, 20, 25]:
    df_subset = df_scores[df_scores.max_depth == d]
    
    plt.plot(df_subset.n_estimators, df_subset.rmse,
             label='max_depth=%d' % d)

plt.legend()

In [None]:
max_depth = 20

## 5. feature importance ##

In [None]:
# train random forest model
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
y_pred

In [None]:
mse = mean_squared_error(y_val, y_pred)

rmse = np.sqrt(mse).round(3)

print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
df_train.columns

In [None]:
# Get feature importances
feature_importances = rf.feature_importances_

In [None]:
rf.feature_importances_

In [None]:
# Match feature importances to the column names in your dataset (assuming you have a DataFrame)
feature_names = df_train.columns  # Replace X_train with your feature data if it's a NumPy array

# Create a DataFrame to display the feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort the DataFrame by importance in descending order
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the feature importances
print(importance_df)

## 6: eta in XGBoost ##

In [19]:
# train XGBoost model
train_dicts = df_train.to_dict(orient='records')

In [23]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [24]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [32]:
dv.get_feature_names_out()

array(['households', 'housing_median_age', 'latitude', 'longitude',
       'median_income', 'ocean_proximity=<1H OCEAN',
       'ocean_proximity=INLAND', 'population', 'total_bedrooms',
       'total_rooms'], dtype=object)

In [39]:
# find features and reaplce and invalid characters
features = list(dv.get_feature_names_out())
features = np.char.replace(features, '<', 'lt')

['households' 'housing_median_age' 'latitude' 'longitude' 'median_income'
 'ocean_proximity=lt1H OCEAN' 'ocean_proximity=INLAND' 'population'
 'total_bedrooms' 'total_rooms']


In [40]:
#features = dv.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [41]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [44]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100,
                  verbose_eval=5,
                  evals=watchlist)

[0]	train-rmse:10.37455	val-rmse:10.37544
[5]	train-rmse:6.13433	val-rmse:6.13232
[10]	train-rmse:3.63299	val-rmse:3.62935
[15]	train-rmse:2.15901	val-rmse:2.15610
[20]	train-rmse:1.29412	val-rmse:1.29323
[25]	train-rmse:0.79037	val-rmse:0.79378
[30]	train-rmse:0.50293	val-rmse:0.51232
[35]	train-rmse:0.34562	val-rmse:0.36335
[40]	train-rmse:0.26667	val-rmse:0.29287
[45]	train-rmse:0.22839	val-rmse:0.26224
[50]	train-rmse:0.21073	val-rmse:0.24891
[55]	train-rmse:0.20171	val-rmse:0.24333
[60]	train-rmse:0.19658	val-rmse:0.24074
[65]	train-rmse:0.19216	val-rmse:0.23884
[70]	train-rmse:0.18898	val-rmse:0.23797
[75]	train-rmse:0.18514	val-rmse:0.23628
[80]	train-rmse:0.18135	val-rmse:0.23456
[85]	train-rmse:0.17814	val-rmse:0.23394
[90]	train-rmse:0.17476	val-rmse:0.23252
[95]	train-rmse:0.17213	val-rmse:0.23164
[99]	train-rmse:0.16985	val-rmse:0.23123
