<a href="https://colab.research.google.com/github/ErendiraCan/CSE450/blob/main/CaseStudyNotebooks/Module_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Module 03 — Housing Estimates, Project

In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
# Load the housing data from the course repository and preview the first rows
housing_csv_url = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv"
df = pd.read_csv(housing_csv_url)
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price
0,1565930130,20141104T000000,4,3.25,3760,4675,2.0,0,0,3,...,2740,1020,2007,0,98038,47.3862,-122.048,3280,4033,429900.0
1,3279000420,20150115T000000,3,1.75,1460,7800,1.0,0,0,2,...,1040,420,1979,0,98023,47.3035,-122.382,1310,7865,233000.0
2,194000575,20141014T000000,4,1.0,1340,5800,1.5,0,2,3,...,1340,0,1914,0,98116,47.5658,-122.389,1900,5800,455000.0
3,2115510160,20141208T000000,3,1.75,1440,8050,1.0,0,0,3,...,1440,0,1985,0,98023,47.3187,-122.39,1790,7488,258950.0
4,7522500005,20140815T000000,2,1.5,1780,4750,1.0,0,0,4,...,1080,700,1947,0,98117,47.6859,-122.395,1690,5962,555000.0


In [None]:
# Bin the sqft columns: every value is replaced by one fixed value that stands
# in for the whole range it falls into.

# Each entry below is (low value, high value, replacement value). The last entry
# of every table has no upper limit and uses its low value as the replacement.

# sqft_living bins (sqft_above and sqft_living15 use this same table)
living_ranges = [
    (1, 699, 500), (700, 1499, 1000), (1500, 2499, 2000), (2500, 3499, 3000),
    (3500, 4499, 4000), (4500, 5499, 5000), (5500, float('inf'), 5500),
]

# sqft_lot bins (sqft_lot15 uses this same table)
lot_ranges = [
    (1, 2499, 1250), (2500, 4499, 3500), (4500, 6499, 5500), (6500, 8499, 7500),
    (8500, 10499, 9500), (10500, 12499, 11500), (12500, float('inf'), 12500),
]

# sqft_basement bins
basement_ranges = [
    (1, 499, 250), (500, 999, 750), (1000, 1499, 1250), (1500, 1999, 1750),
    (2000, float('inf'), 2000),
]

# function finds the range a value falls in and returns that range's replacement value
def replace_vals(value, ranges):
  """Return the replacement value of the range that contains `value`.

  Args:
    value: the number to bin.
    ranges: iterable of (low, high, middle) tuples. `low` and `high` are
      inclusive bounds; `middle` is what gets returned for that range.

  Returns:
    0 when `value` is below 1; otherwise the `middle` of the first range
    whose bounds contain `value`. None when no range matches (for example
    NaN, or a fractional value that falls between two ranges).
  """
  # Strictly below 1 only: a value of exactly 1 must reach the loop so that a
  # range starting at 1 can claim it (the old `<= 1` test made that bound
  # unreachable).
  if value < 1:
    return 0
  for low, high, middle in ranges:
    if low <= value <= high:
      return middle
  # No range matched; made explicit instead of falling off the end.
  return None

# Bin every sqft column with the range table that belongs to it
sqft_bins = {
    'sqft_living': living_ranges,
    'sqft_lot': lot_ranges,
    'sqft_above': living_ranges,
    'sqft_basement': basement_ranges,
    'sqft_living15': living_ranges,
    'sqft_lot15': lot_ranges,
}
for column, bins in sqft_bins.items():
    # args forwards the table as the second argument of replace_vals
    df[column] = df[column].apply(replace_vals, args=(bins,))

In [None]:
# create below average, average, and above average columns out of the grade column

# function that changes grade values to below avg, avg, above avg
def grade_avg(value):
  """Collapse a numeric grade into one of three quality labels.

  Args:
    value: the numeric grade.

  Returns:
    "below_avg" for grades up to 3, "avg" for grades above 3 and up to 10,
    "above_avg" for every grade above 10. None when `value` cannot be
    compared (for example NaN).
  """
  if value <= 3:
    return "below_avg"
  elif value <= 10:
    return "avg"
  elif value > 10:
    # Open-ended top tier: the previous `value <= 11` test only matched 11,
    # so any higher grade got no label (and no one-hot column) at all.
    return "above_avg"
  # Only reached when every comparison is False, e.g. for NaN.
  return None

# Relabel the grade column with the three quality categories
grade_labels = df['grade'].apply(grade_avg)
df['grade'] = grade_labels

# One-hot encode the labels, which produces the columns
# 'grade_above_avg', 'grade_avg' and 'grade_below_avg'
df = pd.get_dummies(df, columns=['grade'])

In [None]:
# Fixing the date column: keep only the first 8 characters (the YYYYMMDD part)
# and store them as an integer.
# astype(str) comes first so this cell can be re-run: once `date` is already an
# integer column the .str accessor would otherwise raise.
df['date'] = df['date'].astype(str).str[:8].astype(int)

The model: Gradient Boosted Tree

In [None]:
# Import the libraries we need 
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Preview the frame after the binning, grade encoding and date conversion
df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,grade_above_avg,grade_avg,grade_below_avg
0,1565930130,20141104,4,3.25,4000,5500,2.0,0,0,3,...,0,98038,47.3862,-122.048,3000,3500,429900.0,0,1,0
1,3279000420,20150115,3,1.75,1000,7500,1.0,0,0,2,...,0,98023,47.3035,-122.382,1000,7500,233000.0,0,1,0
2,194000575,20141014,4,1.0,1000,5500,1.5,0,2,3,...,0,98116,47.5658,-122.389,2000,5500,455000.0,0,1,0
3,2115510160,20141208,3,1.75,1000,7500,1.0,0,0,3,...,0,98023,47.3187,-122.39,2000,7500,258950.0,0,1,0
4,7522500005,20140815,2,1.5,2000,5500,1.0,0,0,4,...,0,98117,47.6859,-122.395,2000,5500,555000.0,0,1,0


In [None]:
# Pick the model inputs (features) and the value to predict (price), then hold
# out 20% of the rows as a test set
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'sqft_living15',
    'grade_above_avg', 'grade_avg', 'grade_below_avg',
    'yr_renovated', 'date',
]
X = df[feature_columns]
y = df['price']

# Fixed random_state so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build and train the XGBoost regressor. The hyperparameters are written out
# explicitly (noted as the defaults for now) so they are easy to tune later.
xgb_params = {
    'objective': 'reg:squarederror',  # squared error loss for regression
    'n_estimators': 100,              # number of boosting rounds (trees)
    'learning_rate': 0.3,             # shrinkage applied to each tree's contribution
    'max_depth': 6,                   # maximum depth of each tree
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows; the bare name on the last line
# displays the resulting array as the cell output
y_pred = model.predict(X_test)
y_pred

array([644447.75, 519078.  , 597759.25, ..., 432974.28, 443619.53,
       435231.25], dtype=float32)

In [None]:
# Evaluate the model with the Root Mean Squared Error (RMSE) of its predictions.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so passing it fails on current versions.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
mse = rmse  # previous (misleading) name, kept in case another cell still reads it
print("Root Mean Squared Error: ", rmse)

Mean Squared Error:  249052.33106358684


In [None]:
# Pair every training column with the importance the fitted model gave it,
# most important first
importance_table = pd.DataFrame({
    'f_names': X_train.columns,
    'f_values': model.feature_importances_,
})
df_features = importance_table.sort_values(by='f_values', ascending=False)

# Bar chart of the features whose importance is above 0.011, with the bars
# ordered by descending importance
important_only = df_features.query('f_values > .011')
Metrics = (
    alt.Chart(important_only)
    .mark_bar()
    .encode(
        alt.X('f_values'),
        alt.Y('f_names', sort='-x'),
    )
)

Metrics