# Seattle Housing Project

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np

from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

## Overview and Preprocessing of Dataset

In [None]:
df = pd.read_csv("king_ country_ houses_aa.csv")

In [None]:
df.head()

In [None]:
# move target to the right
column_data = df.pop("price")
df["price"] = column_data
df.head()

In [None]:
df.shape

In [None]:
# there are no nans
#df.isna().sum()

In [None]:
# there are no duplicates
#df.duplicated().sum()

In [None]:
#df.eq(" ").sum()

In [None]:
df.dtypes

In [None]:
# convert date feature to datetime
df["date"] = pd.to_datetime(df["date"])

In [None]:
df.head()

In [None]:
df.describe().T

## Visualize the Data

In [None]:
color = '#B7410E'

# Grid layout: the column count is fixed and the row count is derived from the
# DataFrame, so every column gets an axis no matter how many columns exist.
ncols = 4
nrows = max(1, -(-len(df.columns) // ncols))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16), squeeze=False)

axes = axes.flatten()

# One histogram per DataFrame column (every column is plotted, in column order)
for i, ax in enumerate(axes):
    if i >= len(df.columns):
        ax.set_visible(False)  # hide unused axes
        continue
    ax.hist(df.iloc[:, i], bins=30, color=color, edgecolor='black')
    ax.set_title(df.columns[i], fontsize=20)

plt.tight_layout()
plt.show()

#### First impression from histograms:

Uniform distributions (drop):

- id
- date

No inferential power (drop):

- yr_renovated
- sqft_lot15

Geographical data (drop):

- zipcode
- lat
- long

Categoricals (drop?):

- bathrooms
- floors
- waterfront
- view
- condition
- grade


Variables to consider in the model (apply standardization or normalization):

- bedrooms
- sqft_living
- sqft_above
- yr_built
- sqft_living15

Drop first and add later?:

- sqft_lot -  check for outliers
- sqft_basement - try normalization

In [None]:
color = '#B7410E'

# The date column is excluded from the boxplots
df_boxplot = df.drop(columns=["date"])

# Grid layout: the column count is fixed and the row count is derived from the
# DataFrame, so every remaining column gets its own axis.
ncols = 4
nrows = max(1, -(-len(df_boxplot.columns) // ncols))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 16), squeeze=False)

axes = axes.flatten()

# One horizontal boxplot per column; missing values are dropped before plotting
for i, ax in enumerate(axes):
    if i >= len(df_boxplot.columns):
        ax.set_visible(False)  # hide unused axes
        continue
    ax.boxplot(df_boxplot.iloc[:, i].dropna(), vert=False, patch_artist=True,
               boxprops=dict(facecolor=color, color='black'),
               medianprops=dict(color='yellow'), whiskerprops=dict(color='black'),
               capprops=dict(color='black'), flierprops=dict(marker='o', color='red', markersize=5))
    ax.set_title(df_boxplot.columns[i], fontsize=10)
    ax.tick_params(axis='x', labelsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations of the numeric columns only. "date" is a datetime
# column at this point; selecting the numeric columns explicitly keeps it out
# of the computation regardless of the pandas version's corr() defaults.
correlation_matrix = df.select_dtypes(include="number").corr()
# Boolean mask hiding the upper triangle (diagonal included), since the matrix is symmetric
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
f, ax = plt.subplots(figsize=(25, 15))
sns.set(font_scale=1.5)
ax = sns.heatmap(correlation_matrix, mask=mask, annot=True, annot_kws={"size": 17}, linewidths=.5, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Checking for Correlations", fontsize=30)
plt.show()

In [None]:
df1 = df[["bedrooms", "sqft_living", "sqft_above", "yr_built", "sqft_living15", "grade", "price"]]

In [None]:
df1.corrwith(df1["price"]).sort_values(ascending=False) #checking correlations with the target

In [None]:
# Correlations among the selected features and the target
correlation_matrix = df1.corr()
# Ones on and above the diagonal, zeros below: hides the redundant upper triangle
mask = np.triu(np.ones_like(correlation_matrix))
f, ax = plt.subplots(figsize=(25, 15))
sns.set(font_scale=1.5)
ax = sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    annot_kws={"size": 17},
    linewidths=.5,
    cmap="coolwarm",
    fmt=".2f",
    ax=ax,
)
ax.set_title("Checking for Multicollinearity", fontsize=30)
plt.show()

### Checking Multicollinearity:

1. Try going on with all features.
2. Drop some features based on Multicollinearity and test the metrics

Drop?:
- sqft_above
- sqft_living15
- grade

We decided not to drop these features: doing so would leave only a few features, too few for the comparison to be meaningful.

In [None]:
# Replace each date by its proleptic Gregorian ordinal (an integer)
df["date"] = df["date"].map(datetime.toordinal)

In [None]:
#Creating a duplicate df to keep the original
housing_new_df = df.copy()

In [None]:
X= housing_new_df.drop(columns= "price")
y= housing_new_df["price"]

In [None]:
X1= df1.drop(columns= "price")
y1= df1["price"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)

In [None]:
print(f'100% of our data: {len(housing_new_df)}.')
print(f'70% for training data: {len(X_train)}.')
print(f'30% for test data: {len(X_test)}.')

In [None]:
print(f'100% of our data: {len(df1)}.')
print(f'70% for training data: {len(X1_train)}.')
print(f'30% for test data: {len(X1_test)}.')

## Normalization, Standardization

In [None]:
# normalization had no effect

"""
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
"""

In [None]:
# standardization had no effect

"""
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
"""

## Testing different Models

1. Model with all features
2. Model with reduced number of features

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#linear regression

model= LinearRegression()
model.fit(X_train, y_train)

In [None]:
#linear regression

model1= LinearRegression()
model1.fit(X1_train, y1_train)

In [None]:
predictions= model.predict(X_test)
predictions

In [None]:
predictions1= model1.predict(X1_test)
predictions1

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Test-set metrics for the linear regression fitted on X_train
r2_linear = r2_score(y_test, predictions)
MSE_linear = mean_squared_error(y_test, predictions)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_linear = np.sqrt(MSE_linear)
MAE_linear = mean_absolute_error(y_test, predictions)
print("R2 = ", round(r2_linear, 4))
print("RMSE = ", round(RMSE_linear, 4))
print("MSE =  ", round(MSE_linear, 4))
print("MAE = ", round(MAE_linear, 4))

In [None]:
# Test-set metrics for the linear regression fitted on X1_train
r2_linear_1 = r2_score(y1_test, predictions1)
MSE_linear_1 = mean_squared_error(y1_test, predictions1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_linear_1 = np.sqrt(MSE_linear_1)
MAE_linear_1 = mean_absolute_error(y1_test, predictions1)
print("R2 = ", round(r2_linear_1, 4))
print("RMSE = ", round(RMSE_linear_1, 4))
print("MSE =  ", round(MSE_linear_1, 4))
print("MAE = ", round(MAE_linear_1, 4))

In [None]:
#ridge regression
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)

In [None]:
#ridge regression
from sklearn.linear_model import Ridge
ridge1 = Ridge()
ridge1.fit(X1_train, y1_train)

In [None]:
predictions_ridge = ridge.predict(X_test)

In [None]:
predictions_ridge1 = ridge1.predict(X1_test)

In [None]:
# Test-set metrics for the ridge regression fitted on X_train
r2_ridge = r2_score(y_test, predictions_ridge)
MSE_ridge = mean_squared_error(y_test, predictions_ridge)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_ridge = np.sqrt(MSE_ridge)
MAE_ridge = mean_absolute_error(y_test, predictions_ridge)
print("R2 = ", round(r2_ridge, 4))
print("RMSE = ", round(RMSE_ridge, 4))
print("MSE = ", round(MSE_ridge, 4))
print("MAE = ", round(MAE_ridge, 4))

In [None]:
# Test-set metrics for the ridge regression fitted on X1_train
r2_ridge_1 = r2_score(y1_test, predictions_ridge1)
MSE_ridge_1 = mean_squared_error(y1_test, predictions_ridge1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_ridge_1 = np.sqrt(MSE_ridge_1)
MAE_ridge_1 = mean_absolute_error(y1_test, predictions_ridge1)
print("R2 = ", round(r2_ridge_1, 4))
print("RMSE = ", round(RMSE_ridge_1, 4))
print("MSE = ", round(MSE_ridge_1, 4))
print("MAE = ", round(MAE_ridge_1, 4))

In [None]:
#Lasso regression
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)

In [None]:
#Lasso regression
from sklearn.linear_model import Lasso
lasso1 = Lasso()
lasso1.fit(X1_train, y1_train)

In [None]:
predictions_lasso = lasso.predict(X_test)

In [None]:
predictions_lasso1 = lasso1.predict(X1_test)

In [None]:
# Test-set metrics for the lasso regression fitted on X_train
r2_lasso = r2_score(y_test, predictions_lasso)
MSE_lasso = mean_squared_error(y_test, predictions_lasso)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_lasso = np.sqrt(MSE_lasso)
MAE_lasso = mean_absolute_error(y_test, predictions_lasso)
print("R2 = ", round(r2_lasso, 4))
print("RMSE = ", round(RMSE_lasso, 4))
print("MSE = ", round(MSE_lasso, 4))
print("MAE = ", round(MAE_lasso, 4))

In [None]:
# Test-set metrics for the lasso regression fitted on X1_train
r2_lasso_1 = r2_score(y1_test, predictions_lasso1)
MSE_lasso_1 = mean_squared_error(y1_test, predictions_lasso1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_lasso_1 = np.sqrt(MSE_lasso_1)
MAE_lasso_1 = mean_absolute_error(y1_test, predictions_lasso1)
print("R2 = ", round(r2_lasso_1, 4))
print("RMSE = ", round(RMSE_lasso_1, 4))
print("MSE = ", round(MSE_lasso_1, 4))
print("MAE = ", round(MAE_lasso_1, 4))

In [None]:
#decision tree regression
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)

In [None]:
#decision tree regression
from sklearn.tree import DecisionTreeRegressor
tree1 = DecisionTreeRegressor()
tree1.fit(X1_train, y1_train)

In [None]:
predictions_tree = tree.predict(X_test)

In [None]:
predictions_tree1 = tree1.predict(X1_test)

In [None]:
# Test-set metrics for the decision tree fitted on X_train
r2_tree = r2_score(y_test, predictions_tree)
MSE_tree = mean_squared_error(y_test, predictions_tree)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_tree = np.sqrt(MSE_tree)
MAE_tree = mean_absolute_error(y_test, predictions_tree)
print("R2 = ", round(r2_tree, 4))
print("RMSE = ", round(RMSE_tree, 4))
print("MSE = ", round(MSE_tree, 4))
print("MAE = ", round(MAE_tree, 4))

In [None]:
# Test-set metrics for the decision tree fitted on X1_train
r2_tree_1 = r2_score(y1_test, predictions_tree1)
MSE_tree_1 = mean_squared_error(y1_test, predictions_tree1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_tree_1 = np.sqrt(MSE_tree_1)
MAE_tree_1 = mean_absolute_error(y1_test, predictions_tree1)
print("R2 = ", round(r2_tree_1, 4))
print("RMSE = ", round(RMSE_tree_1, 4))
print("MSE = ", round(MSE_tree_1, 4))
print("MAE = ", round(MAE_tree_1, 4))

In [None]:
#KNN regression
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

In [None]:
#KNN regression
from sklearn.neighbors import KNeighborsRegressor
knn1 = KNeighborsRegressor()
knn1.fit(X1_train, y1_train)

In [None]:
predictions_knn = knn.predict(X_test)

In [None]:
predictions_knn1 = knn1.predict(X1_test)

In [None]:
# Test-set metrics for the KNN regressor fitted on X_train
r2_knn = r2_score(y_test, predictions_knn)
MSE_knn = mean_squared_error(y_test, predictions_knn)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_knn = np.sqrt(MSE_knn)
MAE_knn = mean_absolute_error(y_test, predictions_knn)
print("R2 = ", round(r2_knn, 4))
print("RMSE = ", round(RMSE_knn, 4))
print("MSE = ", round(MSE_knn, 4))
print("MAE = ", round(MAE_knn, 4))

In [None]:
# Test-set metrics for the KNN regressor fitted on X1_train
r2_knn_1 = r2_score(y1_test, predictions_knn1)
MSE_knn_1 = mean_squared_error(y1_test, predictions_knn1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_knn_1 = np.sqrt(MSE_knn_1)
MAE_knn_1 = mean_absolute_error(y1_test, predictions_knn1)
print("R2 = ", round(r2_knn_1, 4))
print("RMSE = ", round(RMSE_knn_1, 4))
print("MSE = ", round(MSE_knn_1, 4))
print("MAE = ", round(MAE_knn_1, 4))

In [None]:
#xgboost regression
import xgboost as xgb
# Use the gradient-boosting estimator rather than XGBRFRegressor (xgboost's
# random-forest variant), so that this baseline is the same kind of model as
# the XGBRegressor that is tuned and compared against it later on.
xgbr = xgb.XGBRegressor()
xgbr.fit(X_train, y_train)

In [None]:
#xgboost regression
import xgboost as xgb
# Use the gradient-boosting estimator rather than XGBRFRegressor (xgboost's
# random-forest variant), so that the model matches its "XGBoost" label in the
# final comparison table.
xgbr1 = xgb.XGBRegressor()
xgbr1.fit(X1_train, y1_train)

In [None]:
predictions_xgb = xgbr.predict(X_test)

In [None]:
predictions_xgb1 = xgbr1.predict(X1_test)

In [None]:
# Test-set metrics for the xgboost model fitted on X_train
r2_boost = r2_score(y_test, predictions_xgb)
MSE_boost = mean_squared_error(y_test, predictions_xgb)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_boost = np.sqrt(MSE_boost)
MAE_boost = mean_absolute_error(y_test, predictions_xgb)
print("R2 = ", round(r2_boost, 4))
print("RMSE = ", round(RMSE_boost, 4))
print("MSE = ", round(MSE_boost, 4))
print("MAE = ", round(MAE_boost, 4))

In [None]:
# Test-set metrics for the xgboost model fitted on X1_train
r2_boost_1 = r2_score(y1_test, predictions_xgb1)
MSE_boost_1 = mean_squared_error(y1_test, predictions_xgb1)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_boost_1 = np.sqrt(MSE_boost_1)
MAE_boost_1 = mean_absolute_error(y1_test, predictions_xgb1)
print("R2 = ", round(r2_boost_1, 4))
print("RMSE = ", round(RMSE_boost_1, 4))
print("MSE= ", round(MSE_boost_1, 4))
print("MAE = ", round(MAE_boost_1, 4))

## Linear Model Tuning

Dealing with multicollinearity and outliers within the Linear Regression Model

In [None]:
df2 = df.copy()

In [None]:
df2.columns

In [None]:
df2.drop(columns=["sqft_living","sqft_above","id", "date","sqft_lot15","yr_renovated","waterfront","view"],inplace=True)

In [None]:
#Dealing with outliers

def tukeys_test_outliers(data):
    """Remove values that lie outside Tukey's fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. For a DataFrame
    the quartiles, and therefore the fences, are computed per column.

    Values lying exactly on a fence are kept: Tukey's rule only flags values
    strictly beyond the fences. This also means a column whose IQR is zero
    keeps the values equal to its quartiles instead of losing every value.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Numeric data to filter.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        For a Series, only the entries within the fences. For a DataFrame,
        a frame of the same shape in which outlying cells are NaN; call
        ``dropna()`` afterwards to discard the affected rows.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    # Define bounds for the outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Inclusive comparisons so that values sitting on a fence are not discarded
    within_fences = (data >= lower_bound) & (data <= upper_bound)

    return data[within_fences]

In [None]:
df2 = tukeys_test_outliers(df2)

In [None]:
df2.dropna(inplace=True)

In [None]:
X2= df2.drop(columns= "price")
y2= df2["price"]

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#linear regression

model_2= LinearRegression()
model_2.fit(X2_train, y2_train)

In [None]:
predictions2= model_2.predict(X2_test)
predictions2

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Test-set metrics for the linear regression fitted on X2_train
r2_linear2 = r2_score(y2_test, predictions2)
MSE_linear2 = mean_squared_error(y2_test, predictions2)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_linear2 = np.sqrt(MSE_linear2)
MAE_linear2 = mean_absolute_error(y2_test, predictions2)
print("R2 = ", round(r2_linear2, 4))
print("RMSE = ", round(RMSE_linear2, 4))
print("MSE =  ", round(MSE_linear2, 4))
print("MAE = ", round(MAE_linear2, 4))

## Gridsearch for XGBoost Tuning

In [None]:
# Gridsearch for XGBoost Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Base estimator for the hyper-parameter search. This rebinds `xgbr`, which
# earlier in the notebook held the baseline xgboost model.
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42)
# NOTE(review): GridSearchCV presumably refits its own clones of the estimator,
# which would make this fit redundant — confirm before removing.
xgbr.fit(X_train, y_train)

In [None]:
# Hyper-parameter grid for the search:
# 4 * 4 * 4 * 4 * 6 = 1536 parameter combinations.
param_grid = {
    "n_estimators": [50, 100, 200, 300],
    "max_depth": [3, 5, 6, 7],
    "learning_rate": [0.01, 0.1, 0.2, 0.3],
    "colsample_bytree": [0.3, 0.6, 0.9, 1],
    "gamma": [0, 0.1, 0.2, 1, 2, 3]
}

In [None]:
grid_search = GridSearchCV(estimator=xgbr, param_grid=param_grid, scoring="r2", n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

In [None]:
results = pd.DataFrame(grid_search.cv_results_)
results

In [None]:
grid_search.best_params_

In [None]:
# Refit xgboost with fixed hyper-parameters and evaluate it on the test set.
# NOTE(review): the values are hard-coded; presumably they were copied from
# grid_search.best_params_ — confirm they still match after re-running the search.
xgbr_best = xgb.XGBRegressor(colsample_bytree=0.3, gamma=0, learning_rate=0.2, max_depth=5, n_estimators=300, random_state=42)
xgbr_best.fit(X_train, y_train)

predictions_xgb_best = xgbr_best.predict(X_test)

r2_boost_best = r2_score(y_test, predictions_xgb_best)
MSE_boost_best = mean_squared_error(y_test, predictions_xgb_best)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_boost_best = np.sqrt(MSE_boost_best)
MAE_boost_best = mean_absolute_error(y_test, predictions_xgb_best)
print("R2 = ", round(r2_boost_best, 4))
print("RMSE = ", round(RMSE_boost_best, 4))
print("MSE = ", round(MSE_boost_best, 4))
print("MAE = ", round(MAE_boost_best, 4))

In [None]:
# Final comparison: collect the test-set metrics of every model, one list entry per model.
# Entries labelled "_clean" hold the metrics of the models fitted on X1_train (the `*_1`
# variables), "Linear Regression_tuning" those of the model fitted on X2_train, and
# "XGBoost_tuned" those of `xgbr_best`. All lists follow the order of the 'Model' list.
metrics = {
    'Model': ['Linear Regression','Linear Regression_clean', 'Linear Regression_tuning', 'Ridge', 'Ridge_clean', 'Lasso', 'Lasso_clean', 'Decision Tree', 'Decision Tree_clean', 'KNN', 'KNN_clean', 'XGBoost', 'XGBoost_clean', 'XGBoost_tuned'],
    'R²': [r2_linear, r2_linear_1, r2_linear2, r2_ridge, r2_ridge_1, r2_lasso, r2_lasso_1, r2_tree, r2_tree_1, r2_knn, r2_knn_1, r2_boost, r2_boost_1, r2_boost_best],
    'RMSE': [RMSE_linear, RMSE_linear_1, RMSE_linear2, RMSE_ridge, RMSE_ridge_1, RMSE_lasso, RMSE_lasso_1, RMSE_tree, RMSE_tree_1, RMSE_knn, RMSE_knn_1, RMSE_boost, RMSE_boost_1, RMSE_boost_best],
    'MSE': [MSE_linear, MSE_linear_1, MSE_linear2, MSE_ridge, MSE_ridge_1, MSE_lasso, MSE_lasso_1, MSE_tree, MSE_tree_1, MSE_knn, MSE_knn_1, MSE_boost, MSE_boost_1, MSE_boost_best],
    'MAE': [MAE_linear, MAE_linear_1, MAE_linear2, MAE_ridge, MAE_ridge_1, MAE_lasso, MAE_lasso_1, MAE_tree, MAE_tree_1, MAE_knn, MAE_knn_1, MAE_boost, MAE_boost_1, MAE_boost_best]
}

In [None]:
# Recompute the metrics of the linear regression fitted on X2_train
r2_linear2 = r2_score(y2_test, predictions2)
MSE_linear2 = mean_squared_error(y2_test, predictions2)
# RMSE is the root of the MSE; the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
RMSE_linear2 = np.sqrt(MSE_linear2)
MAE_linear2 = mean_absolute_error(y2_test, predictions2)

In [None]:
# Tabulate the collected metrics, one row per model, indexed by the model name
df_metrics = pd.DataFrame(metrics).set_index("Model")
df_metrics.round(2)