In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from sklearn import set_config

# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays.
# NOTE(review): this option governs transform/fit_transform output; none of the cells
# visible here call a transformer, so confirm it is still needed.
set_config(transform_output="pandas")

In [None]:
# Load the Sacramento housing data and preview the first rows.
# TODO: treat `zip` as a categorical variable. It is read in as a number and is later
# passed to the regression as a numeric predictor.
sacramento = pd.read_csv("dataset/sacramento.csv")
sacramento.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,1005 MORENO WAY,SACRAMENTO,95838,CA,3,2,1410,Residential,Fri May 16 00:00:00 EDT 2008,180000,38.646206,-121.442767
1,10105 MONTE VALLO CT,SACRAMENTO,95827,CA,4,2,1578,Residential,Fri May 16 00:00:00 EDT 2008,190000,38.573917,-121.316916
2,10133 NEBBIOLO CT,ELK GROVE,95624,CA,4,3,2096,Residential,Fri May 16 00:00:00 EDT 2008,289000,38.391085,-121.347231
3,10165 LOFTON WAY,ELK GROVE,95757,CA,3,2,1540,Residential,Fri May 16 00:00:00 EDT 2008,266510,38.387708,-121.436522
4,10254 JULIANA WAY,SACRAMENTO,95827,CA,4,2,2484,Residential,Fri May 16 00:00:00 EDT 2008,331200,38.56803,-121.309966


### Training, evaluating, and tuning the model

In [7]:
# Split the sacramento dataset into 75% training data and 25% test data.
# random_state is fixed so the same rows land in each split on every run.
sacramento_train, sacramento_test = train_test_split(
    sacramento, train_size=0.75, random_state=42
)

In [None]:
# Fit a multivariable linear regression of sale price on three predictors.
# NOTE(review): `zip` enters the model as an ordinary number, so its slope treats ZIP
# codes as a quantity rather than as labels. Confirm this is intended.
mlm = LinearRegression()
mlm.fit(
    sacramento_train[["sq__ft", "beds", "zip"]],  # Three predictors: square footage, bedrooms, ZIP code
    sacramento_train["price"],  # Response: a series of house prices
)

# One-row summary of the fitted model:
#   slope1, slope2, slope3 -> mlm.coef_, in the same order as the predictor columns above
#                             (change in predicted price per unit change in that predictor,
#                             holding the other predictors fixed)
#   intercept              -> mlm.intercept_ (predicted price when every predictor is 0)
# The slope columns are generated from mlm.coef_ so the table always matches the
# number of predictors actually used in the fit.
pd.DataFrame(
    {
        **{f"slope{i}": [slope] for i, slope in enumerate(mlm.coef_, start=1)},
        "intercept": [mlm.intercept_],
    }
)

Unnamed: 0,slope1,slope2,slope3,intercept
0,165.348136,-31522.287667,-243.377595,23375080.0


In [17]:
# Fitted slopes, one per predictor, in the column order that was passed to fit
mlm.coef_

array([   165.34813614, -31522.2876674 ,   -243.37759481])

In [18]:
# Fitted intercept b_0 of the linear model
mlm.intercept_

23375076.64947027

In [20]:
# Predict sale prices for the held-out test rows.
# .assign builds a new frame with the extra column instead of writing into the object
# returned by train_test_split; in-place column assignment on that object can raise
# pandas' SettingWithCopyWarning on some versions. The result is the same frame with a
# "predicted" column, and the cell stays safe to re-run.
sacramento_test = sacramento_test.assign(
    predicted=mlm.predict(sacramento_test[["sq__ft", "beds", "zip"]])
)

# RMSPE of the multivariable model on the test set: the square root of the mean squared
# difference between actual and predicted prices (same units as price).
lm_mult_test_RMSPE = mean_squared_error(
    y_true=sacramento_test["price"],
    y_pred=sacramento_test["predicted"],
) ** 0.5

lm_mult_test_RMSPE

74375.60010949426

In [21]:
# Coefficient of determination (R²) of the test-set predictions
lm_mult_test_r2 = r2_score(
    y_true=sacramento_test["price"],
    y_pred=sacramento_test["predicted"],
)

lm_mult_test_r2

0.49342548094008054

### Cross-validation

In [26]:
# 5-fold cross-validation of the three-predictor linear model.
#
# How this relates to the already-fitted `mlm`:
#   - cross_validate does not reuse mlm's fitted coefficients. For each fold it fits a
#     fresh copy of the estimator on the other four folds and scores it on the held-out
#     fold, so there are five separate fits, each with its own slopes and intercept.
#   - Those per-fold fits only serve to produce the scores below; they are not kept
#     unless return_estimator=True is requested. Scores that vary a lot between folds
#     are a sign that the model may be unstable.
#   - The model used for prediction is still a single fit, with one intercept and one
#     set of slopes.
#
# NOTE(review): the folds are drawn from the full `sacramento` frame, which contains the
# rows of `sacramento_test`; pass `sacramento_train` instead if the test set should stay
# untouched during model assessment.
returned_dictionary_mlm = cross_validate(
    estimator=mlm,
    X=sacramento[["sq__ft", "beds", "zip"]],
    y=sacramento["price"],
    cv=5,  # number of folds
    scoring="neg_root_mean_squared_error",  # alternatively "r2"
)

# One row per fold: fit time, score time, and the fold's score. With this scorer the
# score is the RMSPE with its sign flipped, because scikit-learn scorers follow a
# "higher is better" convention.
cv_5_df_mlm = pd.DataFrame(returned_dictionary_mlm)
cv_5_df_mlm


Unnamed: 0,fit_time,score_time,test_score
0,0.004997,0.003002,-77141.734615
1,0.004999,0.004003,-91866.310125
2,0.003,0.002999,-60896.04787
3,0.010001,0.003001,-89750.122434
4,0.004999,0.006999,-74904.642395


In [27]:
# Turn the negated scores back into positive RMSPE values. abs() (rather than negation)
# gives the same result however many times this cell is run.
cv_5_df_mlm = cv_5_df_mlm.assign(test_score=cv_5_df_mlm["test_score"].abs())
cv_5_df_mlm

Unnamed: 0,fit_time,score_time,test_score
0,0.004997,0.003002,77141.734615
1,0.004999,0.004003,91866.310125
2,0.003,0.002999,60896.04787
3,0.010001,0.003001,89750.122434
4,0.004999,0.006999,74904.642395


In [28]:
# Aggregate across the 5 folds: "mean" is the average and "sem" the standard error of
# the mean of each column (the timing columns are aggregated as well).
cv_5_metrics_mlm = cv_5_df_mlm.agg(["mean","sem"])
cv_5_metrics_mlm


Unnamed: 0,fit_time,score_time,test_score
mean,0.005599,0.004001,78911.771488
sem,0.001167,0.000774,5608.236506


These steps can also be repeated with $R^2$ (`scoring="r2"`), our other main metric for model evaluation.

In [29]:
# Repeat the 5-fold cross-validation, this time scoring each fold by R².
returned_dictionary_mlm2 = cross_validate(
    estimator=mlm,
    X=sacramento[["sq__ft", "beds", "zip"]],
    y=sacramento["price"],
    cv=5,  # number of folds
    scoring="r2",
)

# Per-fold results (fit time, score time, R²). The frame is not shown here: a cell only
# renders its last expression, so use display(cv_5_df_mlm2) to see the per-fold rows too.
cv_5_df_mlm2 = pd.DataFrame(returned_dictionary_mlm2)

# Mean and standard error of the mean across the 5 folds
cv_5_metrics_mlm2 = cv_5_df_mlm2.agg(["mean", "sem"])
cv_5_metrics_mlm2

Unnamed: 0,fit_time,score_time,test_score
mean,0.005201,0.0036,0.538714
sem,0.000735,0.0004,0.017117


In [31]:
# What if the model is evaluated on the whole data set?
# NOTE(review): `mlm` was fitted on sacramento_train, which is 75% of these rows, so this
# figure mixes in-sample and out-of-sample error and should not be read as an estimate
# of performance on new data.
sacramento["predicted"] = mlm.predict(sacramento[["sq__ft", "beds", "zip"]])

# RMSPE over every row: square root of the mean squared prediction error
RMSPE_whole_set = mean_squared_error(
    sacramento["price"], sacramento["predicted"]
) ** 0.5

RMSPE_whole_set

78888.39853655756

In [32]:
# What if the model is evaluated on the whole data set?
# R² over every row, using the "predicted" column already stored on `sacramento`
r2_whole_set = r2_score(
    y_true=sacramento["price"],
    y_pred=sacramento["predicted"],
)

r2_whole_set

0.5646299275124241