# House Price Project

In [201]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    root_mean_squared_error,
    mean_absolute_percentage_error,
)
import joblib

In [202]:
# Load the training data; the bare last expression previews its first rows.
df_train = pd.read_csv("training_set.csv")
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [203]:
# Load the set to predict on and preview it.
# Its preview has no SalePrice column: that is the value predicted at the end.
df_test = pd.read_csv("sample_set.csv")
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [204]:
# Summarise dtypes and non-null counts to spot columns with missing data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [205]:
# Count missing values per column, then show only the columns that have any.
m = df_train.isna().sum()
m[m > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [206]:
# Number of fully duplicated rows in the training data.
df_train.duplicated().sum()

np.int64(0)

In [207]:
# Separate identifiers, target and predictors.
# - Y: the target variable (SalePrice) from the training set.
# - X: the training predictors, i.e. df_train without 'Id' and 'SalePrice'.
# - ID_test: the test-set Ids, kept aside to label the submission rows.
# - X_test: the test predictors, i.e. df_test without 'Id'.
# drop(columns=...) removes columns (the same effect as passing axis=1).
Y = df_train["SalePrice"]
X = df_train.drop(columns=["Id", "SalePrice"])

ID_test = df_test["Id"]
X_test = df_test.drop(columns="Id")

In [208]:
# Preview the target values.
Y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [209]:
# Preview the predictors (Id and SalePrice have been dropped).
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [210]:
# Impute missing numeric values with per-column medians.
# The medians are computed once, on the training predictors only, and reused for
# the test set. Both sets are therefore imputed with the same values and no
# statistic is learned from the rows being predicted (the same fit-on-train,
# apply-to-test convention the StandardScaler cell follows).
# numeric_only=True restricts the medians to numeric columns, so missing values
# in non-numeric columns are not filled here.
train_medians = X.median(numeric_only=True)

X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [211]:
# One-hot encode the categorical columns of both predictor frames.
# pd.get_dummies replaces each categorical column with one indicator column per
# category. The two frames are encoded independently, so their resulting column
# sets can differ.
encoded_train, encoded_test = map(pd.get_dummies, (X, X_test))

In [212]:
# Give the encoded training and test frames an identical column layout.
# all_features is the union of both column sets; reindexing each frame onto it
# adds any indicator column the frame lacks, filled with 0.
all_features = encoded_train.columns.union(encoded_test.columns)

aligned_train, aligned_test = (
    frame.reindex(columns=all_features, fill_value=0)
    for frame in (encoded_train, encoded_test)
)

In [213]:
# Standardise the features (remove the mean, scale to unit variance).
# The scaler learns its statistics from the training frame only; the same fitted
# scaler is then applied to both frames.
scaler = StandardScaler().fit(aligned_train)

X_scaled = scaler.transform(aligned_train)  # standardised training set
X_test_scaled = scaler.transform(aligned_test)  # standardised test set

In [214]:
# Build two candidate feature selectors from fitted linear models.
# Each SelectFromModel wraps an estimator already fitted on the scaled training
# data (hence prefit=True) and keeps the features whose coefficients pass the
# selector's default threshold.
lasso_selector = SelectFromModel(
    estimator=Lasso(alpha=1.0).fit(X_scaled, Y),
    prefit=True,
)
ridge_selector = SelectFromModel(
    estimator=Ridge(alpha=1.0).fit(X_scaled, Y),
    prefit=True,
)

# Reduce the scaled training matrix to each selector's feature subset.
X_lasso_sel = lasso_selector.transform(X_scaled)
X_ridge_sel = ridge_selector.transform(X_scaled)

In [215]:
# Function to evaluate the model.
# It is defined in this cell, ahead of its first call, so the notebook runs
# top-to-bottom on a fresh kernel. Calling it before any definition has been
# executed raises NameError.
def evaluate_model(model, x, y, name):
    """Print MAE, RMSE, MAPE and R2 for a fitted model's predictions.

    Args:
        model: Fitted estimator exposing ``predict(x)``.
        x: Feature matrix passed to ``model.predict``.
        y: True target values the predictions are compared against.
        name: Label printed in the report header.

    Returns:
        None. The metrics are printed, not returned.
    """
    y_pred = model.predict(x)
    mae = mean_absolute_error(y, y_pred)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    r2 = r2_score(y, y_pred)
    mape = mean_absolute_percentage_error(y, y_pred)
    print(f"{name} Evaluation Metrics:")
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAPE: {mape:.2%}")
    print(f"R2: {r2:.2%}\n")


# Compare the two feature subsets: fit a Ridge(alpha=1.0) on each subset and
# report its metrics on the same training rows it was fitted on.
evaluate_model(
    Ridge(alpha=1.0).fit(X_lasso_sel, Y), X_lasso_sel, Y, "Lasso-based Selection"
)
evaluate_model(
    Ridge(alpha=1.0).fit(X_ridge_sel, Y), X_ridge_sel, Y, "Ridge-based Selection"
)

Lasso-based Selection Evaluation Metrics:
MAE: 13223.39
RMSE: 20523.63
MAPE: 7.81%
R2: 93.32%

Ridge-based Selection Evaluation Metrics:
MAE: 14527.02
RMSE: 22064.23
MAPE: 8.63%
R2: 92.28%



In [216]:
# Pick the feature selector whose subset gives the higher R^2.
# For each subset a Ridge(alpha=1.0) is fitted and scored on that same subset;
# np.argmax then returns the position of the larger score, which indexes into
# `selectors` (kept in the same order as the scores).
# NOTE(review): the R^2 values are computed on the rows the Ridge was fitted on,
# so this is an in-sample comparison.
selectors = [lasso_selector, ridge_selector]
r2_vals = [
    r2_score(Y, Ridge(alpha=1.0).fit(features, Y).predict(features))
    for features in (X_lasso_sel, X_ridge_sel)
]
best_selector = selectors[int(np.argmax(r2_vals))]

# Apply the winning selector to both scaled matrices.
X_selected, X_test_selected = (
    best_selector.transform(matrix) for matrix in (X_scaled, X_test_scaled)
)

In [217]:
# Fit the two candidate models on the selected training features.
# Ridge is a linear model with L2 regularisation; Lasso uses L1 regularisation.
ridge_model = Ridge(alpha=1.0).fit(X_selected, Y)

# The Lasso fit stays a bare last expression so the cell still displays the
# fitted estimator, as before.
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_selected, Y)

In [218]:
def evaluate_model(model, x, y, name):
    """Print a short regression report for ``model`` evaluated on ``x`` / ``y``.

    Args:
        model: Fitted estimator exposing ``predict(x)``.
        x: Feature matrix passed to ``model.predict``.
        y: True target values the predictions are compared against.
        name: Label printed in the report header.

    Returns:
        None. The report (MAE, RMSE, MAPE, R2) is written to stdout.
    """
    predictions = model.predict(x)
    report = [
        f"{name} Evaluation Metrics:",
        f"MAE: {mean_absolute_error(y, predictions):.2f}",
        f"RMSE: {np.sqrt(mean_squared_error(y, predictions)):.2f}",
        f"MAPE: {mean_absolute_percentage_error(y, predictions):.2%}",
        # The trailing newline leaves a blank line after each report.
        f"R2: {r2_score(y, predictions):.2%}\n",
    ]
    print("\n".join(report))

In [219]:
# Report training-set metrics for both final models on the selected features.
evaluate_model(ridge_model, X_selected, Y, "Ridge")
evaluate_model(lasso_model, X_selected, Y, "Lasso")

Ridge Evaluation Metrics:
MAE: 13223.39
RMSE: 20523.63
MAPE: 7.81%
R2: 93.32%

Lasso Evaluation Metrics:
MAE: 13227.92
RMSE: 20518.70
MAPE: 7.81%
R2: 93.32%



In [222]:
# Choose the final model by R^2 on the selected training features.
# `models` holds the fitted candidates and `r2_scores` their scores in the same
# order, so the position returned by np.argmax (the first occurrence of the
# maximum) indexes straight back into `models`.
models = [ridge_model, lasso_model]
r2_scores = [
    r2_score(Y, candidate.predict(X_selected))
    for candidate in models
]
best_model = models[int(np.argmax(r2_scores))]

In [223]:
# Predict SalePrice for the test set with the winning model.
# X_test_selected is the scaled test matrix reduced to the features kept by best_selector.
Y_pred = best_model.predict(X_test_selected)

In [224]:
# Pair each test Id with its predicted SalePrice and write the result to CSV.
# index=False keeps the DataFrame index out of the file.
submission = pd.DataFrame({"Id": ID_test, "SalePrice": Y_pred})
submission.to_csv("Predicted_Results.csv", index=False)

In [226]:
# Persist the fitted model, scaler and feature selector to disk.
# NOTE(review): the aligned column list (all_features) is not saved; new data
# would need the same columns before the scaler can be applied. Confirm whether
# it should be persisted too.
joblib.dump(best_model, "best_model.joblib")
joblib.dump(scaler, "scaler.joblib")
joblib.dump(best_selector, "feature_selector.joblib")

['feature_selector.joblib']

In [227]:
# Reload the three artifacts written by this notebook to check they round-trip.
# joblib.load unpickles the file contents, so only load files from a trusted source.
b = joblib.load("best_model.joblib")  # model
s = joblib.load("scaler.joblib")  # scaler
f = joblib.load("feature_selector.joblib")  # feature selector

In [228]:
# Display the object reloaded from best_model.joblib.
b

In [229]:
# Display the object reloaded from scaler.joblib.
s

In [230]:
# Display the object reloaded from feature_selector.joblib.
f