Building a House Price Prediction Model

Import all the libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import scipy.stats as stats

SECTION - 1

Load the CSV dataset

In [None]:
# Load the training data into `df`.
# A missing file is re-raised rather than handled with exit(): inside a
# notebook, exit() asks the kernel to shut down instead of stopping cleanly,
# whereas an exception halts "Run All" at this cell with a readable message.
try:
    df = pd.read_csv("train.csv")
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'train.csv' not found. Please place the dataset in the correct folder/working directory."
    ) from err
print("Dataset loaded")

Initial data overview

In [None]:
# First look at the data: overall size, a five-row preview, then per-column
# dtypes and non-null counts.
n_rows, n_cols = df.shape
print(f"The dataset has {n_rows} rows and {n_cols} columns")
print(f"\nFirst 5 rows and {n_cols} columns of the dataset:")
print(df.head(5))
print("\n---- Dataset information (data types and non-null counts) ----")
df.info()

SECTION - 2

Exploratory Data Analysis (EDA)

In [None]:
# Histogram with a KDE overlay of the raw target, followed by its sample skewness.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df["SalePrice"], kde=True, ax=ax)
ax.set_title("Distribution of SalePrice (Original)")
ax.set_xlabel("Sale Price ($)")
ax.set_ylabel("Frequency")
plt.show()

original_skew = df["SalePrice"].skew()
print(f"Skewness of original SalePrice: {original_skew:.2f}")

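The skewness is high, so we apply a log transform to make the distribution more symmetric. 
The skew is moderate rather than extreme: too large to ignore, but not so large that a log transform cannot correct it. 
We use log1p instead of log because log(0) = -infinity, whereas log1p(x) = log(1 + x) is defined at x = 0.  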

In [None]:
# Add the log-scale target as a new column; the original SalePrice column is kept.
# log1p(x) = log(1 + x); np.expm1 is its inverse and is used later to convert
# predictions back to dollars.
df["SalePrice_Log"] = np.log1p(df["SalePrice"])

In [None]:
# Same histogram/KDE view as for the raw prices, now on the log-transformed
# target, followed by its sample skewness.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df["SalePrice_Log"], kde=True, color="green", ax=ax)
ax.set_title("Distribution of SalePrice (Log-Transformed)")
ax.set_xlabel("Log of Sale Price")
ax.set_ylabel("Frequency")
plt.show()

log_skew = df["SalePrice_Log"].skew()
print(f"Skewness of log-transformed SalePrice: {log_skew:.2f}")

Now we will plot the log-transformed price against other variables to see how they relate to each other.

In [None]:
# Scatter of living area against the log-scale target.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df["GrLivArea"], y=df["SalePrice_Log"], ax=ax)
ax.set_title("GrLivArea vs. Log-Transformed SalePrice")
ax.set_xlabel("Above Grade Living Area (sq. ft.)")
ax.set_ylabel("Log of Sale Price")
plt.show()

The plot shows a clear positive linear relationship (as size increases, so does the log price), which is good for our model.

SECTION - 3

Handling Missing Data

In [None]:
# Candidate predictor columns: living area, bedrooms, and the four bathroom
# counts that are later combined into one feature.
selected_features_for_check = [
    "GrLivArea", "BedroomAbvGr",
    "FullBath", "HalfBath",
    "BsmtFullBath", "BsmtHalfBath",
]

# Per-column count of null entries; all zeros means no imputation is needed.
print("\n--- Missing Value Check ---")
missing_counts = df[selected_features_for_check].isna().sum()
print(missing_counts)

No missing data found.

We combine the four bathroom-related columns into a single, more meaningful feature.

In [None]:
# Collapse the four bathroom counts into one feature: full baths (above grade
# and basement) count as 1 each, half baths as 0.5 each.
full_baths = df["FullBath"] + df["BsmtFullBath"]
half_baths = df["HalfBath"] + df["BsmtHalfBath"]
df["TotalBathrooms"] = full_baths + 0.5 * half_baths
print("\n'TotalBathrooms' feature created successfully.")

Now we have 4 columns to manage: three predictors (GrLivArea, BedroomAbvGr, TotalBathrooms) and the target (SalePrice_Log).

Dataset Partitioning: Train-Test Split

In [None]:
# Final predictor set and the log-scale target the model will be fit on.
features = ["GrLivArea", "BedroomAbvGr", "TotalBathrooms"]
target = "SalePrice_Log"

X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nData split into training and testing sets:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Feature Scaling: This is important because our features are on different scales.

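Comparing apples and oranges: it is like comparing scores from two exams graded on very different scales. 
GrLivArea (Square Footage) is the History Exam (values in the thousands). 
BedroomAbvGr (Number of Bedrooms) is the Math Exam (values like 2, 3, 4).
Standardizing puts both on a common scale so their coefficients can be compared directly.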

In [None]:

# Fit on the training split only (this is where the per-feature mean and
# standard deviation are computed), then apply that same transformation to
# both splits so the test set never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

print("\nFeatures scaled successfully using StandardScaler.")

SECTION - 4

Training the Linear Regression Model

In [None]:
# Ordinary least squares on the standardized training features, with the
# log-scale price as the target.
model = LinearRegression().fit(X_train_scaled, y_train)
print("\nLinear Regression model trained successfully.")

Model Interpretation and Coefficients [Y = m*X + b]

In [None]:
# Intercept of the fitted model (features were standardized before fitting).
print("Interpretation: It is the baseline log-price.")
print(f"Intercept: {model.intercept_:.4f}")

# One coefficient per feature, labelled by feature name for readability.
print("Coefficient: It represents the expected change in the log-price for a one standard deviation increase in that feature, holding others constant.")
coefficients = pd.DataFrame({"Coefficient": model.coef_}, index=features)
print(coefficients)

Making predictions

In [None]:
# Predict on the held-out set; the model outputs log1p(price).
y_pred_log = model.predict(X_test_scaled)

# expm1 inverts log1p, bringing both predictions and true values back to dollars.
y_pred_dollars, y_test_dollars = np.expm1(y_pred_log), np.expm1(y_test)

print("\nPredictions made on the test set and converted back to dollar amounts.")

SECTION - 5

Model Performance Evaluation

In [None]:
# --- Log scale: the scale the model was actually trained on ---
r2_log = r2_score(y_test, y_pred_log)
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))

print(f"R-squared (Log Scale): {r2_log:.4f}")
print(f"R-squared_score: The model's features (square footage, bedrooms, etc.) explains {r2_log*100:.2f}% of the variance in the log-transformed sale prices.")
print(f"\nRMSE (Log Scale): {rmse_log:.4f}")

# --- Dollar scale: easier to interpret ---
rmse_dollars = np.sqrt(mean_squared_error(y_test_dollars, y_pred_dollars))
# RMSE squares the errors before averaging, so large misses weigh more and it
# is not the mean error. MAE is the statistic that means "off by X on average",
# so the plain-language sentence below is based on MAE.
mae_dollars = np.mean(np.abs(np.asarray(y_test_dollars) - np.asarray(y_pred_dollars)))

print(f"\nRMSE (Dollar Scale): ${rmse_dollars:,.2f}")
print(f"MAE (Dollar Scale): ${mae_dollars:,.2f}")
print(f"On average, its predictions are off by about ${mae_dollars:,.0f}")

The difference between actual and predicted values

In [None]:
# Residuals on the log scale: actual minus predicted.
residuals = y_test - y_pred_log

fig, (ax_resid, ax_qq) = plt.subplots(1, 2, figsize=(14, 6))

# A. Homoscedasticity check: residuals vs. predicted values.
# A healthy fit shows a patternless cloud around the dashed y=0 line.
sns.scatterplot(x=y_pred_log, y=residuals, ax=ax_resid)
ax_resid.axhline(y=0, color="r", linestyle="--")
ax_resid.set_title("Residuals vs. Predicted Values")
ax_resid.set_xlabel("Predicted Log(SalePrice)")
ax_resid.set_ylabel("Residuals")

# B. Normality check: Q-Q plot of the residuals against a normal distribution.
# Points hugging the reference line indicate approximately normal residuals.
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title("Q-Q Plot of Residuals")

fig.tight_layout()
plt.show()

Diagnostic Checks Summary
1. Homoscedasticity Plot: The points appear randomly scattered around the zero line, suggesting the assumption of constant variance (homoscedasticity) is reasonably met. There is no obvious cone shape.
2. Q-Q Plot: The points fall mostly along the diagonal line, indicating that the residuals are approximately normally distributed, satisfying another key assumption of linear regression.