# **HOUSE PRICES PREDICTION | EDA - PP - MODELING**

This notebook shows how I built a house price prediction model using Random Forest from scikit-learn.

## **IMPORT LIBRARIES**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import plot_tree
import warnings
warnings.filterwarnings('ignore')

## **LOAD DATASET**

In [None]:
# Directory holding the competition CSV files; both splits are read from it.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

## **EDA**

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Preview the first rows of the training frame (rendered as the cell output).
train.head()

In [None]:
# Summary statistics of the training frame, using pandas' default `describe` settings.
train.describe()

## **PREPROCESSING**

> ### **Missing Data**

In [None]:
# Partition the training columns by dtype: numeric vs. object (string-like).
numbers_columns = list(train.select_dtypes(include=np.number).columns)
numbers_categorial = list(train.select_dtypes(include=['object']).columns)

# Report the size and members of each group.
print("NUMBER OF NUMERICAL COLUMNS")
print(f"Number: {len(numbers_columns)}, Numerical Columns: {numbers_columns}\n\n")

print("NUMBER OF CATEGORIAL COLUMNS")
print(f"Number: {len(numbers_categorial)}, Categorial Columns: {numbers_categorial}")

In [None]:
# Feature-only views: the row identifier is removed from both splits, and the
# target is removed from the training split.
drop_train = train.drop(columns=['Id', 'SalePrice'])
test_drop = test.drop(columns=['Id'])

In [None]:
# Stack train features on top of test features under a fresh 0..n-1 index so
# both splits go through the same cleaning steps.
combine_data = pd.concat([drop_train, test_drop], axis=0, ignore_index=True)

# Per-column null counts, keeping only columns that have gaps, largest first.
null_counts = combine_data.isnull().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print(missing)

In [None]:
# Type 1: Missing data with reason
# For these categorical columns a missing value is treated as its own category
# and filled with the explicit label 'None'.
miss_cols_reason = ['Alley','PoolQC', 'MiscFeature', 'Fence','FireplaceQu','GarageType','GarageFinish','GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'MasVnrType',]
for col in miss_cols_reason:
    combine_data[col] = combine_data[col].fillna('None')

# A missing veneer area is filled with 0. This does not depend on the loop
# variable, so it runs once here instead of once per column.
combine_data['MasVnrArea'] = combine_data['MasVnrArea'].fillna(0)

In [None]:
# Type 2: Special Cases
def _fill_with_group_median(values):
    """Replace the gaps in one group's values with that group's median."""
    return values.fillna(values.median())

# Case 1: LotFrontage -- fill each gap with the median of the same Neighborhood.
lot_frontage_by_neighborhood = combine_data.groupby('Neighborhood')['LotFrontage']
combine_data['LotFrontage'] = lot_frontage_by_neighborhood.transform(_fill_with_group_median)

# Case 2: GarageYrBlt -- missing years become the sentinel -1
# (presumably houses without a garage; confirm against the data description).
combine_data['GarageYrBlt'] = combine_data['GarageYrBlt'].fillna(-1)


In [None]:
# Type 3: Truly Random Missing Data
# Any remaining gaps are filled per column: most frequent value for object
# columns, median for every other column.
miss_cat = combine_data.select_dtypes(include=['object']).columns
miss_num = combine_data.select_dtypes(exclude=['object']).columns

for name in miss_cat:
    most_frequent = combine_data[name].mode()[0]
    combine_data[name] = combine_data[name].fillna(most_frequent)

for name in miss_num:
    column_median = combine_data[name].median()
    combine_data[name] = combine_data[name].fillna(column_median)

In [None]:
# Null count per column after imputation, shown as the cell output.
combine_data.isna().sum()

> ### **Outliers**

In [None]:
# Cap numeric outliers with the 1.5 * IQR rule, then verify the result.
numerical_cols = combine_data.select_dtypes(include=np.number).columns.tolist()
combine_data_capped = combine_data.copy()

capped_cols = []   # columns that were actually clipped
skipped_cols = []  # columns left untouched because their IQR is zero
for col in numerical_cols:
    Q1 = combine_data[col].quantile(0.25)
    Q3 = combine_data[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # With Q1 == Q3 both bounds equal Q1, so clipping would overwrite every
        # value in the column with that single constant and erase its signal.
        skipped_cols.append(col)
        continue
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    combine_data_capped[col] = combine_data[col].clip(lower=lower_bound, upper=upper_bound)
    capped_cols.append(col)

print(f"Capped {len(capped_cols)} columns; left {len(skipped_cols)} zero-IQR columns unchanged: {skipped_cols}")

# Sanity check: re-apply the IQR test to each capped column and count any
# values still outside the recomputed bounds.
outlier_counts = {}
for col in capped_cols:
    Q1_new = combine_data_capped[col].quantile(0.25)
    Q3_new = combine_data_capped[col].quantile(0.75)
    IQR_new = Q3_new - Q1_new
    lower_new = Q1_new - 1.5 * IQR_new
    upper_new = Q3_new + 1.5 * IQR_new

    outside = (combine_data_capped[col] < lower_new) | (combine_data_capped[col] > upper_new)
    outlier_counts[col] = int(outside.sum())

outlier_df = pd.DataFrame(list(outlier_counts.items()), columns=['Feature', 'Outliers_After_Capping'])
outlier_df = outlier_df[outlier_df['Outliers_After_Capping'] > 0]

print("Outlier counts after capping:")
print(outlier_df if not outlier_df.empty else "No outliers detected after capping.")

## **FEATURE ENGINEERING**

In [None]:
# The combined frame was built as train rows followed by test rows, so the
# first len(train) rows are split off positionally to recover the two sets.
length_train = len(train)
X_train = combine_data_capped.iloc[:length_train].copy()
X_test = combine_data_capped.iloc[length_train:].copy()

# Target column, re-indexed from 0 (the combined frame was re-indexed the same way).
y_train = train['SalePrice'].reset_index(drop=True)

In [None]:
# Feature Engineering: the same derived columns are added to both splits,
# so each formula is written once and applied to each frame in turn.
for frame in (X_train, X_test):
    # TotalSF: basement + first floor + second floor area.
    frame['TotalSF'] = frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF']

    # TotalBathrooms: full baths count 1, half baths count 0.5 (above grade and basement).
    frame['TotalBathrooms'] = (frame['FullBath'] + (0.5 * frame['HalfBath']) +
                               frame['BsmtFullBath'] + (0.5 * frame['BsmtHalfBath']))

    # HouseAge: years between construction and sale.
    frame['HouseAge'] = frame['YrSold'] - frame['YearBuilt']

    # Quality x Size interaction.
    frame['OverallQual_SqFt'] = frame['OverallQual'] * frame['TotalSF']


In [None]:
# Training features with the target attached as a 'SalePrice' column (a new frame; X_train is untouched).
full_train = X_train.assign(SalePrice=y_train)

##  **ENCODE**

In [None]:
# One-hot encode train and test together so both get the same dummy columns
# and the same dropped baseline level per feature. Encoding each split on its
# own can drop different baselines and lose levels seen in only one split.
n_train_rows = len(X_train)
all_encoded = pd.get_dummies(pd.concat([X_train, X_test], axis=0), drop_first=True)

# Split back positionally: the first n_train_rows rows are the training set.
X_train_encoded = all_encoded.iloc[:n_train_rows].copy()
X_test_encoded = all_encoded.iloc[n_train_rows:].copy()

## **MODELING**

> ### **Model Validation and Evaluation**

In [None]:
# Hold out 20% of the training rows to estimate out-of-sample error.
X_temp, X_val, y_temp, y_val = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42
)
val_model = RandomForestRegressor(n_estimators=100,
                                  max_depth=20,
                                  min_samples_split=5,
                                  min_samples_leaf=2,
                                  random_state=42,
                                  n_jobs=-1
                                )
val_model.fit(X_temp, y_temp)
val_pred = val_model.predict(X_val)

# RMSE is in the target's units; R^2 is the explained-variance score.
rmse = np.sqrt(mean_squared_error(y_val, val_pred))
r2 = r2_score(y_val, val_pred)
print(f"Validation RMSE: {rmse:.2f}")
# "\u00b2" is the superscript two; the escape keeps the label intact if the
# file is re-encoded (the previous label was printed as mojibake).
print(f"Validation R\u00b2: {r2:.4f}")


> ### **Validation Visualization** 

In [None]:
# Two diagnostic panels for the hold-out predictions, side by side.
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# True vs Predicted: points on the dashed diagonal are perfect predictions.
sns.scatterplot(x=y_val, y=val_pred, alpha=0.6, ax=ax_fit)
price_range = [y_val.min(), y_val.max()]
ax_fit.plot(price_range, price_range, 'r--')
ax_fit.set_xlabel('True SalePrice')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title('Validation: True vs Pred')

# Residuals: true minus predicted, plotted against the prediction.
residuals = y_val - val_pred
sns.scatterplot(x=val_pred, y=residuals, alpha=0.6, ax=ax_resid)
ax_resid.axhline(0, color='red', linestyle='--')
ax_resid.set_xlabel('Predicted')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')

fig.tight_layout()
plt.show()

> ### **Final Training** 

In [None]:
# Refit on every training row, with the same hyperparameters as the validation model.
final_rf_params = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'random_state': 42,
    'n_jobs': -1,
}
rfr_model = RandomForestRegressor(**final_rf_params)
rfr_model.fit(X_train_encoded, y_train)

> ### **Interpretability**

In [None]:
# Draw the top levels of the first tree in the forest as an illustration of
# the learned splits (one tree only, not the whole ensemble).
plt.figure(figsize=(20,8))
plot_tree(rfr_model.estimators_[0],
          # A plain list of strings rather than a pandas Index, as the
          # plot_tree documentation specifies for feature_names.
          feature_names=X_train_encoded.columns.tolist(),
          filled=True,
          rounded=True,
          max_depth=3,  # limits the drawing depth, not the fitted tree
          fontsize=10)
plt.show()

## **SUBMISSION**

In [None]:
# Predict on the encoded test features and pair each prediction with its test Id.
y_pred_test = rfr_model.predict(X_test_encoded)
submission = test[['Id']].assign(SalePrice=y_pred_test)

# Write the two-column file (Id, SalePrice) without the index.
submission.to_csv('submission.csv', index=False)