<a href="https://www.kaggle.com/code/danuherath/house-prices-regression?scriptVersionId=187520555" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<h1 align="center"> Iowa House Prices Prediction (Regression) </h1>

<img 
    src="https://storage.googleapis.com/kaggle-media/competitions/kaggle/5407/media/housesbanner.png"
    alt="" 
    width="500" 
    height="400" 
    style="display: block; margin: 0 auto; border-radius:15px" 
/>

---

## Problem Definition

- Dataset

    - [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/competitions/home-data-for-ml-course/data) dataset from Kaggle, which contains 79 features explaining "(almost) every aspect of residential homes in Ames, Iowa". The training set contains 1,460 samples, and each instance represents one house.

<br>

- Objective
    - The goal of this project is to predict the sales price for each house based on the above features.

<br>

- The following regression algorithms are used to train models. The models are evaluated using the [mean_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html) metric.

    1. [LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)
    2. [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso)
    3. [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)
    4. [ElasticNet](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet)
    5. [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
    6. [XGBRegressor](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor)
    7. [BaggingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)
    8. [StackingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html)
    9. [VotingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html)
    
<br>



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import optuna

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the competition's training and test CSV files from the Kaggle input directory.
train_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
test_data = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')


In [None]:
# Preview the first rows of the training set.
train_data.head()


In [None]:
# Preview the first rows of the test set.
test_data.head()


---

# Step 1: Exploratory Data Analysis (EDA)

---

In [None]:
# Dimensions of the training set as (rows, columns).
train_data.shape
# test_data.shape


In [None]:
# Per-column dtype and non-null count for the training set.
train_data.info()


In [None]:
# Null count per training column, largest first, restricted to the columns
# that actually contain missing values.
missing_values = (
    train_data.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing_values
# missing_columns = missing_values.index
# missing_columns


In [None]:
# Null count per test column, largest first, restricted to the columns
# that actually contain missing values.
missing_values = (
    test_data.isnull()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)
missing_values
# missing_columns = missing_values.index
# missing_columns


In [None]:
# train_data.nunique()


In [None]:
# Number of fully duplicated rows once the Id column is left out of the comparison.
train_data.drop(columns='Id').duplicated().sum()


In [None]:
# Object-dtype columns are treated as categorical features.
cat_features = list(train_data.select_dtypes(include=['object']).columns)

# Numeric columns are the numerical features; the Id key and the SalePrice
# target are removed first so they are not treated as predictors.
num_features = list(
    train_data.drop(columns=['Id', 'SalePrice'])
    .select_dtypes(include=[np.number])
    .columns
)

print("Categorical columns:", cat_features, sep="\n")
print("\nNumerical columns:", num_features, sep="\n")


## 1.1 Statistical Analysis

---

In [None]:
# Summary statistics for the numerical features, rendered through the pandas
# Styler with horizontal scrolling and a minimum cell width of 100px.
train_data[num_features].describe().style.set_table_attributes(
    'style="overflow-x: auto; display: inline-block;"').set_properties(**{'min-width': '100px'})


In [None]:
# Flag outliers per numerical feature with the 1.5 * IQR rule.
Q1 = train_data[num_features].quantile(0.25)
Q3 = train_data[num_features].quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean frame: True where a value falls outside its column's fences.
outliers_iqr = (
    train_data[num_features].lt(lower_bound)
    | train_data[num_features].gt(upper_bound)
)

# Outliers per column, keeping only columns that have any, largest count first.
outliers_count = outliers_iqr.sum()
outliers_count = outliers_count.loc[lambda counts: counts > 0].sort_values(ascending=False)
print("Outliers Count: ", outliers_count, sep="\n")


In [None]:
# Summary statistics of the target variable.
train_data['SalePrice'].describe()


## 1.2 Data Visualization

---

In [None]:
# len(cat_features)


In [None]:
# Draw one bar chart per categorical feature showing how often each category occurs.
n_cols = 4
# Ceiling division: size the grid from the number of categorical features
# instead of hard-coding it, so the cell keeps working if the list changes.
n_rows = max(1, (len(cat_features) + n_cols - 1) // n_cols)
palette = sns.color_palette("crest")

# squeeze=False keeps `axes` two-dimensional even when the grid has a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
flat_axes = axes.flatten()

for ax, column in zip(flat_axes, cat_features):
    category_counts = train_data[column].value_counts()
    sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax, palette=palette)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=90)

# Remove the axes left unused at the end of the grid.
for ax in flat_axes[len(cat_features):]:
    fig.delaxes(ax)

plt.suptitle('Bar Charts - House Prices Features', fontsize=20, y=1.0)
plt.tight_layout(pad=4.0)
plt.savefig('Bar Charts - House Prices Features.png')
plt.show()


In [None]:
# len(num_features)


In [None]:
# Draw one histogram per numerical feature.
n_cols = 2
# Ceiling division: size the grid from the number of numerical features
# instead of hard-coding it, so the cell keeps working if the list changes.
n_rows = max(1, (len(num_features) + n_cols - 1) // n_cols)

# squeeze=False keeps `axes` two-dimensional even when the grid has a single row.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4*n_rows), squeeze=False)
flat_axes = axes.flatten()

for ax, column in zip(flat_axes, num_features):
    sns.histplot(train_data[column], ax=ax, color='seagreen')
    ax.set_ylabel('Frequency')

# Remove the axes left unused at the end of the grid (odd feature counts).
for ax in flat_axes[len(num_features):]:
    fig.delaxes(ax)

plt.suptitle('Histograms - House Prices Features', fontsize=20, y=1.0)
plt.tight_layout(pad=4.0)
plt.savefig('Histograms - House Prices Features.png')
plt.show()


In [None]:
# Pairwise correlations between the numerical features, rounded to two
# decimals so the cell annotations stay readable.
corr = train_data[num_features].corr().round(2)

# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower triangle is drawn.
mask = np.triu(np.ones(corr.shape, dtype=bool))
cmap = sns.light_palette("darkgreen", as_cmap=True)

plt.figure(figsize=(25, 20))
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    cmap=cmap,
    linewidths=0.2,
)
plt.title('Heatmap - House Prices Dataset Feature Correlations', fontsize=20)

plt.savefig('House Prices Dataset Feature Correlations.png')
plt.show()


### Conclusion:

* No duplicated rows are present
* Several features have null values
* "PoolQC, MiscFeature, Alley, Fence" features have more than 50% missing values
* Several features have outliers
* 36 features contain numerical data
* 43 features contain categorical data


---

# Step 2. Data Cleaning & Feature Engineering

---

In [None]:
# Id is only a row identifier and carries no predictive signal, so remove it
# from both sets. errors='ignore' makes the cell safe to re-run: without it a
# second execution raises KeyError because the column is already gone.
train_data.drop("Id", axis=1, inplace=True, errors='ignore')
test_data.drop("Id", axis=1, inplace=True, errors='ignore')


---

# Step 3. Data Pre-processing

---

In [None]:
# Independent copies of the feature lists for the preprocessing step, so that
# later edits to them do not touch the lists built during EDA.
numerical_cols = list(num_features)
categorical_cols = list(cat_features)


In [None]:
# Numerical branch: fill gaps with the column median, then scale with
# RobustScaler.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode while dropping the first level of every feature.
# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros, which appears to coincide with the encoding of the dropped first
# category - confirm this is acceptable for the test set.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Route each column group to its branch; columns in neither list are passed
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_pipeline, numerical_cols),
        ('categorical', categorical_pipeline, categorical_cols)
    ],
    remainder='passthrough'
)

preprocessor


In [None]:
# Separate the target (SalePrice) from the predictor columns.
y = train_data['SalePrice']
X = train_data.drop('SalePrice', axis=1)


---

# Step 4. Model Training & Evaluation

---

In [None]:
# Baseline candidates, keyed by the display name used when reporting scores.
# All estimators use their default hyperparameters; random_state=42 is set on
# ElasticNet and on the ensemble models.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(random_state=42),
    'Bagging Regressor': BaggingRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Extreme Gradient Boosting': XGBRegressor(random_state=42),
}


In [None]:

def train_models(X, y, models, preprocessor, test_size=0.2, random_state=42):
    """Fit every candidate model on one train/validation split and report its MAE.

    Args:
        X: Feature table passed to ``train_test_split``.
        y: Target values aligned with ``X``.
        models: Mapping of display name -> unfitted estimator.
        preprocessor: Transformer placed in front of each estimator. The same
            object is reused (and refitted) for every model.
        test_size: Fraction of the data held out for validation.
        random_state: Seed for the split, so all models see the same rows.

    Returns:
        dict mapping each model name to its validation mean absolute error,
        rounded to one decimal place.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    results = {}
    for name, model in models.items():
        pipeline = make_pipeline(preprocessor, model)
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_val)

        # Use the built-in round() on a plain float: depending on the
        # scikit-learn version the metric returns a Python float, which has
        # no .round() method.
        mae = round(float(mean_absolute_error(y_val, y_pred)), 1)

        print(f'Mean Absolute Error - {name}:\n{mae}')

        results[name] = mae

    return results


# Train and score every baseline model; the MAE of each is printed as it finishes.
results = train_models(X, y, models, preprocessor)
# results

## 4.1 Hyperparameter Tuning

---

In [None]:
# # Random Forest

# def objective(trial, X_train, y_train):
    
#     n_estimators = trial.suggest_int('n_estimators', 50, 200)
#     max_depth = trial.suggest_int('max_depth', 5, 30)
#     min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
#     min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)
#     max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    
#     model = RandomForestRegressor(
#         n_estimators=n_estimators,
#         max_depth=max_depth,
#         min_samples_split=min_samples_split,
#         min_samples_leaf=min_samples_leaf,
#         max_features=max_features,
#         random_state=42
#     )
    
#     pipeline = make_pipeline(preprocessor, model)
    
#     score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean()
    
#     return -score  



# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# study = optuna.create_study(direction='minimize')
# study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=50)

# print("Best hyperparameters: ", study.best_params)

# best_rf = RandomForestRegressor(
#     **study.best_params,
#     random_state=42
# )

# pipeline = make_pipeline(preprocessor, best_rf)

# pipeline.fit(X_train, y_train)

# y_pred = pipeline.predict(X_val)

# val_score = mean_absolute_error(y_val, y_pred).round(1)
# print("Validation Set MAE:", val_score)


In [None]:
# best_rf


In [None]:
# XGBoost

def objective(trial, X_train, y_train):
    """Optuna objective: cross-validated MAE of an XGBRegressor for one trial.

    Samples a hyperparameter set from ``trial``, builds a pipeline from the
    module-level ``preprocessor`` and the resulting XGBRegressor, and scores
    it with 5-fold cross-validation on the given training data.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Training features.
        y_train: Training targets.

    Returns:
        Mean absolute error averaged over the folds (lower is better).
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 20),
        'subsample': trial.suggest_float('subsample', 0.05, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.05, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'random_state': 42
    }

    model = XGBRegressor(**params)

    pipeline = make_pipeline(preprocessor, model)

    # The scorer returns negated MAE (higher is better), so flip the sign to
    # hand Optuna a quantity to minimise.
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean()

    return -score



# Hold out 20% of the labelled data; the search only sees the training part.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the sampler so the hyperparameter search is reproducible between runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=150)

print("Best hyperparameters: ", study.best_params)

# Rebuild the best model with the same random_state that was used inside the
# objective, so the final model matches the configuration that was scored.
best_xgb = XGBRegressor(
    **study.best_params,
    random_state=42,
    device='gpu'
)

pipeline = make_pipeline(preprocessor, best_xgb)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

# Built-in round() on a plain float: depending on the scikit-learn version the
# metric returns a Python float, which has no .round() method.
val_score = round(float(mean_absolute_error(y_val, y_pred)), 1)
print("Validation Set MAE:", val_score)


In [None]:
# Display the tuned XGBoost estimator and its parameters.
best_xgb


## 4.2 Stacking and Voting

---

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# base_learners = [
#     ('Random Forest', best_rf),
#     ('Extreme Gradient Boosting', best_xgb)
# ]

# stacking_regressor = StackingRegressor(
#     estimators=base_learners,
#     final_estimator=Ridge()
# )

# pipeline = make_pipeline(preprocessor, stacking_regressor)

# pipeline.fit(X_train, y_train)

# y_pred = pipeline.predict(X_val)

# mae = mean_absolute_error(y_val, y_pred).round(1)
# print(f"Mean Absolute Error: {mae}")

# stacking_regressor


In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# base_learners = [
#     ('Random Forest', best_rf),
#     ('Extreme Gradient Boosting', best_xgb)
# ]

# voting_regressor = VotingRegressor(estimators=base_learners)

# pipeline = make_pipeline(preprocessor, voting_regressor)

# pipeline.fit(X_train, y_train)

# y_pred = pipeline.predict(X_val)

# mae = mean_absolute_error(y_val, y_pred).round(1)
# print(f"Mean Absolute Error: {mae}")

# voting_regressor


---

# Step 5. Model Prediction on Test Data

---

In [None]:
# pipeline = make_pipeline(preprocessor, best_rf)
# pipeline = make_pipeline(preprocessor, stacking_regressor)
# pipeline = make_pipeline(preprocessor, voting_regressor)
pipeline = make_pipeline(preprocessor, best_xgb)

# Fit explicitly on the complete labelled set before predicting. Previously
# this cell relied on the steps having been fitted in place by an earlier
# cell, and only on the 80% training split.
pipeline.fit(X, y)

test_predictions = pipeline.predict(test_data)


In [None]:
# Start from the sample submission and overwrite its SalePrice column with
# the model's predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv - confirm.
submission = pd.read_csv(
    '/kaggle/input/home-data-for-ml-course/sample_submission.csv'
).assign(SalePrice=test_predictions)
submission.to_csv('submission.csv', index=False)

submission.head()
