# Melbourne Housing Market

House Prices Prediction

The data is from Kaggle and can be found [here](https://www.kaggle.com/anthonypino/melbourne-housing-market)

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Define functions

In [2]:
def mean_encode(df: pd.DataFrame, feature: str, target: str):
    """
    df: dataframe with "feature" and "target" columns
    feature: feature to transform
    target: target variable
    """
    new_col_name = feature + "_mean"
    df_enc = (
        # select columns
        df.loc[:, [feature, target]]
        # group by feature
        .groupby(feature)
        # aggregate over feature using target mean
        .agg(tmp_name=(target, np.mean))
        # index (i.e., feature categories) as a column
        .reset_index()
        # rename the column with the aggregated means
        .rename(columns={"tmp_name":new_col_name})
    )

    # merge: add the new column with the aggregated mean from
    # "df_enc" back into "df"
    df_merged = pd.merge(df, df_enc, how="left", on=feature)

    return df_merged


def mean_encode_2(df: pd.DataFrame, feature1: str, feature2: str, target: str):
    """
    Same as "mean_encode" but with 2 features.
    df: dataframe with "feature" and "target" columns
    feature: feature to transform
    target: target variable
    """
    new_col_name = feature1 + feature2 + "_mean"
    df_enc = (
        # select columns
        df.loc[:, [feature1, feature2, target]]
        # group by feature
        .groupby([feature1, feature2])
        # aggregate over feature using target mean
        .agg(tmp_name = (target, np.mean))
        # index (i.e., feature categories) as a column
        .reset_index()
        # rename the column with the aggregated means
        .rename(columns={"tmp_name":new_col_name})
        )

    # merge: add the new column with the aggregated mean from
    # "df_enc" back into "df"
    df_merged = pd.merge(df, df_enc, how="left", on=[feature1, feature2])

    return df_merged


def convert_to_others(df: pd.DataFrame, feature: str, N_counts: int):
    """
    df: data frame
    feature: feature to transform
    N_counts: categories with less than "N_counts" counts are converted to "others" 
    """

    df_count = (
        df
        .groupby(feature)[feature]
        .value_counts()
        .reset_index()
    )

    # Select the categories with less than N_counts
    df_other = df_count.loc[df_count["count"] < N_counts, feature]

    # Name for the new column with some categories converted to "others"
    new_col_name = feature + "_others"

    # Copy original column
    df[new_col_name] = df[feature]
    # Categories with less than "N_counts" counts (this info is #
    # stored in the data frame "df_other") are set to "others"
    df.loc[df[feature].isin(df_other), [new_col_name]] = "others"

    return df

# Load data

In [3]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Read the dataset csv file as a pandas dataframe."""
    return pd.read_csv(path / filename)

# Load dataset
dataset_path = Path().absolute() / "data"
filename = "X_y_train.csv"
X_y_train = load_ds(dataset_path, filename)

print(f"Shape: {X_y_train.shape}")

Shape: (397900, 18)


In [4]:
X_y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397900 entries, 0 to 397899
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Date                       397900 non-null  object 
 1   Store                      397900 non-null  float64
 2   DayOfWeek                  386039 non-null  float64
 3   Sales                      397900 non-null  float64
 4   Customers                  386030 non-null  float64
 5   Open                       385880 non-null  float64
 6   Promo                      386000 non-null  float64
 7   StateHoliday               385848 non-null  object 
 8   SchoolHoliday              385817 non-null  float64
 9   StoreType                  397900 non-null  object 
 10  Assortment                 397900 non-null  object 
 11  CompetitionDistance        396864 non-null  float64
 12  CompetitionOpenSinceMonth  271565 non-null  float64
 13  CompetitionOpenSinceYear   27

# Preprocessing pipeline

In [5]:

# TransformerMixin: add method ".fit_transform()"
# BaseEstimator: add methods ".get_params()" and ".set_params()"
# We need 3 methods:
# 1) .fit()
# 2) .transform()
# 3) .fit_transform() (provided by "TransformerMixin")
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # avoid "*args" or "**kargs" in "__init__"
    def __init__(self):
        pass

    # fit is needed later for the pipilene
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Since I use MEAN ENCODING, "X" must include
        # the terget variable. Below, just before returning
        # the transformed X, the target variable is dropped.

        # Date
        Date_2 = pd.to_datetime(X["Date"], format="%Y-%m-%d")
        X["Month"] = Date_2.dt.month
        X = mean_encode(X, "Month", "Sales")
        # drop: "Date" and "Month"

        # Store
        X = mean_encode(X, "Store", "Sales")
        # drop: "Store"

        # DayOfWeek
        X = mean_encode(X, "DayOfWeek", "Sales")
        # drop: "DayOfWeek"

        # Promo (separately for each Store)
        X = mean_encode_2(X, "Promo", "Store", "Sales")
        # drop: "Promo" and "Store"

        # SchoolHoliday
        X.loc[X.SchoolHoliday=="0", :] = 0.0
        # keep: "SchoolHoliday"

        # StoreType: keep, no transformation

        # Assortment: keep, no transformation

        # Promo2: keep, no transformation

        # CompetitionDistance
        nb = 10 # number of bins
        clip_upper = 10000
        X["CD_clip"] = X["CompetitionDistance"].clip(upper=clip_upper)
        CD_clip_bins = pd.cut(
            X["CD_clip"],
            bins=nb,
            labels=[i for i in range(nb)])
        X['CD_clip_bins'] = pd.to_numeric(CD_clip_bins)
        X["CD_clip_bins_clip"] = X["CD_clip_bins"].clip(upper=clip_upper) # 
        # drop: "CompetitionDistance", "CD_clip", "CD_clip_bins"

        # Drop unused columns
        cols_to_drop = [
            "Date", "Month", "Store", "DayOfWeek", "Customers", "Open", "Promo",
            "StateHoliday", "CompetitionDistance", "CD_clip", "CD_clip_bins",
            "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2SinceWeek",
            "Promo2SinceYear", "PromoInterval"]
        X.drop(columns=cols_to_drop, inplace=True)

        # Drop the target
        target_to_drop = ["Sales"]
        X.drop(columns=target_to_drop, inplace=True)

        return X

In [6]:
caa = CombinedAttributesAdder()
new_x = caa.fit_transform(X_y_train)
new_x.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397900 entries, 0 to 397899
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SchoolHoliday      385817 non-null  float64
 1   StoreType          397900 non-null  object 
 2   Assortment         397900 non-null  object 
 3   Promo2             397900 non-null  int64  
 4   Month_mean         397900 non-null  float64
 5   Store_mean         397900 non-null  float64
 6   DayOfWeek_mean     386039 non-null  float64
 7   PromoStore_mean    386000 non-null  float64
 8   CD_clip_bins_clip  396864 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 27.3+ MB


In [7]:
(new_x.isna().sum() / new_x.shape[0]).reset_index()

Unnamed: 0,index,0
0,SchoolHoliday,0.030367
1,StoreType,0.0
2,Assortment,0.0
3,Promo2,0.0
4,Month_mean,0.0
5,Store_mean,0.0
6,DayOfWeek_mean,0.029809
7,PromoStore_mean,0.029907
8,CD_clip_bins_clip,0.002604


In [8]:
# Numerical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
num_pipeline = Pipeline([
    # replace NA with mean
    ('imputer', SimpleImputer(strategy='mean')),
    # standardize the variables: z = (x - mean) / SD
    ('std_scaler', StandardScaler())])

In [9]:
# Categorical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
cat_pipeline = Pipeline([
    # replace NA with mode
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # apply "OneHotEncoder()"
    ('one_hot', OneHotEncoder(drop='if_binary'))])



In [10]:
list_num_attribs = ["SchoolHoliday", "Promo2", "Month_mean", "Store_mean",
                    "DayOfWeek_mean", "PromoStore_mean", "CD_clip_bins_clip"]
list_cat_attribs = ["StoreType", "Assortment"]

In [11]:
# ColumnTransformer requires tuples with:
# - a name
# - a transformer
# - a list of names (or indices) of columns to which the transformer is applied

cols_transformer = ColumnTransformer([
    # apply "num_pipeline" to numerical columns
    ('num', num_pipeline, list_num_attribs),
    # apply "cat_pipeline" to categorical columns
    ('cat', cat_pipeline, list_cat_attribs)])

In [12]:
full_pipeline = Pipeline([
    # transform/add columns
    ('attribs_adder', CombinedAttributesAdder()),
    # Transform numerical and categorical attributes
    ("cols_transformer", cols_transformer)])

# Linear regression

In [13]:
X_train = X_y_train # include "Sales", CombinedAttributesAdder() drops it
y_train = X_y_train.loc[:, "Sales"].copy()

## Linear regression

In [14]:
lm = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Random forest
    ("lm", LinearRegression())])

In [15]:
lm.fit(X_train, y_train)

In [16]:
lm.score(X_train, y_train)

0.7899802538333073

In [17]:
y_pred = lm.predict(X_train)

In [18]:
mean_squared_error(y_train, y_pred, squared=False)

1414.4274616671228

## Cross Validation

In [19]:
lm_2 = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Decision Tree (defaul parameters)
    ("lm", LinearRegression())])

In [20]:
cross_val_score(lm_2, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)

array([-1411.36652748, -1397.77531321, -1401.90459745, -1394.05434212,
       -1406.99373782])

# Random Forest

In [23]:
rf = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Random forest
    ("rf", RandomForestRegressor(random_state=123))])

In [24]:
rf.fit(X_train, y_train)

In [25]:
rf.score(X_train, y_train)

0.9452213719925228

In [26]:
y_pred_rf = rf.predict(X_train)

In [27]:
mean_squared_error(y_train, y_pred_rf, squared=False)

722.3641601276025

## Cross validation

In [21]:
rf_2 = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Decision Tree (defaul parameters)
    ("rf", RandomForestRegressor(random_state=123))])

In [22]:
cross_val_score(rf_2, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)

KeyboardInterrupt: 

In [88]:
y_test_pred_rf = rf.predict(X_test)
mean_squared_error(y_test, y_test_pred_rf, squared=False)

283807.8438618152