# Melbourne Housing Market

House Prices Prediction

The data is from Kaggle and can be found [here](https://www.kaggle.com/anthonypino/melbourne-housing-market)

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from joblib import dump
import pickle

# Define functions

In [2]:
def convert_to_others(df: pd.DataFrame, feature: str, N_counts: int):
    """
    df: data frame
    feature: feature to transform
    N_counts: categories with less than "N_counts" counts are converted to "others" 
    """

    df_count = (
        df
        .groupby(feature)[feature]
        .value_counts()
        .reset_index()
    )

    # Select the categories with less than N_counts
    df_other = df_count.loc[df_count["count"] < N_counts, feature]

    # Name for the new column with some categories converted to "others"
    new_col_name = feature + "_others"

    # Copy original column
    df[new_col_name] = df[feature]
    # Categories with less than "N_counts" counts (this info is #
    # stored in the data frame "df_other") are set to "others"
    df.loc[df[feature].isin(df_other), [new_col_name]] = "others"

    return df


def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])


def RMSPE(y, y_pred):
     rmspe = np.sum(((y - y_pred) / y)**2) / len(y)
     return rmspe


def print_best_model_metrics(gs, X, y):
    """"
    gs: fitted GridSearch object
    X: DataFrame with features
    y: actual target
    """
    print(f"Best parameters:\n{gs.best_params_}")
    print(f"\nBest score: {gs.best_score_:.3f}")
    print(f"RMSE: {np.sqrt(-1*gs.best_score_):.3f}")
    score = gs.score(X, y)
    print(f"\nneg_mean_squared_error on the full train set: {score:.3f}")
    print(f"RMSE on the full train set: {np.sqrt(-1*score):.3f}")
    y_pred = gs.predict(X)
    print(f"\nMean squared error = {mean_squared_error(y, y_pred, squared=False):.2f}")
    print(f"Root Mean Square Percentage Error: {RMSPE(y, y_pred):.2f}")

# Load data

In [3]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Read the dataset csv file as a pandas dataframe."""
    return pd.read_csv(path / filename)

# Load dataset
dataset_path = Path().absolute() / "data"
filename = "X_y_train.csv"
X_y_train = load_ds(dataset_path, filename)

print(f"Shape: {X_y_train.shape}")

Shape: (397900, 18)


In [4]:
X_y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397900 entries, 0 to 397899
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Date                       397900 non-null  object 
 1   Store                      397900 non-null  float64
 2   DayOfWeek                  386039 non-null  float64
 3   Sales                      397900 non-null  float64
 4   Customers                  386030 non-null  float64
 5   Open                       385880 non-null  float64
 6   Promo                      386000 non-null  float64
 7   StateHoliday               385848 non-null  object 
 8   SchoolHoliday              385817 non-null  float64
 9   StoreType                  397900 non-null  object 
 10  Assortment                 397900 non-null  object 
 11  CompetitionDistance        396864 non-null  float64
 12  CompetitionOpenSinceMonth  271565 non-null  float64
 13  CompetitionOpenSinceYear   27

# Preprocessing pipeline

In [5]:
# From Jonathan

class MeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.means = {}

    def fit(self, X, y=None):
        for col in self.columns:
            self.means[col] = X.groupby(col, dropna=False)['Sales'].mean().rename(col + 'Mean')
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X = X.merge(self.means[col], on=col)
        return X

In [6]:

# TransformerMixin: add method ".fit_transform()"
# BaseEstimator: add methods ".get_params()" and ".set_params()"
# We need 3 methods:
# 1) .fit()
# 2) .transform()
# 3) .fit_transform() (provided by "TransformerMixin")
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # avoid "*args" or "**kargs" in "__init__"
    def __init__(self):
        self.mean_Month = pd.DataFrame()
        self.mean_Store = pd.DataFrame()
        self.mean_DayOfWeek = pd.DataFrame()
        self.mean_Promo_Store = pd.DataFrame()

    # fit is needed later for the pipilene
    def fit(self, X, y=None):
        # X["target_var"] = y

        # Date
        #self.means[col] = X.groupby(col, dropna=False)['Sales'].mean().rename(col + 'Mean')
        Date_2 = pd.to_datetime(X["Date"], format="%Y-%m-%d")
        X["Month"] = Date_2.dt.month
        #print(X.columns)
        self.mean_Month = self.mean_encode(X, "Month", "Sales")
        #
        ## self.mean_Month = (
        ##     # select columns
        ##     X.loc[:, ["Month", "Sales"]]
        ##     # group by feature
        ##     .groupby("Month")
        ##     # aggregate over feature using target mean
        ##     .agg(Month_mean=("Sales", np.mean))
        ##     # index (i.e., feature categories) as a column
        ##     .reset_index()
        ##     # rename the column with the aggregated means
        ##     #.rename(columns={"tmp_name":new_col_name})
        ## )
        #print(self.mean_Month)

        # Store
        self.mean_Store = self.mean_encode(X, "Store", "Sales")

        # DayOfWeek
        self.mean_DayOfWeek = self.mean_encode(X, "DayOfWeek", "Sales")

        # Promo (separately for each Store)
        self.mean_Promo_Store = self.mean_encode_2(X, "Promo", "Store", "Sales")

        # Drop the target
        #target_to_drop = ["Sales"]
        #X.drop(columns=target_to_drop, inplace=True)

        return self
    
    def transform(self, X):
        # Since I use MEAN ENCODING, "X" must include
        # the terget variable. Below, just before returning
        # the transformed X, the target variable is dropped.

        # Date
        Date_2 = pd.to_datetime(X["Date"], format="%Y-%m-%d")
        X["Month"] = Date_2.dt.month
        X = pd.merge(X, self.mean_Month, how="left", on="Month")
        # drop: "Date" and "Month"

        # Store
        #X = self.mean_encode(X, "Store", "Sales")
        X = pd.merge(X, self.mean_Store, how="left", on="Store")
        # drop: "Store"

        # DayOfWeek
        # X = self.mean_encode(X, "DayOfWeek", "Sales")
        X = pd.merge(X, self.mean_DayOfWeek, how="left", on="DayOfWeek")
        # drop: "DayOfWeek"

        # Promo (separately for each Store)
        # X = self.mean_encode_2(X, "Promo", "Store", "Sales")
        X = pd.merge(X, self.mean_Promo_Store, how="left", on=["Promo", "Store"])
        # drop: "Promo" and "Store"

        # SchoolHoliday
        X.loc[X.SchoolHoliday=="0", :] = 0.0
        # keep: "SchoolHoliday"

        # StoreType: keep, no transformation

        # Assortment: keep, no transformation

        # Promo2: keep, no transformation

        # CompetitionDistance
        nb = 10 # number of bins
        clip_upper = 10000
        X["CD_clip"] = X["CompetitionDistance"].clip(upper=clip_upper)
        CD_clip_bins = pd.cut(
            X["CD_clip"],
            bins=nb,
            labels=[i for i in range(nb)])
        X['CD_clip_bins'] = pd.to_numeric(CD_clip_bins)
        X["CD_clip_bins_clip"] = X["CD_clip_bins"].clip(upper=clip_upper) # 
        # drop: "CompetitionDistance", "CD_clip", "CD_clip_bins"

        # Drop unused columns
        cols_to_drop = [
            "Date", "Month", "Store", "DayOfWeek", "Customers", "Open", "Promo",
            "StateHoliday", "CompetitionDistance", "CD_clip", "CD_clip_bins",
            "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear", "Promo2SinceWeek",
            "Promo2SinceYear", "PromoInterval"]
        X.drop(columns=cols_to_drop, inplace=True)

        # Drop the target
        with_target = sum([col == "Sales" for col in X.columns])
        if with_target > 0:
            target_to_drop = ["Sales"]
            X.drop(columns=target_to_drop, inplace=True)

        return X
    

    def mean_encode(self, df: pd.DataFrame, feature: str, target: str):
        """
        df: dataframe with "feature" and "target" columns
        feature: feature to transform
        target: target variable
        """
        new_col_name = feature + "_mean"
        df_enc = (
            # select columns
            df.loc[:, [feature, target]]
            # group by feature
            .groupby(feature)
            # aggregate over feature using target mean
            .agg(tmp_name=(target, np.mean))
            # index (i.e., feature categories) as a column
            .reset_index()
            # rename the column with the aggregated means
            .rename(columns={"tmp_name":new_col_name})
        )
    
        # merge: add the new column with the aggregated mean from
        # "df_enc" back into "df"
        # df_merged = pd.merge(df, df_enc, how="left", on=feature)
    
        return df_enc # df_merged
        
        
    def mean_encode_2(self, df: pd.DataFrame, feature1: str, feature2: str, target: str):
        """
        Same as "mean_encode" but with 2 features.
        df: dataframe with "feature" and "target" columns
        feature: feature to transform
        target: target variable
        """
        new_col_name = feature1 + feature2 + "_mean"
        df_enc = (
            # select columns
            df.loc[:, [feature1, feature2, target]]
            # group by feature
            .groupby([feature1, feature2])
            # aggregate over feature using target mean
            .agg(tmp_name = (target, np.mean))
            # index (i.e., feature categories) as a column
            .reset_index()
            # rename the column with the aggregated means
            .rename(columns={"tmp_name":new_col_name})
            )
    
        # merge: add the new column with the aggregated mean from
        # "df_enc" back into "df"
        # df_merged = pd.merge(df, df_enc, how="left", on=[feature1, feature2])
    
        return df_enc # df_merged

     

In [7]:
caa = CombinedAttributesAdder()
new_x = caa.fit_transform(X_y_train)
new_x.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397900 entries, 0 to 397899
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SchoolHoliday      385817 non-null  float64
 1   StoreType          397900 non-null  object 
 2   Assortment         397900 non-null  object 
 3   Promo2             397900 non-null  int64  
 4   Month_mean         397900 non-null  float64
 5   Store_mean         397900 non-null  float64
 6   DayOfWeek_mean     386039 non-null  float64
 7   PromoStore_mean    386000 non-null  float64
 8   CD_clip_bins_clip  396864 non-null  float64
dtypes: float64(6), int64(1), object(2)
memory usage: 27.3+ MB


In [8]:
(new_x.isna().sum() / new_x.shape[0]).reset_index()

Unnamed: 0,index,0
0,SchoolHoliday,0.030367
1,StoreType,0.0
2,Assortment,0.0
3,Promo2,0.0
4,Month_mean,0.0
5,Store_mean,0.0
6,DayOfWeek_mean,0.029809
7,PromoStore_mean,0.029907
8,CD_clip_bins_clip,0.002604


In [9]:
# Numerical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
num_pipeline = Pipeline([
    # replace NA with mean
    ('imputer', SimpleImputer(strategy='mean')),
    # standardize the variables: z = (x - mean) / SD
    ('std_scaler', StandardScaler())])

In [10]:
# Categorical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
cat_pipeline = Pipeline([
    # replace NA with mode
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # apply "OneHotEncoder()"
    ('one_hot', OneHotEncoder(drop='if_binary'))])



In [11]:
list_num_attribs = ["SchoolHoliday", "Promo2", "Month_mean", "Store_mean",
                    "DayOfWeek_mean", "PromoStore_mean", "CD_clip_bins_clip"]
list_cat_attribs = ["StoreType", "Assortment"]

In [12]:
# ColumnTransformer requires tuples with:
# - a name
# - a transformer
# - a list of names (or indices) of columns to which the transformer is applied

cols_transformer = ColumnTransformer([
    # apply "num_pipeline" to numerical columns
    ('num', num_pipeline, list_num_attribs),
    # apply "cat_pipeline" to categorical columns
    ('cat', cat_pipeline, list_cat_attribs)])

In [13]:
full_pipeline = Pipeline([
    # transform/add columns
    ('attribs_adder', CombinedAttributesAdder()),
    # Transform numerical and categorical attributes
    ("cols_transformer", cols_transformer)])

# Linear regression

In [14]:
X_train = X_y_train # include "Sales", CombinedAttributesAdder() drops it
y_train = X_y_train.loc[:, "Sales"].copy()

# Random Forest

In [15]:
rf = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Random forest
    ("rf", RandomForestRegressor(random_state=123))])

In [16]:
rf.fit(X_train, y_train)

In [17]:
rf.score(X_train, y_train)

0.9452213719925228

In [18]:
y_pred_rf = rf.predict(X_train)

In [19]:
mean_squared_error(y_train, y_pred_rf, squared=False)

722.3641601276025

In [20]:
np.sqrt(np.sum((y_train -y_pred_rf)**2) / len(y_train))

722.3641601276025

# Test set

In [21]:
# Load dataset
dataset_path = Path().absolute() / "data"
filename = "X_y_test.csv"
X_y_test = load_ds(dataset_path, filename)

print(f"Shape: {X_y_test.shape}")

# X_test = X_y_test.drop("Sales") # include "Sales", CombinedAttributesAdder() drops it
X_test = X_y_test.drop(["Sales"], axis=1)
y_test = X_y_test.loc[:, "Sales"].copy()

print(f"shape X_train: {X_test.shape}")
print(f"shape y_train: {y_test.shape}")

Shape: (99476, 18)
shape X_train: (99476, 17)
shape y_train: (99476,)


In [22]:
def metric(preds, actuals):
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [23]:
y_pred = rf.predict(X_test)

In [24]:
metric(y_pred, y_test)

17.812073957516375

In [25]:
mean_squared_error(y_test, y_pred, squared=False)

1161.8867954288526

In [26]:
RMSPE(y_test, y_pred)

0.03172699786680336

# Save model

In [27]:
# open a file, where you ant to store the data
file = open('models/random_forest_final', 'wb')

# dump information to that file
pickle.dump(rf, file)

# close the file
file.close()