# Data Preprocessing
This notebook handles:
- Splitting the Dataset
- Scaling Numerical Features
- Encoding Categorical Features

In [8]:
import pandas as pd

df = pd.read_csv("../data/ANALYSED_Melbourne_Housing_Market.csv")
df["SaleDate"] = pd.to_datetime(df["SaleDate"])
df.head()

Unnamed: 0,SaleDate,CouncilArea,RealEstateAgent,RegionName,Suburb,AvgRoomSize,Bathrooms,Bedrooms,BuildingArea,BuildingToLandRatio,...,Postcode,PropertyAge,Rooms,SaleDay,SaleDayOfWeek,SaleMonth,SaleQuarter,SaleYear,YearBuilt,Price
0,2016-03-12,Yarra_City,Biggin,Northern_Metropolitan,Other,66.5,1.0,2.0,133.0,0.658416,...,3067.0,46.0,2.0,12,5,3,1,2016,1970.0,1480000.0
1,2017-04-03,Yarra_City,Biggin,Northern_Metropolitan,Other,44.333333,2.0,3.0,133.0,1.414894,...,3067.0,47.0,3.0,3,0,4,2,2017,1970.0,850000.0
2,2016-04-06,Yarra_City,Nelson,Northern_Metropolitan,Other,35.5,1.0,3.0,142.0,1.183333,...,3067.0,2.0,4.0,6,2,4,2,2016,2014.0,1600000.0
3,2016-07-05,Yarra_City,Jellis,Northern_Metropolitan,Other,66.5,1.0,2.0,133.0,0.734807,...,3067.0,46.0,2.0,5,1,7,3,2016,1970.0,941000.0
4,2016-07-05,Yarra_City,Nelson,Northern_Metropolitan,Other,70.0,2.0,4.0,210.0,0.857143,...,3067.0,106.0,3.0,5,1,7,3,2016,1910.0,1876000.0


## Splitting the Dataset

In [9]:
from sklearn.model_selection import train_test_split

y = df["Price"]
X = df.drop(columns=["Price"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((18001, 25), (4501, 25), (18001,), (4501,))

## Scaling Numerical Features
## Encoding Categorical Features

In [10]:
from utils.exploratory_data_analysis import get_custom_description

get_custom_description(df)

Unnamed: 0,SaleDate,CouncilArea,RealEstateAgent,RegionName,Suburb,AvgRoomSize,Bathrooms,Bedrooms,BuildingArea,BuildingToLandRatio,...,Postcode,PropertyAge,Rooms,SaleDay,SaleDayOfWeek,SaleMonth,SaleQuarter,SaleYear,YearBuilt,Price
Cardinality,,28.0,33.0,8.0,32.0,,,,,,...,,,,,,,,,,
Majority,,Boroondara_City,Other,Southern_Metropolitan,Other,,,,,,...,,,,,,,,,,
MajorityPercentage,,0.09,0.16,0.33,0.64,,,,,,...,,,,,,,,,,
Min,2016-01-28 00:00:00,,,,,0.0,0.0,1.0,0.0,0.0,...,3000.0,-2.0,1.0,1.0,0.0,1.0,1.0,2016.0,1906.0,85000.0
Max,2018-10-03 00:00:00,,,,,207.0,3.0,5.0,286.0,3.59,...,3338.0,111.0,5.0,30.0,6.0,12.0,4.0,2018.0,2019.0,2770000.0
Range,979 days 00:00:00,,,,,207.0,3.0,4.0,286.0,3.59,...,338.0,113.0,4.0,29.0,6.0,11.0,3.0,2.0,113.0,2685000.0
Skewness,,,,,,2.29,1.16,-0.04,0.67,2.21,...,0.25,0.33,0.05,-0.09,-1.32,-0.15,-0.17,0.17,-0.34,1.08
IQR,378 days 00:00:00,,,,,24.0,1.0,0.0,0.0,0.18,...,105.0,2.0,1.0,15.0,1.0,6.0,2.0,1.0,0.0,609000.0
IQROutliers,,,,,,901.0,0.0,8880.0,7806.0,3386.0,...,19.0,7808.0,722.0,0.0,4907.0,0.0,0.0,0.0,8045.0,656.0
Kurtosis,,,,,,7.42,0.42,0.91,5.39,7.15,...,-1.1,2.27,-0.2,-1.12,0.22,-1.17,-1.23,-0.62,2.28,0.97


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

numerical_features = X.select_dtypes(include="number").columns
low_card_cat_features = [column for column in X.select_dtypes(include="object").columns if X[column].nunique() <= 10]
high_card_cat_features = [column for column in X.select_dtypes(include="object").columns if X[column].nunique() > 10]

preprocessor = ColumnTransformer(transformers=[
    ("scaler", StandardScaler(), numerical_features),
    ("low_card_encoder", OneHotEncoder(handle_unknown="ignore"), low_card_cat_features),
    ("high_card_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), high_card_cat_features)
])
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train.shape, X_test.shape

((18001, 31), (4501, 31))

In [12]:
import joblib

joblib.dump(preprocessor, "../raw/preprocessor.pkl")

['../raw/preprocessor.pkl']

## Train and Test Data Saving

In [13]:
X_train_df = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
X_test_df = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())
y_train_df = pd.DataFrame(y_train)
y_test_df = pd.DataFrame(y_test)
X_train_df.shape, X_test_df.shape, y_train_df.shape, y_test_df.shape

((18001, 31), (4501, 31), (18001, 1), (4501, 1))

In [14]:
X_train_df.to_csv("../data/split_data/X_train.csv", index=False)
X_test_df.to_csv("../data/split_data/X_test.csv", index=False)
y_train_df.to_csv("../data/split_data/y_train.csv", header=False, index=False)
y_test_df.to_csv("../data/split_data/y_test.csv", header=False, index=False)