# Data Preprocessing
This notebook handles:
- Splitting the Dataset
- Scaling Numerical Features
- Encoding Categorical Features

In [6]:
import pandas as pd

# Load the feature-engineered housing dataset and convert SaleDate from text to
# datetime64 in the same expression, then preview the first rows.
df = pd.read_csv("../data/ENGINEERED_Melbourne_Housing_Market.csv").assign(
    SaleDate=lambda frame: pd.to_datetime(frame["SaleDate"])
)
df.head()

Unnamed: 0,SaleDate,CouncilArea,RealEstateAgent,RegionName,SaleMethod,StreetName,StreetType,Suburb,UnitType,AvgRoomSize,...,DistanceToCBD,LandSize,Latitude,Longitude,NeighbouringProperties,Postcode,PropertyAge,Rooms,YearBuilt,Price
0,2016-03-12,Yarra_City,Biggin,Northern_Metropolitan,Sold,Turner,Street,Abbotsford,House,66.5,...,2.5,202.0,-37.7996,144.9984,4019.0,3067.0,46.0,2.0,1970.0,1480000.0
1,2016-04-02,Yarra_City,Biggin,Northern_Metropolitan,Sold,Bloomburg,Street,Abbotsford,House,39.5,...,2.5,156.0,-37.8079,144.9934,4019.0,3067.0,116.0,2.0,1900.0,1035000.0
2,2017-04-03,Yarra_City,Biggin,Northern_Metropolitan,Sold_Prior,Charles,Street,Abbotsford,House,50.0,...,2.5,134.0,-37.8093,144.9944,4019.0,3067.0,117.0,3.0,1900.0,1465000.0
3,2017-04-03,Yarra_City,Biggin,Northern_Metropolitan,Passed_In,Federation,Lane,Abbotsford,House,44.333333,...,2.5,94.0,-37.7969,144.9969,4019.0,3067.0,47.0,3.0,1970.0,850000.0
4,2016-04-06,Yarra_City,Nelson,Northern_Metropolitan,Vendor_Bid,Park,Street,Abbotsford,House,35.5,...,2.5,120.0,-37.8072,144.9941,4019.0,3067.0,2.0,4.0,2014.0,1600000.0


## Splitting the Dataset

In [7]:
from sklearn.model_selection import train_test_split

# Price is the regression target; every other column is a candidate predictor.
X = df.drop(columns=["Price"])
y = df["Price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((21797, 24), (5450, 24), (21797,), (5450,))

## Scaling Numerical Features
## Encoding Categorical Features

In [8]:
from utils.exploratory_data_analysis import get_custom_description

# Per-column summary of the full dataframe (rows from both splits): the displayed table
# reports Cardinality / Majority / MajorityPercentage for the categorical columns and
# Min / Max / Range / Skewness / IQR / IQROutliers / Kurtosis for the remaining ones.
# Presumably the cardinalities motivate the one-hot vs. ordinal choice made in the
# encoding cell that follows — confirm against utils.exploratory_data_analysis.
get_custom_description(df)

Unnamed: 0,SaleDate,CouncilArea,RealEstateAgent,RegionName,SaleMethod,StreetName,StreetType,Suburb,UnitType,AvgRoomSize,...,DistanceToCBD,LandSize,Latitude,Longitude,NeighbouringProperties,Postcode,PropertyAge,Rooms,YearBuilt,Price
Cardinality,,33.0,345.0,8.0,5.0,6724.0,83.0,343.0,3.0,,...,,,,,,,,,,
Majority,,Boroondara_City,Nelson,Southern_Metropolitan,Sold,The,Street,Reservoir,House,,...,,,,,,,,,,
MajorityPercentage,,0.09,0.1,0.31,0.64,0.01,0.49,0.03,0.68,,...,,,,,,,,,,
Min,2016-01-28 00:00:00,,,,,,,,,0.0,...,0.0,0.0,-38.19,144.42,83.0,3000.0,-2.0,1.0,1196.0,85000.0
Max,2018-10-03 00:00:00,,,,,,,,,8903.0,...,48.1,433014.0,-37.4,145.53,21650.0,3978.0,821.0,16.0,2019.0,11200000.0
Range,979 days 00:00:00,,,,,,,,,8903.0,...,48.1,433014.0,0.79,1.1,21567.0,978.0,823.0,15.0,823.0,11115000.0
Skewness,,,,,,,,,,115.04,...,1.48,111.33,-0.42,-0.44,1.02,3.98,2.19,0.51,-2.19,2.59
IQR,378 days 00:00:00,,,,,,,,,23.33,...,7.6,241.0,0.08,0.09,6118.0,107.0,2.0,2.0,0.0,660000.0
IQROutliers,,,,,,,,,,1127.0,...,1188.0,992.0,1723.0,2210.0,727.0,794.0,10592.0,23.0,10890.0,1278.0
Kurtosis,,,,,,,,,,16411.85,...,3.48,15040.91,2.72,2.79,0.92,21.95,38.8,2.81,38.55,13.1


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

# Route every column to a transformer by dtype and, for string columns, by cardinality.
# NOTE(review): SaleDate is datetime64, so it matches neither selector below and is
# discarded by the ColumnTransformer (its default is remainder="drop") — confirm intended.
numerical_features = X.select_dtypes(include="number").columns
categorical_cardinality = X.select_dtypes(include="object").nunique()
low_card_cat_features = categorical_cardinality[categorical_cardinality <= 10].index.tolist()
high_card_cat_features = categorical_cardinality[categorical_cardinality > 10].index.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric columns to zero mean and unit variance.
        ("scaler", StandardScaler(), numerical_features),
        # At most 10 distinct values: one-hot encode; categories unseen at fit time
        # are encoded as all zeros instead of raising.
        ("low_card_encoder", OneHotEncoder(handle_unknown="ignore"), low_card_cat_features),
        # More than 10 distinct values: integer codes; categories unseen at fit time map to -1.
        ("high_card_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), high_card_cat_features),
    ]
)

# Fit on the training split only, then apply the fitted transformer to the test split.
# X_train / X_test are rebound to the transformed arrays here, so the split cell must be
# re-run before this cell can be executed again.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
X_train.shape, X_test.shape

((21797, 36), (5450, 36))

In [13]:
import joblib

# Persist the fitted preprocessor so the exact same transformation can be re-applied later.
# joblib.dump returns the list of files it wrote; the destination is the path shown in
# this cell's recorded output ('../utils/preprocessor.pkl').
joblib.dump(preprocessor, "../utils/preprocessor.pkl")

['../utils/preprocessor.pkl']

## Saving the Train and Test Data

In [14]:
# Re-attach the transformer-generated column names to the transformed feature arrays
# and wrap each target series as a one-column dataframe.
feature_names = preprocessor.get_feature_names_out()
X_train_df, X_test_df = (
    pd.DataFrame(split, columns=feature_names) for split in (X_train, X_test)
)
y_train_df, y_test_df = y_train.to_frame(), y_test.to_frame()
X_train_df.shape, X_test_df.shape, y_train_df.shape, y_test_df.shape

((21797, 36), (5450, 36), (21797, 1), (5450, 1))

In [15]:
from pathlib import Path

# Write the processed splits to disk. The output directory is created first so that a
# fresh checkout (where ../data/split_data may not exist yet) survives Restart & Run All;
# DataFrame.to_csv raises when the parent directory is missing.
split_dir = Path("../data/split_data")
split_dir.mkdir(parents=True, exist_ok=True)

# Features keep their header row (the transformer-generated column names).
X_train_df.to_csv(split_dir / "X_train.csv", index=False)
X_test_df.to_csv(split_dir / "X_test.csv", index=False)

# NOTE(review): targets are written WITHOUT a header row, unlike the feature files;
# whoever reads them back must pass header=None or the first price is consumed as the
# column name — confirm against the downstream loader.
y_train_df.to_csv(split_dir / "y_train.csv", header=False, index=False)
y_test_df.to_csv(split_dir / "y_test.csv", header=False, index=False)