In [43]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# load dataset
df = pd.read_csv('../data/merged_data.csv', converters={'remaining_lease': simplify_remaining_lease})

# preview dataset
df.head()


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,86
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0,86
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0,86
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0,86
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0,85


In [41]:
def simplify_remaining_lease(val):
    try:
        remaining_lease_value = str(val).strip().lower()
        if remaining_lease_value.isdigit():
            return int(remaining_lease_value)
        if 'year' in remaining_lease_value:
            return int(remaining_lease_value.split('year')[0].strip())

    except:
        pass

    return np.nan


In [45]:
df['remaining_lease'].unique()


array([ 86,  85,  87,  88,  93,  89,  94,  90,  91,  95,  81,  92,  82,
        78,  84,  80,  83,  76,  79,  77,  97,  96,  98,  75, 100,  99,
        74,  73,  72, 101,  71,  70,  69,  68,  67,  66,  65,  64,  63,
        62,  61,  60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,
        49,  48,  47,  46,  45,  44,  43,  42,  41,  40])

In [33]:
X = df.drop(columns=['resale_price'])
y = df['resale_price']

# encoding
categorical_columns = ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 'flat_model']
numerical_columns = ['floor_area_sqm', 'lease_commence_date', 'remaining_lease']

# pipeline preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)


In [35]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# split dataset to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) # different test sizes are being tried

pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).

  copying or concatenating `_RemainderColsList`.


In [37]:
# perform prediction
y_prediction = pipeline.predict(X_test)

# evaluation of performance
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_prediction))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_prediction))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_prediction)))
print("R-squared (R²):", r2_score(y_test, y_prediction))


Mean Absolute Error (MAE): 21717.90774039324
Mean Squared Error (MSE): 1083226455.5246894
Root Mean Squared Error (RMSE): 32912.405799708555
R-squared (R²): 0.9654824809284988


Result with test_size=0.2: 
Mean Absolute Error (MAE): 20331.781453556046
Mean Squared Error (MSE): 936648403.9744359
Root Mean Squared Error (RMSE): 30604.712120430668
R-squared (R²): 0.9701746845329499

Result with test_size=0.3:
Mean Absolute Error (MAE): 20679.060814943598
Mean Squared Error (MSE): 966716392.8521676
Root Mean Squared Error (RMSE): 31092.06318101402
R-squared (R²): 0.9691618726336353

Result with test_size=0.5: 
Mean Absolute Error (MAE): 21717.90774039324
Mean Squared Error (MSE): 1083226455.5246894
Root Mean Squared Error (RMSE): 32912.405799708555
R-squared (R²): 0.9654824809284988
