import libraries

In [9]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn.tree import export_graphviz
import graphviz
import joblib
import os

In [10]:
df = pd.read_csv("/Users/supimraid/Desktop/DAC INTERNAL/Internal-ProjectsUpdated/Datasets/merged_df.csv", low_memory=False)


print(df.head())

        month        town flat_type block_no  storey_range  floor_area_sqm  \
0  2017-01-01  ang mo kio    2 ROOM      406            11            44.0   
1  2017-01-01  ang mo kio    3 ROOM      108             2            67.0   
2  2017-01-01  ang mo kio    3 ROOM      602             2            67.0   
3  2017-01-01  ang mo kio    3 ROOM      465             5            68.0   
4  2017-01-01  ang mo kio    3 ROOM      601             2            67.0   

       flat_model lease_commence_date     remaining_lease  resale_price  ...  \
0        Improved          1979-01-01  61 years 04 months      232000.0  ...   
1  New Generation          1978-01-01  60 years 07 months      250000.0  ...   
2  New Generation          1980-01-01  62 years 05 months      262000.0  ...   
3  New Generation          1980-01-01   62 years 01 month      265000.0  ...   
4  New Generation          1980-01-01  62 years 05 months      265000.0  ...   

   latitude   longitude           street_name post

categorizing the data

In [11]:
categorical_cols = [
    "town",
    "flat_type",
    "storey_range",
    "flat_model",
    "street_name",
    "nearest_mrt_station",
    "nearest_mall",
]

label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Convert datetime columns to Unix timestamps (seconds since epoch)
# This standardizes the date format and makes it usable for the model


In [12]:
df["lease_commence_date"] = (
    pd.to_datetime(df["lease_commence_date"]).astype(np.int64) // 10**9
)

X = df.drop(
    ["resale_price", "month", "address", "postal_code", "block_no", "remaining_lease"],
    axis=1,
)
y = df["resale_price"]

print(X.head())

   town  flat_type  storey_range  floor_area_sqm  flat_model  \
0     0          1             3            44.0           5   
1     0          2             0            67.0          12   
2     0          2             0            67.0          12   
3     0          2             1            68.0          12   
4     0          2             0            67.0          12   

   lease_commence_date  latitude   longitude  street_name  \
0            283996800  1.362005  103.853880           13   
1            252460800  1.370966  103.838202           16   
2            315532800  1.380709  103.835368           17   
3            315532800  1.366201  103.857201           13   
4            315532800  1.381041  103.835132           17   

   nearest_mrt_station  nearest_mrt_distance_km  nearest_mall  \
0                    1                 1.011433             2   
1                    1                 1.270031            11   
2                  120                 1.068607      

In [13]:
# Normalizing the encoded data
scaler = StandardScaler()
X = scaler.fit_transform(X)

test_dataframe = pd.DataFrame(X)

print(test_dataframe.head())

# test and train split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Decision Tree Regressor
dtr = DecisionTreeRegressor()

# hyperparameters
param_grid = {
    "max_depth": [2, 5, 10, 15, 20, 22],
    "min_samples_split": [2, 3, 4, 5],
    "min_samples_leaf": [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20],
    "max_features": ["auto", "sqrt", "log2"],
}

         0         1         2         3         4         5         6   \
0 -1.765066 -2.305364  0.377999 -2.205395 -1.007286 -1.214336 -0.141272   
1 -1.765066 -1.218420 -1.137970 -1.248992  0.931604 -1.285075  0.067823   
2 -1.765066 -1.218420 -1.137970 -1.248992  0.931604 -1.143598  0.295133   
3 -1.765066 -1.218420 -0.632647 -1.207409  0.931604 -1.143598 -0.043360   
4 -1.765066 -1.218420 -1.137970 -1.248992  0.931604 -1.143598  0.302891   

         7         8         9         10        11        12        13  \
0  0.176576 -1.573809 -1.711319  0.806775 -1.649323  0.880677 -0.889921   
1 -0.044069 -1.556499 -1.711319  1.409240 -1.381704  0.518500 -0.633896   
2 -0.083949 -1.550729  1.384552  0.939975 -1.381704  2.247837 -0.376732   
3  0.223315 -1.573809 -1.711319  0.656890  1.651304  0.583806 -0.778839   
4 -0.087277 -1.550729  1.384552  0.993059 -1.381704  2.362805 -0.367324   

         14  
0 -0.928129  
1 -0.999336  
2 -0.856923  
3 -0.856923  
4 -0.856923  


In [14]:
# gridsearchcv
print("\nStarting Grid Search...")
start_time = time()

# Add scoring metrics to track
scoring = {"r2": "r2", "neg_mean_squared_error": "neg_mean_squared_error"}
grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=param_grid,
    cv=5, #5-fold cross-validation
    scoring=scoring,
    refit="r2", #only use r2 score for best model
    verbose=0,  # Change to 0 to minimize output
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)

# Print only the essential information
print(f"\nGrid Search completed in {(time() - start_time):.2f} seconds")


Starting Grid Search...

Grid Search completed in 51.56 seconds


In [15]:
# Print detailed results
print("\nGrid Search Results:")
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# evalution metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Calculate percentage errors
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("\nError Metrics:")
print(f"Mean squared error: ${mse:,.2f}")
print(f"Mean Absolute Error: ${mae:,.2f}")
print(f"Root Mean squared error: ${rmse:,.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}%")
print(f"R-squared: {r2:.4f}")


Grid Search Results:

Error Metrics:
Mean squared error: $2,540,968,383.80
Mean Absolute Error: $35,690.48
Root Mean squared error: $50,408.02
Mean Absolute Percentage Error: 7.20%
R-squared: 0.9203


Saves the best model to use for predicting values

In [16]:
# Save the best model and preprocessing components
print("\nSaving model and preprocessing components...")
model_dir = "../model/"
os.makedirs(model_dir, exist_ok=True)

# Save the model
joblib.dump(best_model, f"{model_dir}best_model.joblib")

# Save the preprocessing components
joblib.dump(label_encoders, f"{model_dir}label_encoders.joblib")
joblib.dump(scaler, f"{model_dir}scaler.joblib")
joblib.dump(categorical_cols, f"{model_dir}categorical_cols.joblib")

# Create and save a list of feature names in correct order
feature_names = (
    X.columns
    if isinstance(X, pd.DataFrame)
    else df.drop(
        [
            "resale_price",
            "month",
            "address",
            "postal_code",
            "block_no",
            "remaining_lease",
        ],
        axis=1,
    ).columns
)
joblib.dump(feature_names, f"{model_dir}feature_names.joblib")


print("\nModel and preprocessing components saved successfully!")

# Plot the decision tree
# Create dot data
dot_data = export_graphviz(
    best_model,
    out_file=None,
    feature_names=df.drop(
        [
            "resale_price",
            "month",
            "address",
            "postal_code",
            "block_no",
            "remaining_lease",
        ],
        axis=1,
    ).columns,
    filled=True,
    rounded=True,
    special_characters=True,
    max_depth=3,  # Limit depth for visibility
)

# Create and save the graph
graph = graphviz.Source(dot_data)
graph.render(
    "decision_tree_visualization",
    directory=model_dir,
    format="pdf",
    cleanup=True,
)
print(
    "\nDecision tree visualization has been saved as 'decision_tree_visualization.pdf'"
)


Saving model and preprocessing components...

Model and preprocessing components saved successfully!

Decision tree visualization has been saved as 'decision_tree_visualization.pdf'
