In [1]:
import pandas as pd
import numpy as np

# =========================
# Load clustered suburb data
# =========================
PATH = "../datasets/clean/clustered_suburbs_final.csv"
df = pd.read_csv(PATH)

print("Shape ban đầu:", df.shape)

# =========================
# Chọn feature cho pricing
# =========================
pricing_cols = [
    "Suburb",
    "Cluster",
    "Cluster_Label",
    "Population",
    "Income",
    "Vehicles_per_Dwelling",
    "Public_Chargers",
    "Station_Count",
    "Avg_Congestion"
]

pricing_df = df[pricing_cols].copy()

# =========================
# Feature engineering đơn giản
# =========================
pricing_df["Demand_Index"] = (
    pricing_df["Population"] / pricing_df["Population"].median()
) * (
    pricing_df["Vehicles_per_Dwelling"] / pricing_df["Vehicles_per_Dwelling"].median()
)

pricing_df["Supply_Index"] = pricing_df["Public_Chargers"] + pricing_df["Station_Count"]

pricing_df["Demand_Supply_Ratio"] = pricing_df["Demand_Index"] / (pricing_df["Supply_Index"] + 1)

pricing_df = pricing_df.replace([np.inf, -np.inf], 0)

print("\nPreview pricing data:")
display(pricing_df.head())

pricing_df.to_csv("../datasets/clean/pricing_base.csv", index=False)
print("✅ Saved: pricing_base.csv")


Shape ban đầu: (354, 16)

Preview pricing data:


Unnamed: 0,Suburb,Cluster,Cluster_Label,Population,Income,Vehicles_per_Dwelling,Public_Chargers,Station_Count,Avg_Congestion,Demand_Index,Supply_Index,Demand_Supply_Ratio
0,Abbotsford,2,EV-Ready Suburbs,10294,2333.0,1.6,1,1,0.0,0.625806,2,0.208602
1,Airport West,2,EV-Ready Suburbs,8647,1761.0,1.7,1,1,0.0,0.558534,2,0.186178
2,Albert Park,1,High Population – Infrastructure Gap,17184,2076.0,1.1,0,0,0.0,0.718212,0,0.718212
3,Alphington - Fairfield,0,Affluent Car-Dependent (High Potential),9688,2112.0,1.5,0,0,0.0,0.552155,0,0.552155
4,Altona,2,EV-Ready Suburbs,14232,2123.0,1.8,2,2,0.0,0.973361,4,0.194672


✅ Saved: pricing_base.csv


In [2]:
# =========================
# Simulate rental usage
# =========================
np.random.seed(42)

pricing_df["Simulated_Usage"] = (
    pricing_df["Demand_Supply_Ratio"] * 10
    + pricing_df["Avg_Congestion"] * 0.5
    + np.random.normal(0, 1, len(pricing_df))
)

pricing_df["Simulated_Usage"] = pricing_df["Simulated_Usage"].clip(lower=0)

display(
    pricing_df[["Suburb", "Cluster_Label", "Simulated_Usage"]].head()
)

pricing_df.to_csv("../datasets/clean/pricing_with_usage.csv", index=False)
print("✅ Saved: pricing_with_usage.csv")


Unnamed: 0,Suburb,Cluster_Label,Simulated_Usage
0,Abbotsford,EV-Ready Suburbs,2.582734
1,Airport West,EV-Ready Suburbs,1.723517
2,Albert Park,High Population – Infrastructure Gap,7.829806
3,Alphington - Fairfield,Affluent Car-Dependent (High Potential),7.044579
4,Altona,EV-Ready Suburbs,1.712569


✅ Saved: pricing_with_usage.csv


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

# =========================
# Feature & target
# =========================
features = [
    "Population",
    "Income",
    "Vehicles_per_Dwelling",
    "Public_Chargers",
    "Station_Count",
    "Avg_Congestion",
    "Demand_Supply_Ratio"
]

X = pricing_df[features]
y = pricing_df["Simulated_Usage"]

# =========================
# Scale
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# =========================
# Train-test split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# =========================
# Neural Network
# =========================
model = MLPRegressor(
    hidden_layer_sizes=(32, 16),
    max_iter=500,
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print("MAE:", round(mae, 2))


MAE: 0.68




In [4]:
prices = np.linspace(5, 15, 50)

def optimal_price(row):
    revenues = []
    for p in prices:
        usage_est = row["Simulated_Usage"] * (10 / p)
        revenues.append(p * usage_est)
    return prices[np.argmax(revenues)]

pricing_df["Optimal_Price"] = pricing_df.apply(optimal_price, axis=1)

display(
    pricing_df[["Suburb", "Cluster_Label", "Optimal_Price"]].head()
)

pricing_df.to_csv("../datasets/clean/optimal_pricing_by_suburb.csv", index=False)
print("✅ Saved: optimal_pricing_by_suburb.csv")


Unnamed: 0,Suburb,Cluster_Label,Optimal_Price
0,Abbotsford,EV-Ready Suburbs,5.408163
1,Airport West,EV-Ready Suburbs,6.836735
2,Albert Park,High Population – Infrastructure Gap,5.0
3,Alphington - Fairfield,Affluent Car-Dependent (High Potential),5.0
4,Altona,EV-Ready Suburbs,5.0


✅ Saved: optimal_pricing_by_suburb.csv
