In [1]:
import pandas as pd
import os

# Directory that holds the CSV datasets; the relative path is resolved against
# the notebook's current working directory.
DATASET_DIR = "../datasets"

# CSV file names, presumably all located under DATASET_DIR.
# NOTE(review): "Co-oridnates.csv" looks like a misspelling of "Coordinates";
# presumably it matches the real file name on disk - confirm before renaming.
files = [
    "ml_ev_charging_dataset.csv",
    "charger_info_mel.csv",
    "clustered_suburbs.csv",
    "Co-oridnates.csv",
    "Info_for_PCZ.csv",
    "optimal_prices_all_suburbs.csv",
    "road_congestion.csv",
    "stations_per_town.csv",
    "Suburb_Population.csv",
    "vehicle_registrations.csv"
]

In [2]:
import numpy as np

# Read the raw charger table from DATASET_DIR into a DataFrame.
path_charger = os.path.join(DATASET_DIR, "charger_info_mel.csv")
charger = pd.read_csv(path_charger)

# Preview the first rows of the raw, uncleaned data.
charger.head()

Unnamed: 0,Charger Name,Address,Suburb,State,Postal Code,Power (kW),Usage Cost,Number of Points,Connection Types,latitude,longitude
0,RMIT - City Campus,17-21 Cardigan Street,carlton,VIC,3053,"75, 22",AUD 0.30 per kWh,1.0,"33, 25",-37.800423,144.968434
1,100 St Kilda Rd,100 St Kilda Rd,southbank,AU-VIC,AU,,,,1,-37.825362,144.96402
2,11 Nicholson Street,11 Nicholson Street,carlton,AU-VIC,AU,,,,1,-37.800423,144.968434
3,Lorbek Luxury Cars,30 Prohasky Street,port melbourne,Victoria,3207,11,$0.31/kWh,8.0,1036,-37.833361,144.92192
4,Tesla Supercharger South Melbourne,Clarendon Street,south melbourne,VIC,,250,,6.0,33,-37.83344,144.957053


In [3]:
# Work on a copy so the raw `charger` frame stays untouched.
df = charger.copy()

# Build State_clean in one pass:
#   1. cast to str, trim whitespace and upper-case (a missing value turns
#      into the literal text "NAN" along the way),
#   2. collapse the known spellings onto a canonical state code,
#   3. turn the "NAN" text back into a real missing value.
df["State_clean"] = (
    df["State"]
    .astype(str)
    .str.strip()
    .str.upper()
    .replace({
        "VIC": "VIC",
        "VICTORIA": "VIC",
        "AU-VIC": "VIC",
        "NEW SOUTH WALES": "NSW"
    })
    .replace("NAN", np.nan)
)


In [4]:
def clean_power(x):
    """Return the largest power rating found in a raw "Power (kW)" entry.

    An entry may hold a single number (``250``) or several comma-separated
    numbers (``"75, 22"``). Each comma-separated token is parsed on its own;
    tokens that are not numbers are ignored.

    Parameters
    ----------
    x : scalar
        Raw cell value (string, number or missing value).

    Returns
    -------
    float
        The maximum finite number found, or ``np.nan`` when the entry is
        missing or contains no usable number.
    """
    if pd.isna(x):
        return np.nan

    ratings = []
    for token in str(x).split(","):
        try:
            value = float(token.strip())
        except ValueError:
            # Free text or an empty token: not a power rating, skip it.
            continue
        # float() also parses "nan" and "inf"; a NaN in the list would make
        # max() order-dependent, so only finite ratings are kept.
        if np.isfinite(value):
            ratings.append(value)

    return max(ratings) if ratings else np.nan

# Reduce every raw "Power (kW)" entry to a single numeric value (NaN when unusable).
df["Power_clean"] = df["Power (kW)"].apply(clean_power)


In [5]:
# Pull the first number out of free-text prices such as "$0.31/kWh" or
# "AUD 0.30 per kWh". Commas are first turned into dots so that a decimal
# comma is read as a decimal point; entries without any number become NaN.
df["Usage_Cost_clean"] = (
    df["Usage Cost"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.extract(r"(\d+\.?\d*)", expand=False)
    .pipe(pd.to_numeric, errors="coerce")
)

# Show the first rows with the cleaned columns added.
df.head()

Unnamed: 0,Charger Name,Address,Suburb,State,Postal Code,Power (kW),Usage Cost,Number of Points,Connection Types,latitude,longitude,State_clean,Power_clean,Usage_Cost_clean
0,RMIT - City Campus,17-21 Cardigan Street,carlton,VIC,3053,"75, 22",AUD 0.30 per kWh,1.0,"33, 25",-37.800423,144.968434,VIC,75.0,0.3
1,100 St Kilda Rd,100 St Kilda Rd,southbank,AU-VIC,AU,,,,1,-37.825362,144.96402,VIC,,
2,11 Nicholson Street,11 Nicholson Street,carlton,AU-VIC,AU,,,,1,-37.800423,144.968434,VIC,,
3,Lorbek Luxury Cars,30 Prohasky Street,port melbourne,Victoria,3207,11,$0.31/kWh,8.0,1036,-37.833361,144.92192,VIC,11.0,0.31
4,Tesla Supercharger South Melbourne,Clarendon Street,south melbourne,VIC,,250,,6.0,33,-37.83344,144.957053,VIC,250.0,


In [6]:
# Keep only the first run of digits from each postal code; entries with no
# digits at all (e.g. "AU" or a missing value) end up as NaN.
df["Postal_clean"] = (
    df["Postal Code"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .pipe(pd.to_numeric, errors="coerce")
)


In [7]:
# Fill a missing point count with 1, then cast the column to int.
# NOTE(review): imputing 1 is an assumption about chargers with no recorded
# count; .astype(int) also truncates any fractional value - confirm both.
df["Number_of_Points_clean"] = df["Number of Points"].fillna(1).astype(int)


In [8]:
def count_types(x):
    """Count the connector types listed in a raw "Connection Types" entry.

    The entry is read as a comma-separated list. Blank items (an empty entry,
    a trailing comma, or two commas in a row) are not counted, so ``"33,"``
    counts as one type rather than two.

    Parameters
    ----------
    x : scalar
        Raw cell value (string, number or missing value).

    Returns
    -------
    int
        Number of non-empty items; 0 when the entry is missing or blank.
    """
    if pd.isna(x):
        return 0
    return sum(1 for token in str(x).split(",") if token.strip())

# Number of connector types listed for each charger (0 when the entry is missing).
df["Connection_Types_count"] = df["Connection Types"].apply(count_types)


In [9]:
# Drop chargers without coordinates: latitude/longitude are the only features
# used for the state imputation that follows, so they must be present.
df = df.dropna(subset=["latitude", "longitude"])


In [10]:
# Split on whether the cleaned state is known: labelled rows are the training
# set, rows with a missing state are the ones to be predicted.
state_missing = df["State_clean"].isna()
train_data = df[~state_missing]
test_data = df[state_missing]

# Coordinates are the features; the cleaned state is the target.
X_train, y_train = train_data[["latitude", "longitude"]], train_data["State_clean"]
X_test = test_data[["latitude", "longitude"]]


In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Impute missing states from the nearest chargers with a known state.
# k is capped at the number of labelled rows, because asking for more
# neighbours than there are training samples fails at prediction time.
knn = KNeighborsClassifier(n_neighbors=max(1, min(5, len(X_train))))

if len(X_train):
    knn.fit(X_train, y_train)

if len(X_train) and len(X_test):
    pred_state = knn.predict(X_test)
else:
    # Nothing to predict, or nothing to learn from: return one missing value
    # per row of X_test (an empty array when no state is missing) so the
    # write-back step still receives a value of the right length.
    pred_state = np.full(len(X_test), np.nan, dtype=object)


In [12]:
# Write the predictions back onto exactly the rows they were computed for.
# Addressing the rows through test_data's index labels (rather than by
# re-testing State_clean for missing values) keeps rows and predictions
# aligned even when State_clean has already been filled, e.g. on a re-run.
df.loc[test_data.index, "State_clean"] = pred_state

In [13]:
# Columns of the final table: identifiers, the cleaned features and the
# coordinates. The raw, uncleaned columns are left out.
export_columns = [
    "Charger Name", "Suburb", "State_clean", "Postal_clean",
    "Power_clean", "Usage_Cost_clean", "Number_of_Points_clean",
    "Connection_Types_count", "latitude", "longitude"
]
df_clean = df[export_columns]

df_clean.head()

Unnamed: 0,Charger Name,Suburb,State_clean,Postal_clean,Power_clean,Usage_Cost_clean,Number_of_Points_clean,Connection_Types_count,latitude,longitude
0,RMIT - City Campus,carlton,VIC,3053.0,75.0,0.3,1,2,-37.800423,144.968434
1,100 St Kilda Rd,southbank,VIC,,,,1,1,-37.825362,144.96402
2,11 Nicholson Street,carlton,VIC,,,,1,1,-37.800423,144.968434
3,Lorbek Luxury Cars,port melbourne,VIC,3207.0,11.0,0.31,8,1,-37.833361,144.92192
4,Tesla Supercharger South Melbourne,south melbourne,VIC,,250.0,,6,1,-37.83344,144.957053


In [14]:
# Save the cleaned table as CSV in the current working directory; index=False
# keeps the DataFrame's row index out of the file.
df_clean.to_csv("charger_clean.csv", index=False)