In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load your engineered dataset
df = pd.read_csv("uk_stations_final_engineered.csv")

print("Dataset loaded. Shape:", df.shape)
df.head()


Dataset loaded. Shape: (10000, 39)


Unnamed: 0,ocm_id,operator,status,is_operational,address1,address2,town,state_province,postcode,country,...,stations_per_borough,operational_flag,usage_Private - Restricted Access,usage_Privately Owned - Notice Required,usage_Public,usage_Public - Membership Required,usage_Public - Pay At Location,connector_count,avg_power_per_connector,has_fast_charger
0,253415,Ev Dot,Not Operational,False,Rainsford Road,,Chelmsford,England,CM1 2XB,GB,...,7950,0,False,False,False,False,True,1.0,7.0,0
1,4396,Independent Operator,Operational,True,Spring Garden,Westminster,London,London,SW1A 2BN,GB,...,476,1,False,False,True,False,False,,,0
2,52877,Bp Pulse (Uk),Operational,True,Spring Gardens,City of Westminster,London,London,SW1A 2TS,GB,...,476,1,False,False,False,True,False,1.0,7.0,0
3,146490,Virta,Operational,True,440 Strand,Covent Garden,London,London,WC2R 0QS,GB,...,476,1,False,False,False,False,False,1.0,7.0,0
4,4399,Bp Pulse (Uk),Operational,True,Whitcomb Street,Westminster,London,London,WC2H 7DT,GB,...,476,1,False,False,False,True,False,1.0,4.0,0


In [80]:
# STEP 1: DROP IRRELEVANT COLUMNS

drop_cols = [
    "ocm_id", "address1", "address2", "town", "state_province",
    "postcode", "country", "title", "submission_status",
    "data_provider", "connection_statuses",
    "power_category", "all_connector_powers_kw"
]

df = df.drop(columns=drop_cols, errors="ignore")
print("\nIrrelevant columns dropped.")
print("Shape after dropping:", df.shape)



Irrelevant columns dropped.
Shape after dropping: (10000, 26)


In [92]:
# STEP 2: HANDLE MISSING VALUES

num_cols_to_fill = [
    "avg_power_per_connector",
    "uk_avg_energy_kWh",
    "uk_avg_util_pct",
    "num_points",
    "connector_count"
]

for col in num_cols_to_fill:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

df = df.fillna(0)  # final safety fill
print("\nMissing values handled.")
print(df.isna().sum().sum(), "total remaining NaNs")



Missing values handled.
0 total remaining NaNs


In [94]:
# REMOVING CONSTANT COLUMNS

cols_to_remove = [
    "uk_usage_rows",
    "last_status_update",
    "last_verified",
    "connector_count"   
]

df = df.drop(columns=cols_to_remove, errors="ignore")

print("\nRemoved non-useful columns:")
print(cols_to_remove)
print("New shape after removal:", df.shape)


Removed non-useful columns:
['uk_usage_rows', 'last_status_update', 'last_verified', 'connector_count']
New shape after removal: (10000, 22)


In [96]:
# STEP 3: SELECTING FEATURES FOR CLUSTERING

cluster_features = [
    "max_power_kw",
    "avg_power_per_connector",
    "has_fast_charger",
    "stations_per_borough",
    "borough_density_km2",
    "latitude",
    "longitude"
]
# keep only columns that exist
cluster_features = [c for c in cluster_features if c in df.columns]
print("\nSelected clustering features:")
print(cluster_features)


Selected clustering features:
['max_power_kw', 'avg_power_per_connector', 'has_fast_charger', 'stations_per_borough', 'borough_density_km2', 'latitude', 'longitude']


In [102]:
# STEP 4 — SCALLING FEATURES FOR K-MEANS++

scaler = StandardScaler()
X_cluster = scaler.fit_transform(df[cluster_features])

print("\nScaling complete!")
print("X_cluster shape:", X_cluster.shape)


Scaling complete!
X_cluster shape: (10000, 7)


In [114]:
# SAVING PREPROCESSED DATASET
output_path = "uk_stations_preprocessed.csv"
df.to_csv(output_path, index=False)

print("Preprocessed dataset saved successfully!")
print("Saved to:", output_path)


print("\n Data preprocessing completed successfully!")
print("final df shape:", df.shape)
df.head()

Preprocessed dataset saved successfully!
Saved to: uk_stations_preprocessed.csv

 Data preprocessing completed successfully!
final df shape: (10000, 22)


Unnamed: 0,operator,status,is_operational,latitude,longitude,num_points,connector_types,max_power_kw,borough,borough_density_km2,...,priority_score,stations_per_borough,operational_flag,usage_Private - Restricted Access,usage_Privately Owned - Notice Required,usage_Public,usage_Public - Membership Required,usage_Public - Pay At Location,avg_power_per_connector,has_fast_charger
0,Ev Dot,Not Operational,False,51.507351,-0.127758,1.0,Type 2 (Socket Only); Type 2 (Socket Only),7.0,Outer London,9318.948649,...,0.131437,7950,0,False,False,False,False,True,7.0,0
1,Independent Operator,Operational,True,51.507291,-0.128896,1.0,0,4.8,Westminster,13608.4,...,0.209204,476,1,False,False,True,False,False,4.8,0
2,Bp Pulse (Uk),Operational,True,51.507099,-0.130117,8.0,BS1363 3 Pin 13 Amp; Type 2 (Socket Only),7.0,Westminster,13608.4,...,0.22101,476,1,False,False,False,True,False,7.0,0
3,Virta,Operational,True,51.508903,-0.125534,1.0,Type 2 (Socket Only),7.0,Westminster,9318.948649,...,0.131437,476,1,False,False,False,False,False,7.0,0
4,Bp Pulse (Uk),Operational,True,51.509162,-0.13065,4.0,Type 2 (Socket Only),4.0,Westminster,13608.4,...,0.212507,476,1,False,False,False,True,False,4.0,0


In [112]:
df.columns

Index(['operator', 'status', 'is_operational', 'latitude', 'longitude',
       'num_points', 'connector_types', 'max_power_kw', 'borough',
       'borough_density_km2', 'uk_avg_util_pct', 'uk_avg_energy_kWh',
       'priority_score', 'stations_per_borough', 'operational_flag',
       'usage_Private - Restricted Access',
       'usage_Privately Owned - Notice Required', 'usage_Public',
       'usage_Public - Membership Required', 'usage_Public - Pay At Location',
       'avg_power_per_connector', 'has_fast_charger'],
      dtype='object')