In [166]:
import pandas as pd
import numpy as np

df = pd.read_csv("uk_stations_enriched.csv")

# ===========================================================
# BASIC CLEANING
# ===========================================================

# Strip whitespace safely from ALL object columns
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].astype(str).str.strip()

# Fill basic missing values
df.fillna({
    "operator": "Unknown", "usage_type": "Unknown", "status": "Unknown",
    "is_operational": False, "address1": "Not Available", "address2": "",
    "town": "London", "state_province": "London", "postcode": "NA",
    "country": "GB", "title": "Unnamed Station", "connector_types": "Unknown",
    "connection_statuses": "Unknown", "borough": "Unknown"
}, inplace=True)

# Numeric cleanup
num_cols = ["max_power_kw", "num_points", "borough_density_km2",
            "uk_avg_util_pct", "uk_avg_energy_kWh", "priority_score"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce").fillna(df[num_cols].median())

# Dates
df["last_status_update"] = pd.to_datetime(df["last_status_update"], errors="ignore")
df["last_verified"] = pd.to_datetime(df["last_verified"], errors="ignore")

df.drop_duplicates(subset=["ocm_id"], inplace=True)

# ===========================================================
# INFERENCE LOGIC
# ===========================================================

# ---------------------------- Usage Type ----------------------------
df["usage_type"] = df["usage_type"].replace(
    ["(Unknown)", "Unknown", "unknown", ""], np.nan
)

# Public networks always considered Public
public_ops = [
    "BP Pulse (UK)", "POD Point (UK)", "Shell Recharge Solutions (UK)", "Charge Your Car",
    "Total Energies (UK)", "ESB Energy (UK)", "Char.gy", "ChargePoint",
    "Zero Carbon World", "The GeniePoint Network ( EQUANS EV Solutions )"
]
df.loc[df["operator"].isin(public_ops) & df["usage_type"].isna(), "usage_type"] = "Public"

# Tesla
df.loc[df["operator"].str.contains("Tesla", case=False, na=False) & df["usage_type"].isna(),
       "usage_type"] = "Private - Membership Required"

# Fast/Rapid chargers
df.loc[(df["max_power_kw"] >= 50) & df["usage_type"].isna(), "usage_type"] = "Public"

# Private locations (hotel, school, retail etc.)
private_words = ["Hotel", "Inn", "Car Park", "University", "School", "Retail",
                 "Tesco", "Asda", "Sainsbury"]
df.loc[
    df["title"].str.contains("|".join(private_words), case=False, na=False) &
    df["usage_type"].isna(),
    "usage_type"
] = "Private - For Staff, Visitors Or Customers"

# Final fallback
df["usage_type"] = df["usage_type"].fillna("Public")
df["usage_type"] = df["usage_type"].str.title()

# ---------------------------- Borough Mapping ----------------------------
df["postcode_clean"] = df["postcode"].str.replace(" ", "").str.upper()
df["pc_area"] = df["postcode_clean"].str.extract(r"^([A-Z]{1,2}\d{1,2})")

pc_map = {
    "EC1": "Islington", "EC2": "City Of London", "EC3": "City Of London", "EC4": "City Of London",
    "WC1": "Camden", "WC2": "Westminster", "E1": "Tower Hamlets", "E2": "Tower Hamlets",
    "E3": "Tower Hamlets", "E8": "Hackney", "E9": "Hackney", "E15": "Newham",
    "N1": "Islington", "N4": "Haringey", "N5": "Islington", "NW1": "Camden",
    "NW2": "Brent", "NW3": "Camden", "SE1": "Southwark", "SE11": "Lambeth",
    "SE15": "Southwark", "SW1": "Westminster", "SW3": "Kensington And Chelsea",
    "SW7": "Kensington And Chelsea", "W1": "Westminster", "W2": "Westminster",
    "W8": "Kensington And Chelsea", "HA1": "Harrow", "HA2": "Harrow"
}

df["borough"] = df.apply(lambda r: pc_map.get(r["pc_area"], r["borough"]), axis=1)
df["borough"] = df["borough"].replace(["Unknown", ""], np.nan).fillna("Outer London")
df["borough"] = df["borough"].str.title()

# ---------------------------- Status ----------------------------
df["status"] = df["status"].replace(["Unknown", ""], np.nan)

df.loc[df["is_operational"] == True, "status"] = "Operational"
df.loc[df["connection_statuses"].str.contains("Partially", case=False, na=False),
       "status"] = "Partly Operational (Mixed)"
df.loc[df["last_status_update"].isna(), "status"] = "Planned For Future Date"

df["status"] = df["status"].fillna("Operational")

# ---------------------------- Operator ----------------------------
df["operator"] = df["operator"].replace(
    ["(Unknown Operator)", "unknown", "Unknown", ""], "Independent Operator"
).str.title()

# ---------------------------- Sync is_operational ----------------------------
df["is_operational"] = df["status"].apply(lambda x: x in [
    "Operational", "Partly Operational (Mixed)"
])

# Cleanup
df.drop(columns=["postcode_clean", "pc_area"], inplace=True, errors="ignore")

#fixing state_province nan values
df["state_province"] = df["state_province"].fillna("London")
# Fix state_province based on borough
df["state_province"] = np.where(
    df["borough"].isin([
        "Westminster", "City Of London", "Lambeth", "Camden", "Southwark",
        "Kensington And Chelsea", "Islington", "Sutton", "Wandsworth",
        "Hillingdon", "Hackney", "Tower Hamlets", "Harrow",
        "Hammersmith And Fulham", "Brent", "Bromley", "Lewisham",
        "Haringey", "Merton", "Ealing", "Newham", "Greenwich",
        "Waltham Forest", "Croydon", "Barnet", "Harrow", "Enfield"
    ]),
    "London",
    "England"
)



# ===========================================================
# FINAL FIX â€” REMOVE ALL NaN IN STATUS & USAGE TYPE
# ===========================================================

# --- STATUS ---
df["status"] = df["status"].astype(str).str.strip()
df.loc[df["status"].isin(["nan", "NaN", "None", ""]), "status"] = np.nan

# If NaN â†’ assign based on is_operational
df.loc[df["is_operational"] == True,  "status"] = df["status"].fillna("Operational")
df.loc[df["is_operational"] == False, "status"] = df["status"].fillna("Not Operational")

df["status"] = df["status"].fillna("Operational")   # Final fallback


# --- USAGE TYPE ---
# FINAL FIX FOR USAGE TYPE (remove all 'Nan' variants)
df["usage_type"] = df["usage_type"].astype(str).str.strip()

# Replace ANY variation of nan text with real NaN
df["usage_type"] = df["usage_type"].replace(
    ["nan", "Nan", "NAN", "NaN", "None", ""],
    np.nan
)

# Fill missing with Public
df["usage_type"] = df["usage_type"].fillna("Public")

# Final formatting
df["usage_type"] = df["usage_type"].str.title()



# ===========================================================
# SAVE
# ===========================================================
df.to_csv("cleaned_uk_stations_enriched.csv", index=False)
print("ðŸ”¥ Final clean dataset created successfully! Rows:", len(df))


  df["last_status_update"] = pd.to_datetime(df["last_status_update"], errors="ignore")
  df["last_verified"] = pd.to_datetime(df["last_verified"], errors="ignore")


ðŸ”¥ Final clean dataset created successfully! Rows: 10000


In [168]:
df.head(10)

Unnamed: 0,ocm_id,operator,usage_type,status,is_operational,address1,address2,town,state_province,postcode,...,borough,borough_density_km2,uk_avg_util_pct,uk_avg_energy_kWh,uk_usage_rows,priority_score,last_status_update,last_verified,submission_status,data_provider
0,253415,Ev Dot,Public - Pay At Location,Not Operational,False,Rainsford Road,,Chelmsford,England,CM1 2XB,...,Outer London,9318.948649,50.216667,36.881026,78,0.131437,2023-05-04 08:44:00+00:00,2023-05-04 08:44:00+00:00,Imported and Published,UK National Charge Point Registry
1,4396,Independent Operator,Public,Operational,True,Spring Garden,Westminster,London,London,SW1A 2BN,...,Westminster,13608.4,50.216667,36.881026,78,0.209204,2011-05-17 17:23:00+00:00,2011-05-17 17:23:00+00:00,Submission Published,Open Charge Map Contributors
2,52877,Bp Pulse (Uk),Public - Membership Required,Operational,True,Spring Gardens,City of Westminster,London,London,SW1A 2TS,...,Westminster,13608.4,50.216667,36.881026,78,0.22101,2023-04-03 16:58:00+00:00,2023-04-03 16:58:00+00:00,Submission Published,Open Charge Map Contributors
3,146490,Virta,"Private - For Staff, Visitors Or Customers",Operational,True,440 Strand,Covent Garden,London,London,WC2R 0QS,...,Westminster,9318.948649,50.216667,36.881026,78,0.131437,2020-01-10 10:18:00+00:00,2020-01-10 10:18:00+00:00,Submission Published,Open Charge Map Contributors
4,4399,Bp Pulse (Uk),Public - Membership Required,Operational,True,Whitcomb Street,Westminster,London,London,WC2H 7DT,...,Westminster,13608.4,50.216667,36.881026,78,0.212507,2023-04-03 17:00:00+00:00,2023-04-03 17:00:00+00:00,Submission Published,Open Charge Map Contributors
5,170689,Pod Point (Uk),Public - Membership Required,Operational,True,Whitehall Place,,Westminster,London,SW1A 2BD,...,Westminster,13608.4,50.216667,36.881026,78,0.21279,2024-11-19 06:21:00+00:00,2024-11-19 06:21:00+00:00,Submission Published,Open Charge Map Contributors
6,104898,Bp Pulse (Uk),Public - Membership Required,Operational,True,Saint Martins Lane Hotel,,45 Saint Martin's Lane,London,WC2N 4HX,...,Westminster,9318.948649,50.216667,36.881026,78,0.176139,2023-08-23 10:23:00+00:00,2023-08-23 10:23:00+00:00,Submission Published,Open Charge Map Contributors
7,253530,Shell Recharge Solutions (Uk),Public - Pay At Location,Not Operational,False,Junc. Bedfordbury,,London,London,WC2N 4DQ,...,Westminster,9318.948649,50.216667,36.881026,78,0.129422,2023-05-04 09:46:00+00:00,2023-05-04 09:46:00+00:00,Imported and Published,UK National Charge Point Registry
8,99712,Pod Point (Uk),"Private - For Staff, Visitors Or Customers",Operational,True,5 - 7 Carlton Gardens,,London,London,SW1Y 5AD,...,Westminster,9318.948649,50.216667,36.881026,78,0.135547,2023-10-09 13:27:00+00:00,2023-10-09 13:27:00+00:00,Submission Published,Open Charge Map Contributors
9,107862,Bp Pulse (Uk),Public - Membership Required,Operational,True,St James Square,Westminster,London,London,SW1Y 4PD,...,Westminster,13608.4,50.216667,36.881026,78,0.21279,2019-03-05 06:24:00+00:00,2019-03-05 06:24:00+00:00,Submission Published,Open Charge Map Contributors


In [138]:
df["status"].value_counts(dropna=False)
df["is_operational"].value_counts(dropna=False)


is_operational
False    5793
True     4207
Name: count, dtype: int64

In [142]:
df["borough"].value_counts()


borough
Outer London              7950
Southwark                  530
Westminster                476
Camden                     177
Islington                  117
Hounslow                    94
Lambeth                     83
Merton                      71
Brent                       53
Haringey                    50
Kensington And Chelsea      49
Ealing                      46
Newham                      44
Greenwich                   39
Wandsworth                  33
City Of London              33
Tower Hamlets               29
Waltham Forest              27
Hackney                     24
Lewisham                    20
Hammersmith And Fulham      19
Croydon                     16
Bromley                     10
Hillingdon                   3
Barnet                       2
Sutton                       2
Enfield                      2
Harrow                       1
Name: count, dtype: int64

In [148]:
df["status"].value_counts(dropna=False)

status
Not Operational               5780
Operational                   4191
Partly Operational (Mixed)      16
Planned For Future Date         13
Name: count, dtype: int64

In [154]:
df["usage_type"].value_counts(dropna=False)

usage_type
Public - Pay At Location                      4265
Public - Membership Required                  2854
Public                                        2743
Private - For Staff, Visitors Or Customers     124
Private - Restricted Access                     13
Privately Owned - Notice Required                1
Name: count, dtype: int64

In [156]:
df["usage_type"].value_counts(dropna=False)

usage_type
Public - Pay At Location                      4265
Public - Membership Required                  2854
Public                                        2743
Private - For Staff, Visitors Or Customers     124
Private - Restricted Access                     13
Privately Owned - Notice Required                1
Name: count, dtype: int64