In [139]:
import pandas as pd
import numpy as np
from datetime import datetime
import uuid

In [141]:
USA_DATA = "C:/Users/DuminduS/Desktop/UWE/CSCT Masters Project/datasets/EV_DATA_USA.csv"     
BRISTOL_EV_STATIONS = "C:/Users/DuminduS/Desktop/UWE/CSCT Masters Project/datasets/EV_STATIONS_BRISTOL.csv"

In [143]:
OUTPUT_SESSIONS = "synthetic_sessions_bristol.csv"
OUTPUT_DAILY = "synthetic_daily_bristol.csv"

In [145]:
# date range for data
START_DATE = "2024-01-01"
END_DATE   = "2024-12-31"

In [147]:
DEFAULT_CONNECTORS_PER_STATION = 4  

In [149]:
us = pd.read_csv(USA_DATA)

In [150]:
print(us.head())  

         Date Station Name                                      Location Name  \
0  08/21/2025       101013  JGU - Jerome Gun Hill Road Municipal Parking G...   
1  08/21/2025       101014  JGU - Jerome Gun Hill Road Municipal Parking G...   
2  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
3  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
4  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   

  Country Charge Box ID  Connector ID                             Driver ID  \
0     USA        101013             1  4a37c773-2997-472d-a5e1-0dfc26bf4883   
1     USA        101014             1  7fa6b416-4196-4120-b6bd-409e1ad65ba6   
2     USA        101016             1  97b09152-a9c2-4113-a7ab-aef174f958ca   
3     USA        101016             1  43d4f216-5ed2-42b6-bafe-116d2b71f214   
4     USA        101016             1  00b32c8f-fe48-46ee-a69d-8303d03db5c1   

                 ID Tag    Connected T

In [153]:
us.columns = [c.strip().lower().replace(" ", "_") for c in us.columns]

In [155]:
print(us.head())  

         date station_name                                      location_name  \
0  08/21/2025       101013  JGU - Jerome Gun Hill Road Municipal Parking G...   
1  08/21/2025       101014  JGU - Jerome Gun Hill Road Municipal Parking G...   
2  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
3  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
4  08/21/2025       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   

  country charge_box_id  connector_id                             driver_id  \
0     USA        101013             1  4a37c773-2997-472d-a5e1-0dfc26bf4883   
1     USA        101014             1  7fa6b416-4196-4120-b6bd-409e1ad65ba6   
2     USA        101016             1  97b09152-a9c2-4113-a7ab-aef174f958ca   
3     USA        101016             1  43d4f216-5ed2-42b6-bafe-116d2b71f214   
4     USA        101016             1  00b32c8f-fe48-46ee-a69d-8303d03db5c1   

                 id_tag    connected_t

In [157]:
#Parse DATE
us["date"] = pd.to_datetime(us["date"], errors="coerce", format="%m/%d/%Y")

In [159]:
print(us.head()) 

        date station_name                                      location_name  \
0 2025-08-21       101013  JGU - Jerome Gun Hill Road Municipal Parking G...   
1 2025-08-21       101014  JGU - Jerome Gun Hill Road Municipal Parking G...   
2 2025-08-21       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
3 2025-08-21       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   
4 2025-08-21       101016  JGU - Jerome Gun Hill Road Municipal Parking G...   

  country charge_box_id  connector_id                             driver_id  \
0     USA        101013             1  4a37c773-2997-472d-a5e1-0dfc26bf4883   
1     USA        101014             1  7fa6b416-4196-4120-b6bd-409e1ad65ba6   
2     USA        101016             1  97b09152-a9c2-4113-a7ab-aef174f958ca   
3     USA        101016             1  43d4f216-5ed2-42b6-bafe-116d2b71f214   
4     USA        101016             1  00b32c8f-fe48-46ee-a69d-8303d03db5c1   

                 id_tag    connected_time di

In [161]:
len(us) 

202682

In [163]:
# Drop rows where date not parsed
us = us.dropna(subset=["date"])

In [165]:
len(us) 

202682

In [167]:
# Create day_of_week from DATE 
us["day_of_week"] = us["date"].dt.dayofweek   # Monday=0, Sunday=6

In [169]:
# generic EV daily profile.

def extract_hour_safe(x):
    """
    Try to interpret connected_time as H:MM or HH:MM or H:MM.S.
    If the first chunk > 23, return NaN.
    """
    try:
        s = str(x)
        if pd.isna(x) or s.strip() == "":
            return np.nan
        parts = s.split(":")
        h = int(parts[0])
        if h > 23: 
            return np.nan
        return h
    except Exception:
        return np.nan

if "connected_time" in us.columns:
    us["hour_of_day_raw"] = us["connected_time"].apply(extract_hour_safe)
else:
    us["hour_of_day_raw"] = np.nan

In [171]:
# manual daily profile.
valid_hours = us["hour_of_day_raw"].dropna()
have_hour_info = len(valid_hours) > 0

In [173]:
# Charge duration
if "charge_duration_(min)" in us.columns:
    us["charge_duration_min"] = pd.to_numeric(us["charge_duration_(min)"], errors="coerce")
elif "charge_duration_min" in us.columns:
    us["charge_duration_min"] = pd.to_numeric(us["charge_duration_min"], errors="coerce")
else:
    # approximate 1h session if nothing is there
    us["charge_duration_min"] = 60.0


In [175]:
#  Energy (kWh)
if "energy_provided_(kwh)" in us.columns:
    us["energy_kwh"] = pd.to_numeric(us["energy_provided_(kwh)"], errors="coerce")
else:
    # approximate energy from duration with assumed 7 kW power
    us["energy_kwh"] = (us["charge_duration_min"] / 60.0) * 7.0


In [177]:
#cleaning: drop bad values
us = us[(us["charge_duration_min"] > 2) & (us["charge_duration_min"] < 24 * 60)]
us = us[(us["energy_kwh"] > 0) & (us["energy_kwh"] < 200)]


In [179]:
# Keep only needed columns
us = us[["date", "day_of_week", "charge_duration_min", "energy_kwh", "hour_of_day_raw"]]

In [181]:

# ESTIMATE PATTERNS FROM U.S. DATA
# Hour-of-day arrival distribution
if have_hour_info:
    hour_counts = us["hour_of_day_raw"].value_counts().sort_index()
    hour_counts = hour_counts.reindex(range(24), fill_value=0)
    if hour_counts.sum() > 0:
        hour_probs = (hour_counts / hour_counts.sum()).values
    else:
        have_hour_info = False

if not have_hour_info:
    # manually defined typical EV charging daily profile
    hour_profile = np.array([
        0.02, 0.01, 0.01, 0.01, 0.02, 0.03,  # 0-5
        0.04, 0.06, 0.07, 0.07, 0.06, 0.05,  # 6-11
        0.05, 0.06, 0.07, 0.08, 0.08, 0.08,  # 12-17
        0.07, 0.06, 0.05, 0.04, 0.03, 0.02   # 18-23
    ])
    hour_probs = hour_profile / hour_profile.sum()

In [183]:
# Day-of-week intensity multipliers
dow_counts = us["day_of_week"].value_counts().sort_index()
dow_counts = dow_counts.reindex(range(7), fill_value=0)
if dow_counts.sum() == 0 or dow_counts.nunique() == 1:
    #neutral multipliers
    dow_multipliers = np.ones(7)
else:
    dow_mean = dow_counts.mean()
    dow_multipliers = (dow_counts / dow_mean).values
    # extreme values just in case
    dow_multipliers = np.clip(dow_multipliers, 0.5, 1.5)

In [185]:
# Distributions for charge duration and energy per session
duration_samples = us["charge_duration_min"].clip(lower=5, upper=600).values
energy_samples   = us["energy_kwh"].clip(lower=1, upper=150).values

In [187]:
# Base sessions per connector per day

if "charge_box_id" in us.columns:
    box_col = "charge_box_id"
elif "station_name" in us.columns:
    box_col = "station_name"
else:
    box_col = None

if box_col is not None and box_col in us.columns:
    tmp = us.groupby([box_col, "date"]).size().reset_index(name="sessions")
    box_daily = tmp.groupby(box_col)["sessions"].mean()
    if len(box_daily) > 0:
        avg_sessions_per_box_per_day = box_daily.median()
    else:
        avg_sessions_per_box_per_day = 5.0
else:
    total_days = (us["date"].max() - us["date"].min()).days + 1
    total_sessions = len(us)
    avg_sessions_per_box_per_day = total_sessions / max(total_days, 1)

In [189]:
# Clamp to realistic range 
base_sessions_per_connector_per_day = float(np.clip(avg_sessions_per_box_per_day, 2, 12))

In [191]:
print("U.S. patterns learned:")
print("  Base sessions per connector per day:", round(base_sessions_per_connector_per_day, 2))
print("  Hour probabilities:", np.round(hour_probs, 4))
print("  DOW multipliers:", np.round(dow_multipliers, 3))

U.S. patterns learned:
  Base sessions per connector per day: 12.0
  Hour probabilities: [0.0361 0.0364 0.0301 0.0241 0.0201 0.0215 0.0247 0.0321 0.0541 0.0466
 0.0474 0.0465 0.0458 0.0462 0.0472 0.049  0.0509 0.0535 0.055  0.055
 0.0544 0.0476 0.041  0.0345]
  DOW multipliers: [0.972 0.947 0.972 1.01  1.054 1.038 1.008]


In [193]:
# LOAD & PREP BRISTOL STATIONS

bristol = pd.read_csv("C:/Users/DuminduS/Desktop/UWE/CSCT Masters Project/datasets/EV_STATIONS_BRISTOL.csv")
bristol.columns = [c.strip().lower().replace(" ", "_") for c in bristol.columns]

In [195]:
# expected columns: city, country_code, state_province, latitude, longitude, ports, is_fast_dc

required_cols = {"city", "country_code", "state_province",
                 "latitude", "longitude", "ports", "is_fast_dc"}
missing = required_cols - set(bristol.columns)
if missing:
    raise ValueError(f"Missing expected columns in Bristol file: {missing}")

In [197]:
# Create a station_id 
bristol["station_id"] = [f"BRI_{i:04d}" for i in range(len(bristol))]

In [199]:
print(bristol.head()) 

      city country_code state_province   latitude  longitude  ports  \
0  Bristol           GB        UNKNOWN  51.542927  -2.568751      6   
1  Bristol           GB        England  51.460377  -2.587608      2   
2  Bristol           GB        UNKNOWN  51.547550  -2.557864      4   
3  Bristol           GB        UNKNOWN  51.480312  -2.612530      1   
4  Bristol           GB        UNKNOWN  51.443274  -2.560203      1   

   is_fast_dc station_id  
0       False   BRI_0000  
1       False   BRI_0001  
2       False   BRI_0002  
3       False   BRI_0003  
4        True   BRI_0004  


In [201]:
len(bristol) 

145

In [203]:
# Number of connectors/ports
bristol["n_connectors"] = pd.to_numeric(bristol["ports"], errors="coerce")
bristol["n_connectors"] = bristol["n_connectors"].fillna(DEFAULT_CONNECTORS_PER_STATION)
bristol.loc[bristol["n_connectors"] <= 0, "n_connectors"] = DEFAULT_CONNECTORS_PER_STATION
bristol["n_connectors"] = bristol["n_connectors"].astype(int)

In [205]:
# Connector type from is_fast_dc flag
bristol["is_fast_dc"] = bristol["is_fast_dc"].astype(int)  # if it's 0/1
bristol["connector_type"] = np.where(bristol["is_fast_dc"] == 1,
                                     "DC-fast",
                                     "AC")

In [207]:
# Keep only what the simulator needs
bristol = bristol[["station_id", "latitude", "longitude",
                   "n_connectors", "connector_type"]]

In [209]:
#SYNTHETIC GENERATOR

def simulate_sessions_for_station(row,
                                  start_date,
                                  end_date,
                                  base_sessions_per_connector_per_day,
                                  hour_probs,
                                  dow_multipliers,
                                  duration_samples,
                                  energy_samples):
    """
    Generate synthetic sessions for one Bristol station using patterns from U.S. data.
    """
    station_id = row["station_id"]
    n_connectors = int(row["n_connectors"])
    latitude = row["latitude"]
    longitude = row["longitude"]
    connector_type = row["connector_type"]

    rng = np.random.default_rng(abs(hash(station_id)) % (2**32))
    
    # station-level random effect: some stations busier, some quieter
    
    station_multiplier = rng.lognormal(mean=0.0, sigma=0.3)

    dates = pd.date_range(start_date, end_date, freq="D")
    records = []

    for current_date in dates:
        dow = current_date.dayofweek  # 0â€“6
        lambda_day = (base_sessions_per_connector_per_day *
                      n_connectors *
                      dow_multipliers[dow] *
                      station_multiplier)

        n_sessions = rng.poisson(lam=lambda_day)
        if n_sessions <= 0:
            continue

        for _ in range(n_sessions):
            
            # choose hour according to profile
            hour = int(rng.choice(np.arange(24), p=hour_probs))
            minute = int(rng.integers(0, 60))
            second = int(rng.integers(0, 60))

            start_time = pd.Timestamp(current_date.year, current_date.month, current_date.day,
                                      hour, minute, second)

            duration_min = float(rng.choice(duration_samples))
            energy_kwh = float(rng.choice(energy_samples))

            end_time = start_time + pd.Timedelta(minutes=duration_min)

            records.append({
                "session_id": str(uuid.uuid4()),
                "station_id": station_id,
                "latitude": latitude,
                "longitude": longitude,
                "connector_type": connector_type,
                "n_connectors_station": n_connectors,
                "start_time": start_time,
                "end_time": end_time,
                "charge_duration_min": duration_min,
                "energy_kwh": energy_kwh,
                "hour_of_day": start_time.hour,
                "day_of_week": start_time.dayofweek,
                "date": start_time.date(),
            })

    return pd.DataFrame.from_records(records)

In [211]:
# RUN GENERATOR FOR ALL STATIONS

all_sessions = []
for _, row in bristol.iterrows():
    df_s = simulate_sessions_for_station(
        row=row,
        start_date=START_DATE,
        end_date=END_DATE,
        base_sessions_per_connector_per_day=base_sessions_per_connector_per_day,
        hour_probs=hour_probs,
        dow_multipliers=dow_multipliers,
        duration_samples=duration_samples,
        energy_samples=energy_samples,
    )
    all_sessions.append(df_s)

synthetic_sessions = pd.concat(all_sessions, ignore_index=True)

In [212]:
# CREATE DAILY AGGREGATES (for forecasting)


daily = (synthetic_sessions
         .groupby(["station_id", "date"], as_index=False)
         .agg(
             sessions=("session_id", "count"),
             energy_kwh=("energy_kwh", "sum"),
             mean_charge_duration_min=("charge_duration_min", "mean")
         ))

daily = daily.merge(
    bristol[["station_id", "latitude", "longitude", "n_connectors"]],
    on="station_id", how="left"
)

In [None]:
# SAVE OUTPUTS

synthetic_sessions.to_csv(OUTPUT_SESSIONS, index=False)
daily.to_csv(OUTPUT_DAILY, index=False)

print(f"Saved session-level synthetic data to: {OUTPUT_SESSIONS}")
print(f"Saved daily per-station data to: {OUTPUT_DAILY}")

In [216]:
synthetic_sessions.to_csv("C:/Users/DuminduS/Desktop/UWE/CSCT Masters Project/datasets/synthetic_sessions_bristol.csv", index=False)
daily.to_csv("C:/Users/DuminduS/Desktop/UWE/CSCT Masters Project/datasets/synthetic_daily_bristol.csv", index=False)

print("CSV files saved:")
print(" - synthetic_sessions_bristol.csv")
print(" - synthetic_daily_bristol.csv")

CSV files saved:
 - synthetic_sessions_bristol.csv
 - synthetic_daily_bristol.csv


In [218]:
len(synthetic_sessions)

1659227

In [220]:
len(daily)

53067