In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's global (legacy) generator so the np.random.* draws used to build
# the dataset give the same values on every fresh top-to-bottom run.
SEED = 42
np.random.seed(SEED)


In [2]:
# Simulation parameters for the synthetic transport dataset.
NUM_ROWS = 20_000    # number of synthetic records to generate
NUM_ROUTES = 20      # route ids are drawn from 1..NUM_ROUTES
BUS_CAPACITY = 50    # upper clamp for passenger_count and the occupancy denominator


In [3]:
# Simulate NUM_ROWS independent stop-to-stop observations, one list per row.
# The generated values for a given seed depend on the order of the np.random
# calls inside the loop, so that order must not be changed.
data = []

for _ in range(NUM_ROWS):
    # --- When: hour of day (0..23) and day of week (0..6; 5 and 6 count as weekend).
    hour = np.random.randint(0, 24)
    day_of_week = np.random.randint(0, 7)
    is_weekend = day_of_week >= 5

    morning_rush = 7 <= hour <= 10
    evening_rush = 17 <= hour <= 20
    is_peak = morning_rush or evening_rush

    # --- Where: route, position along the route (1..14), distance to the next stop.
    route_id = np.random.randint(1, NUM_ROUTES + 1)
    stop_sequence = np.random.randint(1, 15)
    distance = np.round(np.random.uniform(0.5, 3.0), 2)

    # --- Conditions: traffic level 1..3 (weighted toward 1) and a 15% chance of rain.
    traffic_level = np.random.choice([1, 2, 3], p=[0.4, 0.35, 0.25])
    rain_flag = np.random.choice([0, 1], p=[0.85, 0.15])

    # --- Travel time (minutes): 2 per distance unit plus additive delays for
    # traffic, rain and peak hours, with Gaussian noise, floored at 2.
    base_time = distance * 2
    noise = np.random.normal(0, 0.5)
    delay = traffic_level * 1.5 + rain_flag * 2 + is_peak * 3 + noise
    actual_time = max(2, base_time + delay)

    # --- Ridership: normal draw whose mean rises with peak hours and traffic,
    # truncated to an int and clamped to [0, BUS_CAPACITY].
    mean_riders = 20 + is_peak * 15 + traffic_level * 5
    raw_riders = int(np.random.normal(mean_riders, 5))
    passenger_count = max(0, min(raw_riders, BUS_CAPACITY))

    # --- Crowding label derived from the occupancy ratio.
    occupancy = passenger_count / BUS_CAPACITY
    crowding = (
        "Low" if occupancy < 0.4
        else "Medium" if occupancy < 0.75
        else "High"
    )

    row = [
        hour, day_of_week, is_weekend,
        route_id, stop_sequence,
        distance, traffic_level,
        rain_flag, passenger_count,
        BUS_CAPACITY, actual_time, crowding,
    ]
    data.append(row)


In [4]:
# Column names, in the same order as the values placed in each row of `data`.
columns = [
    # when
    "hour", "day_of_week", "is_weekend",
    # where
    "route_id", "stop_sequence", "distance_to_next_stop_km",
    # conditions
    "traffic_level", "rain_flag",
    # load
    "passenger_count", "bus_capacity",
    # targets
    "actual_travel_time_min", "crowding_level",
]

# Build the frame once from the accumulated rows, then preview the first rows.
df = pd.DataFrame(data=data, columns=columns)
df.head()


Unnamed: 0,hour,day_of_week,is_weekend,route_id,stop_sequence,distance_to_next_stop_km,traffic_level,rain_flag,passenger_count,bus_capacity,actual_travel_time_min,crowding_level
0,6,3,False,15,11,2.45,2,0,25,50,7.852689,Medium
1,20,3,False,8,8,2.13,1,0,37,50,8.771111,Medium
2,9,3,False,16,14,0.73,2,0,46,50,7.447772,High
3,4,2,False,7,5,1.63,1,1,30,50,5.74664,Medium
4,20,1,False,20,12,1.6,1,0,33,50,8.111272,Medium


In [5]:
# Persist the generated dataset as CSV (without the index column).
# to_csv does not create missing directories, so make sure the target folder
# exists first; otherwise this cell fails on a checkout where ../data/raw has
# not been created yet.
output_path = Path("../data/raw/synthetic_transport_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("Dataset saved successfully.")


Dataset saved successfully.


In [6]:
# Check the frame's dimensions: one row per generated record, one column per entry in `columns`.
df.shape


(20000, 12)

In [7]:
# Summary statistics for the numeric columns; with default arguments describe()
# leaves out the boolean is_weekend and the string crowding_level columns.
df.describe()


Unnamed: 0,hour,day_of_week,route_id,stop_sequence,distance_to_next_stop_km,traffic_level,rain_flag,passenger_count,bus_capacity,actual_travel_time_min
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,11.4253,2.98265,10.55955,7.45085,1.753897,1.85135,0.1533,33.61625,50.0,7.590545
std,6.947779,1.998487,5.750531,4.021989,0.719748,0.793653,0.360285,9.116679,0.0,2.500707
min,0.0,0.0,1.0,1.0,0.5,1.0,0.0,6.0,50.0,2.0
25%,5.0,1.0,6.0,4.0,1.13,1.0,0.0,27.0,50.0,5.814697
50%,11.0,3.0,11.0,7.0,1.75,2.0,0.0,33.0,50.0,7.472581
75%,17.0,5.0,16.0,11.0,2.38,3.0,0.0,41.0,50.0,9.286716
max,23.0,6.0,20.0,14.0,3.0,3.0,1.0,50.0,50.0,16.192351


In [8]:
# Class balance of the crowding label: normalize=True returns each class's share of rows instead of raw counts.
df["crowding_level"].value_counts(normalize=True)


crowding_level
Medium    0.61585
High      0.33730
Low       0.04685
Name: proportion, dtype: float64