# Synthetic Dataset Generation

This notebook generates a synthetic civic-complaint dataset for simulating
large-scale urban scenarios where real-world data is scarce. The generated
data exhibits spatial clustering and category-dependent status variability,
making it suitable for realistic experimentation.

In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Reference latitude/longitude: the uniformly scattered noise points are
# drawn from a box centred on this location.
BASE_LAT = 23.2599
BASE_LON = 77.4126

# Fixed seed so that Restart Kernel -> Run All regenerates the same dataset.
# Both the stdlib and the NumPy global generators are seeded because the
# notebook imports both.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [5]:
def generate_cluster(center_lat, center_lon, n_points, spread=0.002, rng=None):
    """Draw a Gaussian blob of coordinates around a centre point.

    Latitudes and longitudes are sampled independently from normal
    distributions that share the same standard deviation.

    Parameters
    ----------
    center_lat, center_lon : float
        Means of the latitude and longitude distributions.
    n_points : int
        Number of coordinate pairs to draw.
    spread : float, default 0.002
        Standard deviation used for both axes.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state
        (``np.random``) is used, so ``np.random.seed`` controls the result.

    Returns
    -------
    tuple of numpy.ndarray
        ``(lats, lons)``, each holding ``n_points`` samples.
    """
    # Fall back to the module-level functions so existing callers that rely
    # on np.random.seed keep getting the same stream.
    sampler = np.random if rng is None else rng
    # Latitudes are drawn before longitudes; the order matters for
    # reproducibility under a fixed seed.
    lats = sampler.normal(center_lat, spread, n_points)
    lons = sampler.normal(center_lon, spread, n_points)
    return lats, lons

In [7]:
# Each hot-spot is described as (center latitude, center longitude, number of points).
city_center = (23.2599, 77.4126, 400)
residential = (23.2700, 77.4300, 350)
industrial = (23.2500, 77.4000, 300)
market_area = (23.2400, 77.4200, 250)

clusters = [city_center, residential, industrial, market_area]

In [9]:
# Coordinates of every synthetic complaint; rebuilt from scratch on each run.
all_lats, all_lons = [], []

# Dense hot-spots: one Gaussian blob per entry in `clusters`.
for center_lat, center_lon, n_points in clusters:
    cluster_lats, cluster_lons = generate_cluster(center_lat, center_lon, n_points)
    all_lats += list(cluster_lats)
    all_lons += list(cluster_lons)

# Isolated complaints: uniform scatter over a box centred on the base point.
noise_points = 300
noise_half_width = 0.03
lat_low, lat_high = BASE_LAT - noise_half_width, BASE_LAT + noise_half_width
lon_low, lon_high = BASE_LON - noise_half_width, BASE_LON + noise_half_width

# Latitudes are drawn before longitudes, matching the cluster sampling order.
noise_lats = np.random.uniform(lat_low, lat_high, noise_points)
noise_lons = np.random.uniform(lon_low, lon_high, noise_points)

all_lats += list(noise_lats)
all_lons += list(noise_lons)

In [11]:
categories = [
    "Waste",
    "Pothole",
    "Streetlight",
    "Drainage",
    "Water Supply",
]

statuses = ["Pending", "Completed", "Cancelled"]

# One category and one status are generated per coordinate pair.
data_size = len(all_lats)

# Category mix; the weights line up positionally with `categories`.
category_col = np.random.choice(
    categories,
    size=data_size,
    p=[0.30, 0.25, 0.20, 0.15, 0.10],
)

# Status weights line up positionally with `statuses`. Waste and Drainage
# complaints are weighted towards "Pending"; every other category is
# weighted towards "Completed".
backlog_categories = ["Waste", "Drainage"]
backlog_status_p = [0.6, 0.3, 0.1]
default_status_p = [0.4, 0.5, 0.1]

# Draw a full candidate column from each distribution in bulk and pick per
# row by category, rather than calling np.random.choice once per row.
is_backlog = np.isin(category_col, backlog_categories)
status_col = np.where(
    is_backlog,
    np.random.choice(statuses, size=data_size, p=backlog_status_p),
    np.random.choice(statuses, size=data_size, p=default_status_p),
).tolist()  # kept as a plain list, as before

In [13]:
# Assemble the table column by column; IDs are sequential and start at 1.
synthetic_columns = {
    "ID": range(1, data_size + 1),
    "Category": category_col,
    "Status": status_col,
    "Latitude": all_lats,
    "Longitude": all_lons,
}
df_synthetic = pd.DataFrame(synthetic_columns)

In [19]:
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing (e.g. on a fresh checkout).
output_path = Path("data/raw/urban_civic_reports_synthetic.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the explicit "ID" column already identifies each row.
df_synthetic.to_csv(output_path, index=False)

# Preview the first rows as the cell's output.
df_synthetic.head()

Unnamed: 0,ID,Category,Status,Latitude,Longitude
0,1,Drainage,Pending,23.257167,77.41364
1,2,Waste,Completed,23.258523,77.415287
2,3,Pothole,Completed,23.256676,77.411646
3,4,Waste,Completed,23.26152,77.407977
4,5,Streetlight,Pending,23.257405,77.412233
