In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import os

# --- 1. CONFIGURATION ---
# Simulated neighbourhoods: an approximate centre coordinate plus a land-use
# tag. The tag is what get_simulated_poi_data() maps to POI counts.
# Insertion order matters: slot ids are numbered in this order.
PUNE_AREAS = {
    'Hinjewadi': dict(lat=18.59, lon=73.74, type='commercial_it'),
    'Kothrud': dict(lat=18.51, lon=73.81, type='residential'),
    'FC_Road': dict(lat=18.52, lon=73.84, type='shopping_dining'),
    'Mandai': dict(lat=18.51, lon=73.86, type='market'),
    'Pune_Station': dict(lat=18.53, lon=73.87, type='transport_hub'),
    'Viman_Nagar': dict(lat=18.57, lon=73.91, type='commercial_mixed'),
}

# Special days keyed by ISO date string (the simulation looks them up with
# date.isoformat()); any date not listed is treated as 'none'.
EVENT_CALENDAR = {
    '2025-01-01': 'public_holiday',
    '2025-01-14': 'public_holiday',
    '2025-01-25': 'stadium_event',
    '2025-01-26': 'public_holiday',
}

# Number of parking slots generated for every area.
N_SLOTS_PER_AREA = 50

# Simulated window: start is inclusive, end is exclusive.
SIM_START_DATE = datetime.datetime(year=2025, month=1, day=1)
SIM_END_DATE = datetime.datetime(year=2025, month=2, day=1)

# Length of one simulation step, in minutes.
SIM_TICK_MINUTES = 15

# Slot categories; the sampling weights in generate_parking_slots() are
# positional, so the order of this list must not change.
VEHICLE_TYPES = [
    'car',
    'bike',
    'large_vehicle',
    'disabled',
]

# CSV file written by the main block.
OUTPUT_FILENAME = 'pune_parking_hyper_realistic.csv'

# --- 2. World Generation ---
def get_simulated_poi_data(area_type):
    """Return the fixed point-of-interest counts for an area type.

    The counts are deliberately constant (no randomness) so that each area
    type produces a strong, easily recognisable occupancy pattern.

    Args:
        area_type: Land-use tag such as 'commercial_it' or 'market'.

    Returns:
        A new dict with the keys 'poi_office_count', 'poi_restaurant_count'
        and 'poi_store_count'. Unrecognised tags get a generic mixed profile
        of 10 / 15 / 15.
    """
    # (offices, restaurants, stores) for each recognised tag.
    profiles = {
        'commercial_it': (30, 5, 2),
        'residential': (1, 2, 2),
        'shopping_dining': (5, 30, 25),
        'market': (2, 10, 40),
    }
    offices, restaurants, stores = profiles.get(area_type, (10, 15, 15))
    return {
        'poi_office_count': offices,
        'poi_restaurant_count': restaurants,
        'poi_store_count': stores,
    }

def generate_parking_slots(areas, n_per_area):
    """Build the static table of parking slots for every area.

    Each slot gets a coordinate jittered by up to +/-0.005 degrees around its
    area centre, a slot type drawn from VEHICLE_TYPES with weights
    0.70 / 0.20 / 0.05 / 0.05, and the POI counts of its area type.

    Slot ids are the first four characters of the area name, upper-cased,
    followed by a zero-padded counter. The counter runs across all areas
    (it is not restarted per area), which keeps ids unique even when two
    area names share a four-character prefix.

    Randomness comes from both ``random`` and ``numpy.random``; neither is
    seeded here, so seed both beforehand for reproducible output.

    Args:
        areas: Mapping of area name to a dict with 'lat', 'lon' and 'type'.
        n_per_area: Number of slots to create in each area.

    Returns:
        A DataFrame indexed by 'slot_id' with the columns area_name,
        area_type, lat, lon, slot_type and the three poi_* counts. When no
        slots are requested, an empty frame with that same layout is returned.
    """
    print(f"Generating {n_per_area * len(areas)} parking slots...")
    slots_list = []
    slot_counter = 1
    for area_name, details in areas.items():
        # POI counts depend only on the area type, so look them up once per
        # area instead of once per slot.
        poi_data = get_simulated_poi_data(details['type'])
        for _ in range(n_per_area):
            slots_list.append({
                'slot_id': f"{area_name[:4].upper()}-{slot_counter:03d}",
                'area_name': area_name, 'area_type': details['type'],
                'lat': details['lat'] + random.uniform(-0.005, 0.005),
                'lon': details['lon'] + random.uniform(-0.005, 0.005),
                'slot_type': np.random.choice(VEHICLE_TYPES, p=[0.70, 0.20, 0.05, 0.05]),
                **poi_data
            })
            slot_counter += 1

    if not slots_list:
        # A frame built from an empty list has no 'slot_id' column, so
        # set_index would raise KeyError; declare the columns explicitly.
        empty_columns = [
            'slot_id', 'area_name', 'area_type', 'lat', 'lon', 'slot_type',
            'poi_office_count', 'poi_restaurant_count', 'poi_store_count',
        ]
        return pd.DataFrame(columns=empty_columns).set_index('slot_id')
    return pd.DataFrame(slots_list).set_index('slot_id')

# --- 3. Simulation Logic ---
def get_daily_weather(date):
    """Return a deterministic pseudo-weather label for a calendar day.

    The label depends only on the day of the month: the last digit of
    ``day * 3`` selects 'rainy' (0), 'hot' (1 or 2) or 'sunny' (anything
    else). Month and year are ignored, so the pattern repeats every month.

    Args:
        date: Any object with a ``day`` attribute (date or datetime).

    Returns:
        One of 'rainy', 'hot' or 'sunny'.
    """
    last_digit = (date.day * 3) % 10
    if last_digit == 0:
        return 'rainy'
    return 'hot' if last_digit in (1, 2) else 'sunny'

def get_parking_duration(poi_data, time, current_weather):
    """Sample how long a newly arrived vehicle stays, in hours.

    The stay is drawn from a normal distribution whose parameters depend on
    the kind of visitor implied by the slot's surroundings and arrival time:

    * Commuter  - office-heavy area (>= 20 offices), weekday, 08:00-11:59:
      mean 9.0 h, sd 0.5.
    * Resident  - residential area, arriving 19:00-23:59 or on a weekend:
      mean 12.0 h, sd 1.0.
    * Shopper   - >= 20 restaurants or >= 20 stores: mean 2.0 h, sd 0.5.
    * Otherwise - mean 1.0 h, sd 0.2.

    "Residential" uses the same definition as get_arrival_probability
    (fewer than 5 offices AND fewer than 5 stores). Testing the office count
    alone classified store-heavy market slots as resident parking on evenings
    and weekends, giving them ~12 h stays and making the shopper branch
    unreachable for them at those times.

    Rain multiplies the sampled stay by 1.5, and the result is never less
    than 0.5 h.

    Args:
        poi_data: Dict with 'poi_office_count', 'poi_restaurant_count' and
            'poi_store_count'.
        time: Arrival datetime.
        current_weather: Weather label for the day; only 'rainy' has an
            effect.

    Returns:
        Parking duration in hours (>= 0.5).
    """
    h, w = time.hour, time.weekday()
    offices = poi_data['poi_office_count']
    restaurants = poi_data['poi_restaurant_count']
    stores = poi_data['poi_store_count']

    # Keep this predicate in step with get_arrival_probability.
    is_residential = offices < 5 and stores < 5

    if offices >= 20 and 8 <= h <= 11 and w < 5:
        dur = np.random.normal(9.0, 0.5)  # Commuter
    elif is_residential and (19 <= h <= 23 or w >= 5):
        dur = np.random.normal(12.0, 1.0)  # Resident
    elif restaurants >= 20 or stores >= 20:
        dur = np.random.normal(2.0, 0.5)  # Shopper
    else:
        dur = np.random.normal(1.0, 0.2)

    if current_weather == 'rainy':
        dur *= 1.5
    # Floor the stay so an unlucky draw cannot produce a tiny or negative one.
    return max(0.5, dur)

# --- 🚨 THIS FUNCTION IS NOW FIXED ---
def get_arrival_probability(time, poi_data, current_weather, event_today):
    """Return the chance that a free slot is taken during the current tick.

    The probability starts from a low base of 0.05 and is replaced by a
    profile-specific value when the slot's surroundings and the time of day
    match one of these patterns (checked in this order):

    * Office-heavy (>= 20 offices): 0.95 on weekdays 08:00-11:59, 0.05 on
      weekdays 17:00-20:59, 0.02 on weekends.
    * Residential (< 5 offices and < 5 stores): 0.80 from 18:00-23:59,
      0.10 on weekdays 09:00-17:59.
    * Dining (>= 20 restaurants): 0.70 from 12:00-14:59, 0.90 from
      19:00-22:59.

    Event and weather adjustments are then applied, with a public holiday
    taking precedence over rain:

    * Public holiday: office-heavy slots drop to 0.01; slots with >= 20
      restaurants or >= 20 stores double their probability, capped at 0.98.
    * Rain on a non-holiday: slots with >= 20 restaurants or >= 20 stores
      are scaled by 0.4.

    Any other event label or weather value leaves the probability unchanged.

    Args:
        time: Datetime of the current tick.
        poi_data: Dict with 'poi_office_count', 'poi_restaurant_count' and
            'poi_store_count'.
        current_weather: Weather label for the day.
        event_today: Event label for the day.

    Returns:
        A probability clamped to the range [0.01, 0.99].
    """
    hour = time.hour
    is_workday = time.weekday() < 5
    offices = poi_data['poi_office_count']
    restaurants = poi_data['poi_restaurant_count']
    stores = poi_data['poi_store_count']
    retail_heavy = restaurants >= 20 or stores >= 20

    chance = 0.05  # Low base chance

    # --- Time-of-day profile for the slot's surroundings ---
    if offices >= 20:  # IT area
        if not is_workday:
            chance = 0.02
        elif 8 <= hour <= 11:
            chance = 0.95
        elif 17 <= hour <= 20:
            chance = 0.05
    elif offices < 5 and stores < 5:  # Residential
        if 18 <= hour <= 23:
            chance = 0.80
        elif 9 <= hour <= 17 and is_workday:
            chance = 0.10
    elif restaurants >= 20:  # Dining area
        if 12 <= hour <= 14:
            chance = 0.70
        elif 19 <= hour <= 22:
            chance = 0.90

    # --- Event and weather adjustments: a holiday overrides the weather ---
    if event_today == 'public_holiday':
        if offices >= 20:
            chance = 0.01  # Offices are closed
        if retail_heavy:
            chance = min(0.98, chance * 2.0)  # Shops are packed, even in rain
    elif current_weather == 'rainy' and retail_heavy:
        chance *= 0.4  # Rain deters shoppers on a normal day

    return max(0.01, min(chance, 0.99))

# --- 4. Main Simulation (The Fixed Version) ---
def run_simulation(slots_df, start_date, end_date, tick_minutes):
    """Simulate slot occupancy tick by tick and log every slot at every tick.

    For each tick in ``[start_date, end_date)`` and each slot, in index order:

    1. A slot whose departure time has been reached becomes free.
    2. A free slot (including one freed in step 1 of the same tick) is taken
       with the probability given by get_arrival_probability; if taken, its
       departure time is set from get_parking_duration.
    3. The resulting state is appended to the log, so the output has exactly
       one row per (tick, slot).

    Weather is resolved once per calendar day and events are looked up in
    EVENT_CALENDAR by ISO date. Arrival draws use the ``random`` module,
    which is not seeded here.

    Args:
        slots_df: DataFrame indexed by slot id with the columns
            'poi_office_count', 'poi_restaurant_count' and 'poi_store_count'.
            The index must be unique.
        start_date: First simulated datetime (inclusive).
        end_date: Datetime at which the simulation stops (exclusive).
        tick_minutes: Length of one step in minutes; must be positive.

    Returns:
        DataFrame with the columns timestamp, slot_id, is_occupied (0/1),
        weather and event_type. It is empty when there are no ticks or no
        slots.

    Raises:
        ValueError: If ``tick_minutes`` is not positive, since the clock
            would never reach ``end_date``.
    """
    if tick_minutes <= 0:
        raise ValueError(f"tick_minutes must be positive, got {tick_minutes!r}")

    print("Starting simulation with STATISTICAL SAMPLING FIX...")

    # The POI counts never change during a run. Extracting them once avoids
    # a pandas row lookup for every slot on every tick, which otherwise
    # dominates the run time.
    poi_columns = ['poi_office_count', 'poi_restaurant_count', 'poi_store_count']
    poi_by_slot = slots_df[poi_columns].to_dict(orient='index')
    slot_ids = list(slots_df.index)

    occupancy_log = []
    # Maps slot id -> departure time of the parked vehicle, or None if free.
    slot_status = {slot_id: None for slot_id in slot_ids}
    current_time = start_date
    tick_delta = datetime.timedelta(minutes=tick_minutes)
    weather_cache = {}

    while current_time < end_date:
        curr_day = current_time.date()
        if curr_day not in weather_cache:
            weather_cache[curr_day] = get_daily_weather(curr_day)
        weather = weather_cache[curr_day]
        event = EVENT_CALENDAR.get(curr_day.isoformat(), 'none')

        # Progress line once per simulated day, on the midnight tick.
        if current_time.hour == 0 and current_time.minute == 0:
            print(f"  Simulating: {curr_day} (Weather: {weather}, Event: {event})")

        for slot_id in slot_ids:
            departure = slot_status[slot_id]

            # 1. Check for departures
            if departure is not None and current_time >= departure:
                departure = None

            # 2. Check for arrivals
            if departure is None:
                poi = poi_by_slot[slot_id]
                prob = get_arrival_probability(current_time, poi, weather, event)
                if random.random() < prob:
                    dur = get_parking_duration(poi, current_time, weather)
                    departure = current_time + datetime.timedelta(hours=dur)

            slot_status[slot_id] = departure

            # 3. Log the state after departures and arrivals are resolved,
            #    so each row reflects the slot at the end of the tick.
            occupancy_log.append({
                'timestamp': current_time,
                'slot_id': slot_id,
                'is_occupied': 1 if departure is not None else 0,
                'weather': weather,
                'event_type': event
            })

        current_time += tick_delta

    print("Simulation complete.")
    return pd.DataFrame(occupancy_log)

# --- 5. Post-Processing ---
def create_final_dataset(log_df, slots_df):
    """Join the occupancy log with slot attributes and add calendar features.

    Args:
        log_df: Occupancy log with the columns timestamp, slot_id,
            is_occupied, weather and event_type.
        slots_df: Slot table indexed by slot id, providing lat, lon,
            area_name, area_type, slot_type and the three poi_* counts.

    Returns:
        A new DataFrame with one row per log entry whose slot_id exists in
        ``slots_df`` (pandas' default inner join), restricted to a fixed
        column order and extended with:

        * hour       - hour of day of the timestamp (0-23)
        * weekday    - day of week, Monday=0 ... Sunday=6
        * is_weekend - 1 for Saturday/Sunday, otherwise 0
    """
    print("Creating final dataset...")
    final_df = log_df.merge(slots_df, left_on='slot_id', right_index=True)
    final_df['timestamp'] = pd.to_datetime(final_df['timestamp'])
    final_df['hour'] = final_df['timestamp'].dt.hour
    final_df['weekday'] = final_df['timestamp'].dt.weekday
    # Vectorized comparison instead of a per-row Python lambda; the log has
    # one row per slot per tick, so this column can be very long.
    final_df['is_weekend'] = (final_df['weekday'] >= 5).astype(int)

    column_order = [
        'timestamp', 'slot_id', 'is_occupied', 'lat', 'lon',
        'area_name', 'area_type', 'slot_type',
        'hour', 'weekday', 'is_weekend',
        'weather', 'event_type',
        'poi_office_count', 'poi_restaurant_count', 'poi_store_count',
    ]
    return final_df[column_order]

# --- Main execution ---
if __name__ == "__main__":
    # Remove any output left over from a previous run before generating
    # new data.
    if os.path.exists(OUTPUT_FILENAME):
        print(f"Deleting old file: {OUTPUT_FILENAME}")
        os.remove(OUTPUT_FILENAME)

    # Build the static slot table, then simulate occupancy over the
    # configured window.
    slots_df = generate_parking_slots(PUNE_AREAS, N_SLOTS_PER_AREA)
    log_df = run_simulation(slots_df, SIM_START_DATE, SIM_END_DATE, SIM_TICK_MINUTES)

    if log_df.empty:
        # Nothing was logged, so no CSV is written.
        print("Simulation failed to produce data.")
    else:
        # Enrich the raw log with slot attributes and write it as CSV.
        final_df = create_final_dataset(log_df, slots_df)
        final_df.to_csv(OUTPUT_FILENAME, index=False)
        print(f"\nSUCCESS! Generated {len(final_df)} events.")

Generating 300 parking slots...
Starting simulation with STATISTICAL SAMPLING FIX...
  Simulating: 2025-01-01 (Weather: sunny, Event: public_holiday)
  Simulating: 2025-01-02 (Weather: sunny, Event: none)
  Simulating: 2025-01-03 (Weather: sunny, Event: none)
  Simulating: 2025-01-04 (Weather: hot, Event: none)
  Simulating: 2025-01-05 (Weather: sunny, Event: none)
  Simulating: 2025-01-06 (Weather: sunny, Event: none)
  Simulating: 2025-01-07 (Weather: hot, Event: none)
  Simulating: 2025-01-08 (Weather: sunny, Event: none)
  Simulating: 2025-01-09 (Weather: sunny, Event: none)
  Simulating: 2025-01-10 (Weather: rainy, Event: none)
  Simulating: 2025-01-11 (Weather: sunny, Event: none)
  Simulating: 2025-01-12 (Weather: sunny, Event: none)
  Simulating: 2025-01-13 (Weather: sunny, Event: none)
  Simulating: 2025-01-14 (Weather: hot, Event: public_holiday)
  Simulating: 2025-01-15 (Weather: sunny, Event: none)
  Simulating: 2025-01-16 (Weather: sunny, Event: none)
  Simulating: 2025-01