In [1]:
pip install faker

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Seed both random sources used by this notebook (Faker and NumPy's global RNG)
# so the random draws repeat on every Restart & Run All.
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)

fake = Faker()


In [3]:
# Selected fixed-date public holidays in Karnataka for 2024 (not an exhaustive list).
# Entries are midnight `datetime` objects: compare on the calendar date
# (e.g. `d.date()`), because a plain `date` never compares equal to a `datetime`.
public_holidays = [
    datetime(2024, 1, 26),  # Republic Day
    datetime(2024, 4, 14),  # Dr. B.R. Ambedkar Jayanti
    datetime(2024, 8, 15),  # Independence Day
    datetime(2024, 10, 2),  # Gandhi Jayanti
    datetime(2024, 11, 1),  # Karnataka Rajyotsava (observed on 1 November)
    datetime(2024, 12, 25),  # Christmas
]

In [4]:
# Major events in Bengaluru (2024 dates).
# Entries are midnight `datetime` objects: compare on the calendar date
# (e.g. `d.date()`), because a plain `date` never compares equal to a `datetime`.
major_events = [
    datetime(2024, 1, 26),  # Republic Day Parade
    datetime(2024, 8, 15),  # Independence Day Celebrations
    datetime(2024, 11, 1),  # Karnataka Rajyotsava Parade (Rajyotsava is 1 November)
]

In [5]:
# Bengaluru localities used as candidate pickup and drop-off points
bengaluru_areas = [
    'Koramangala',
    'Indiranagar',
    'Whitefield',
    'Electronic City',
    'Jayanagar',
    'Malleshwaram',
    'HSR Layout',
    'Basavanagudi',
    'BTM Layout',
    'Marathahalli',
    'Banashankari',
    'JP Nagar',
    'Bellandur',
    'Hebbal',
    'Yelahanka',
    'Rajajinagar',
    'Kengeri',
    'Vijayanagar',
    'Ulsoor',
    'Shivajinagar',
]

In [42]:
# Function to generate synthetic data with Bengaluru-specific weather conditions
def _as_date_set(days):
    """Return the calendar dates of ``days`` as a set of ``datetime.date`` objects."""
    # A `datetime` never compares equal to a plain `date`, so datetime entries are
    # reduced to their date part; entries that are already dates are kept as-is.
    return {d.date() if isinstance(d, datetime) else d for d in days}


def generate_synthetic_data(n=1000, seed=None):
    """Generate ``n`` synthetic cab-ride records for Bengaluru.

    Each record combines a random timestamp from ``fake`` with randomly drawn
    pricing, location, weather, traffic and booking figures, plus a binary
    ``Target`` flag derived from those figures.

    Parameters
    ----------
    n : int, default 1000
        Number of rows to generate. ``0`` yields an empty DataFrame that still
        has all columns.
    seed : int or None, default None
        When given, seeds NumPy's global RNG and the module-level ``fake``
        instance before generating, so the random draws are repeatable.
        ``None`` leaves both generators untouched.

    Returns
    -------
    pandas.DataFrame
        One row per ride with the columns 'Cab ID', 'Timestamp', 'Hour',
        'Day of Week', 'Day of Month', 'Month', 'Is Weekend', 'Is Holiday',
        'Dynamic Multiplier', 'Base Fare', 'Final Fare', 'Booking Location',
        'Dropoff Location', 'Temperature (°C)', 'Precipitation (mm)',
        'Traffic Congestion Level', 'Event', 'Number of Cancellations',
        'Number of Bookings', 'num_drivers_available' and 'Target'.

    Notes
    -----
    Reads the module-level ``fake``, ``public_holidays``, ``major_events`` and
    ``bengaluru_areas``.

    NOTE(review): timestamps come from ``fake.date_time_this_year()``, which
    presumably stays within the current calendar year, while the holiday and
    event lists in this notebook are dated 2024 - confirm the lists are kept
    in step with the year the notebook is run in.
    """
    if seed is not None:
        # Seed both random sources used below.
        np.random.seed(seed)
        fake.seed_instance(seed)

    # Normalise once, outside the loop: the lists hold `datetime` objects, and a
    # `date in [datetime, ...]` membership test is always False.
    holiday_dates = _as_date_set(public_holidays)
    major_event_dates = _as_date_set(major_events)

    # weather -> ((min, max) temperature in °C, (low, high) congestion bounds).
    # The congestion bounds feed np.random.randint, whose upper bound is exclusive.
    weather_profiles = {
        'Clear': ((25, 35), (1, 3)),          # low to moderate traffic
        'Partly Cloudy': ((22, 30), (1, 4)),  # low to moderate traffic
        'Cloudy': ((20, 28), (2, 4)),         # moderate to high traffic
        'Rain': ((18, 25), (3, 5)),           # high traffic
        'Thunderstorms': ((15, 20), (3, 5)),  # high traffic
    }
    weather_names = list(weather_profiles)

    data = []  # One list per generated ride
    for cab_id in range(1001, 1001 + n):  # Cab IDs start at 1001
        timestamp = fake.date_time_this_year()
        ride_date = timestamp.date()
        hour = timestamp.hour
        day_of_week = timestamp.weekday()
        day_of_month = timestamp.day
        month = timestamp.month

        # Saturday (5) and Sunday (6) count as weekend
        is_weekend = 1 if day_of_week >= 5 else 0

        # Weekends are treated as holidays in addition to the listed public holidays
        is_holiday = 1 if (is_weekend or ride_date in holiday_dates) else 0

        # Simulate demand-supply dynamics
        demand_factor = np.random.uniform(0.8, 1.2)

        # Peak hours get a higher multiplier range than off-peak hours
        if hour in (8, 9, 17, 18):
            dynamic_multiplier = np.random.uniform(1.2, 2.5) * demand_factor
        else:
            dynamic_multiplier = np.random.uniform(1, 1.5) * demand_factor

        # Base fare is proportional to the ride distance (1.5 per km)
        distance = round(np.random.uniform(3, 20), 2)
        base_fare = distance * 1.5
        final_fare = base_fare * dynamic_multiplier

        # Pick distinct booking and dropoff locations
        booking_location = np.random.choice(bengaluru_areas)
        dropoff_location = np.random.choice(bengaluru_areas)
        while dropoff_location == booking_location:
            dropoff_location = np.random.choice(bengaluru_areas)

        # Weather drives the temperature range and the traffic congestion level
        weather_condition = str(np.random.choice(weather_names))
        (temp_low, temp_high), (jam_low, jam_high) = weather_profiles[weather_condition]
        temperature = round(np.random.uniform(temp_low, temp_high), 1)
        traffic_congestion_level = np.random.randint(jam_low, jam_high)

        # NOTE(review): precipitation is drawn independently of the weather
        # condition, so 'Clear' rows can still carry rainfall - confirm intended.
        precipitation = round(np.random.uniform(0, 20), 1)  # Precipitation in mm

        # A random event occurs 10% of the time; major-event dates always count
        event = 1 if np.random.uniform() < 0.1 else 0
        event = event or (1 if ride_date in major_event_dates else 0)

        # Simulate driver availability, number of bookings, and cancellations
        num_drivers_available = np.random.randint(1, 21)
        num_bookings = np.random.randint(1, 25)
        num_cancellations = np.random.randint(0, num_bookings + 1)  # Cancellations up to bookings

        # Target (artificial price surge). `and` binds tighter than `or`, so the
        # grouping below is exactly how the un-parenthesised rule evaluates.
        is_surge = (
            (dynamic_multiplier > 2 and (temperature > 30 or precipitation > 10))
            or traffic_congestion_level == 4
            or event == 1
            or (num_cancellations > 4 and num_bookings > 15 and num_drivers_available < 3)
        )
        target = 1 if is_surge else 0

        data.append([
            cab_id, timestamp, hour, day_of_week, day_of_month, month, is_weekend,
            is_holiday, dynamic_multiplier, base_fare, final_fare, booking_location,
            dropoff_location, temperature, precipitation, traffic_congestion_level,
            event, num_cancellations, num_bookings, num_drivers_available, target
        ])

    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=[
        'Cab ID', 'Timestamp', 'Hour', 'Day of Week', 'Day of Month', 'Month', 'Is Weekend',
        'Is Holiday', 'Dynamic Multiplier', 'Base Fare', 'Final Fare', 'Booking Location',
        'Dropoff Location', 'Temperature (°C)', 'Precipitation (mm)', 'Traffic Congestion Level',
        'Event', 'Number of Cancellations', 'Number of Bookings', 'num_drivers_available', 'Target'
    ])

    return df

In [43]:
# Build the dataset at the generator's default size of 1000 rows
synthetic_data = generate_synthetic_data(n=1000)

In [44]:
# Write the generated rows to CSV without the DataFrame index, then confirm on stdout
synthetic_data.to_csv(path_or_buf='synthetic_cab_data.csv', index=False)
print("Dataset exported to synthetic_cab_data.csv")

Dataset exported to synthetic_cab_data.csv
