In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import json
from pathlib import Path
import warnings

# Suppress every Python warning for the rest of the session so cell output stays readable.
warnings.filterwarnings('ignore')

# Seed both random sources used in this notebook: the stdlib `random` module
# and NumPy's global generator.
random.seed(42)
np.random.seed(42)

# Report the environment the notebook is running in.
print(
    "✓ All libraries imported successfully",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)


✓ All libraries imported successfully
Pandas version: 1.4.4
NumPy version: 1.21.5


In [2]:
# ============ OFFICIAL PARAMETERS (From Government) ============
# Single configuration dict consumed by the generator and the validator below.

parameters = {
    # --- Data scale ---
    'n_records': 1_000_000,
    'start_date': '2025-08-15',
    'end_date': '2025-12-31',

    # --- Beneficiary demographics ---
    'female_percentage': 0.65,           # share of all trips made by women (65% official)
    'stree_shakti_pct': 0.60,            # share of female trips under the scheme

    # --- Fleet & operations ---
    'total_buses': 8458,                 # official fleet size
    'peak_hours': [7, 8, 9, 10, 17, 18, 19],
    'off_peak_hours': [11, 12, 13, 14, 15, 16, 20, 21],

    # --- Occupancy targets (percent, inclusive bounds) ---
    'peak_occupancy_range': (90, 100),
    'off_peak_occupancy_range': (45, 70),

    # --- Route mix: sampling weight and distance distribution (km) per category ---
    'route_distribution': {
        'urban': dict(weight=0.50, mean_distance=10, std_distance=5),
        'peri-urban': dict(weight=0.35, mean_distance=18, std_distance=8),
        'rural': dict(weight=0.15, mean_distance=25, std_distance=12),
    },

    # --- Fare structure (₹); distance bands as applied by calculate_fare ---
    'fare_structure': dict(
        urban_short=20,      # urban, ≤10 km
        urban_medium=30,     # urban, >10 km and ≤20 km
        urban_long=40,       # urban, >20 km
        rural_short=15,      # rural / peri-urban, ≤15 km
        rural_long=35,       # rural / peri-urban, >15 km
    ),
}

print(
    "✓ Parameters loaded",
    f"Total records to generate: {parameters['n_records']:,}",
    f"Date range: {parameters['start_date']} to {parameters['end_date']}",
    f"Fleet size: {parameters['total_buses']:,} buses",
    sep="\n",
)


✓ Parameters loaded
Total records to generate: 1,000,000
Date range: 2025-08-15 to 2025-12-31
Fleet size: 8,458 buses


In [11]:
class StreeSaktiDataGenerator:
    """
    Generates synthetic trip records for the Stree Shakti scheme.

    All settings come from the ``params`` dict. Keys read by this class:
    ``n_records``, ``start_date``, ``end_date``, ``peak_hours``,
    ``off_peak_hours``, ``route_distribution``, ``female_percentage``,
    ``stree_shakti_pct``, ``fare_structure``, ``peak_occupancy_range``,
    ``off_peak_occupancy_range`` and ``total_buses``.

    Randomness is drawn from BOTH the stdlib ``random`` module and NumPy's
    global ``np.random`` generator, so both must be seeded for reproducible
    output.
    """

    def __init__(self, params):
        """Store ``params`` and derive the inclusive number of days in the date range."""
        self.params = params
        self.n_records = params['n_records']
        self.start_date = pd.to_datetime(params['start_date'])
        self.end_date = pd.to_datetime(params['end_date'])
        # +1 so that both the start and the end date can be sampled.
        self.total_days = (self.end_date - self.start_date).days + 1

    def generate_timestamps(self):
        """
        Generate one timestamp per record with peak-hour clustering.

        Each record gets a uniformly random day in the configured range, then
        an hour drawn from the peak list (40% of the time) or the off-peak
        list (60%), and a uniformly random minute and second.

        Returns:
            list: ``n_records`` pandas Timestamps, in generation order.

        Raises:
            ValueError: from ``np.random.choice`` if the hard-coded weight
                vectors below do not have the same length as the configured
                ``peak_hours`` / ``off_peak_hours`` lists.
        """
        timestamps = []

        peak_hours = self.params['peak_hours']          # [7, 8, 9, 10, 17, 18, 19]
        off_peak_hours = self.params['off_peak_hours']  # [11, 12, 13, 14, 15, 16, 20, 21]

        # One weight per hour, in the same order as the hour lists above.
        # Normalised so the vectors sum to exactly 1 for np.random.choice.
        peak_weights = np.array([0.08, 0.18, 0.20, 0.14, 0.10, 0.16, 0.14], dtype=float)
        peak_weights = peak_weights / peak_weights.sum()

        off_peak_weights = np.array([0.12, 0.12, 0.12, 0.13, 0.13, 0.12, 0.12, 0.14], dtype=float)
        off_peak_weights = off_peak_weights / off_peak_weights.sum()

        for _ in range(self.n_records):
            # Random day in range (randint is inclusive on both ends)
            days_offset = random.randint(0, self.total_days - 1)
            base_date = self.start_date + timedelta(days=days_offset)

            # Time clustering: 40% peak, 60% off-peak
            if random.random() < 0.40:
                hour = np.random.choice(peak_hours, p=peak_weights)
            else:
                hour = np.random.choice(off_peak_hours, p=off_peak_weights)

            minute = random.randint(0, 59)
            second = random.randint(0, 59)

            # np.random.choice returns a NumPy integer; replace() needs a plain int.
            timestamp = base_date.replace(hour=int(hour), minute=minute, second=second)
            timestamps.append(timestamp)

        return timestamps

    def generate_routes(self):
        """
        Generate a route category and trip distance for every record.

        The category is sampled using the configured weights; the distance is
        drawn from a normal distribution with that category's mean and
        standard deviation, clamped to 1–50 km and rounded to one decimal.

        Returns:
            pd.DataFrame: columns ``route_category`` and ``distance_km``.
        """
        # Loop-invariant lookups, hoisted out of the per-record loop.
        route_dist = self.params['route_distribution']
        categories = ['urban', 'peri-urban', 'rural']
        category_weights = [route_dist[cat]['weight'] for cat in categories]

        routes = []
        for _ in range(self.n_records):
            route_cat = np.random.choice(categories, p=category_weights)

            route_params = route_dist[route_cat]
            distance = max(1, min(50, np.random.normal(
                route_params['mean_distance'],
                route_params['std_distance']
            )))

            routes.append({
                'route_category': route_cat,
                'distance_km': round(distance, 1)
            })

        return pd.DataFrame(routes)

    def generate_passengers(self):
        """
        Generate gender, passenger type and age group for every record.

        Female passengers are 'Stree_Shakti' with probability
        ``stree_shakti_pct``, otherwise Student / Senior_Citizen / Other.
        Male passengers are Paid / Student / Senior_Citizen.

        Note: ``age_group`` is drawn independently of ``passenger_type``, so
        combinations such as a 'Senior_Citizen' in the '<18' group can occur.

        Returns:
            pd.DataFrame: columns ``passenger_gender`` ('F'/'M'),
            ``passenger_type`` and ``age_group``.
        """
        # Loop-invariant settings, hoisted out of the per-record loop.
        female_share = self.params['female_percentage']
        stree_share = self.params['stree_shakti_pct']
        age_groups = ['<18', '18-30', '31-45', '45-60', '60+']
        age_weights = [0.05, 0.30, 0.35, 0.20, 0.10]

        passengers = []
        for _ in range(self.n_records):
            is_female = random.random() < female_share

            if is_female:
                is_stree_shakti = random.random() < stree_share
                if is_stree_shakti:
                    passenger_type = 'Stree_Shakti'
                else:
                    passenger_type = np.random.choice(
                        ['Student', 'Senior_Citizen', 'Other'],
                        p=[0.60, 0.25, 0.15]
                    )
            else:
                passenger_type = np.random.choice(
                    ['Paid', 'Student', 'Senior_Citizen'],
                    p=[0.75, 0.15, 0.10]
                )

            age_group = np.random.choice(age_groups, p=age_weights)

            passengers.append({
                'passenger_gender': 'F' if is_female else 'M',
                'passenger_type': passenger_type,
                'age_group': age_group
            })

        return pd.DataFrame(passengers)

    def calculate_fare(self, route_cat, distance):
        """
        Return the normal (unsubsidised) fare for a trip.

        Args:
            route_cat: route category; 'urban' uses the urban bands, any other
                value (rural, peri-urban) uses the rural bands.
            distance: trip distance in km.

        Returns:
            The fare from ``params['fare_structure']``. Band upper bounds are
            inclusive: urban ≤10 km short, ≤20 km medium, otherwise long;
            non-urban ≤15 km short, otherwise long.
        """
        fares = self.params['fare_structure']

        if route_cat == 'urban':
            if distance <= 10:
                return fares['urban_short']
            elif distance <= 20:
                return fares['urban_medium']
            else:
                return fares['urban_long']
        else:  # Rural / peri-urban
            return fares['rural_short'] if distance <= 15 else fares['rural_long']

    def calculate_occupancy(self, hour, route_category):
        """
        Draw a bus occupancy percentage for the given hour.

        A base value is drawn uniformly from the configured peak or off-peak
        range (inclusive), integer noise is added (±5 peak, ±10 off-peak), and
        the result is clamped to 70–100 (peak) or 20–100 (off-peak). Any hour
        not listed in ``peak_hours`` is treated as off-peak.

        Args:
            hour: hour of day of the trip.
            route_category: accepted for interface compatibility; not used in
                the calculation.

        Returns:
            Occupancy percentage as an integer.
        """
        peak_range = self.params['peak_occupancy_range']          # (90, 100)
        off_peak_range = self.params['off_peak_occupancy_range']  # (45, 70)

        if hour in self.params['peak_hours']:
            # np.random.randint excludes the upper bound, hence the +1.
            base_occupancy = np.random.randint(peak_range[0], peak_range[1] + 1)
            noise = random.randint(-5, 5)
            occupancy = min(100, max(70, base_occupancy + noise))
        else:
            base_occupancy = np.random.randint(off_peak_range[0], off_peak_range[1] + 1)
            noise = random.randint(-10, 10)
            occupancy = min(100, max(20, base_occupancy + noise))

        return occupancy

    def calculate_revenue_loss(self, passenger_type, normal_fare):
        """
        Return the revenue forgone on a trip for the given passenger type.

        'Stree_Shakti' loses the full fare, 'Student' 50% and
        'Senior_Citizen' 75% (both rounded with Python's ``round``); every
        other type loses nothing.
        """
        if passenger_type == 'Stree_Shakti':
            return normal_fare
        elif passenger_type == 'Student':
            return round(normal_fare * 0.5)
        elif passenger_type == 'Senior_Citizen':
            return round(normal_fare * 0.75)
        else:
            return 0

    def generate_complete_dataset(self, verbose=True):
        """
        Generate the complete synthetic trip dataset.

        Runs the timestamp, route and passenger generators, derives fare,
        occupancy and revenue loss per record, assigns a random bus id, and
        combines everything into one frame.

        Args:
            verbose: when True, print progress messages for each stage.

        Returns:
            pd.DataFrame: ``n_records`` rows with columns ``trip_id``,
            ``date``, ``time``, ``hour``, ``day_of_week``, ``bus_id``,
            ``route_category``, ``distance_km``, ``passenger_gender``,
            ``passenger_type``, ``age_group``, ``normal_fare``,
            ``revenue_loss`` and ``occupancy_pct``.
        """
        if verbose:
            print(f"\n{'='*60}")
            print("GENERATING SYNTHETIC DATA")
            print(f"{'='*60}")
            print(f"Generating {self.n_records:,} trip records...")
            print(f"Date range: {self.start_date.date()} to {self.end_date.date()}")

        # 1) Timestamps
        if verbose:
            print("\n[1/5] Generating timestamps...")
        timestamps_list = self.generate_timestamps()

        # 2) Routes
        if verbose:
            print("[2/5] Generating route data...")
        route_df = self.generate_routes()

        # 3) Passengers
        if verbose:
            print("[3/5] Generating passenger demographics...")
        passenger_df = self.generate_passengers()

        # 4) Derived fields
        if verbose:
            print("[4/5] Calculating fares and occupancy...")

        # Pull the columns out once as plain lists. Indexing the frame with
        # .iloc[i][...] inside the loop builds a throwaway row Series on every
        # access, which dominates run time at a million records.
        route_categories = route_df['route_category'].tolist()
        distances = route_df['distance_km'].tolist()
        passenger_types = passenger_df['passenger_type'].tolist()

        normal_fares = []
        occupancies = []

        # Records are processed in their original order so the sequence of
        # random draws (and therefore seeded output) is unchanged.
        for ts, route_cat, distance in zip(timestamps_list, route_categories, distances):
            normal_fares.append(self.calculate_fare(route_cat, distance))
            occupancies.append(self.calculate_occupancy(ts.hour, route_cat))

        revenue_losses = [
            self.calculate_revenue_loss(passenger_type, fare)
            for passenger_type, fare in zip(passenger_types, normal_fares)
        ]

        # 5) Combine
        if verbose:
            print("[5/5] Combining all data...")

        df = pd.DataFrame({
            'trip_id': [f'TRIP_{i+1:08d}' for i in range(self.n_records)],
            'date': [ts.date() for ts in timestamps_list],
            'time': [ts.time() for ts in timestamps_list],
            'hour': [ts.hour for ts in timestamps_list],
            'day_of_week': [ts.strftime('%A') for ts in timestamps_list],
            'bus_id': [f'BUS_{random.randint(1, self.params["total_buses"]):05d}' for _ in range(self.n_records)],
            'route_category': route_df['route_category'].values,
            'distance_km': route_df['distance_km'].values,
            'passenger_gender': passenger_df['passenger_gender'].values,
            'passenger_type': passenger_df['passenger_type'].values,
            'age_group': passenger_df['age_group'].values,
            'normal_fare': normal_fares,
            'revenue_loss': revenue_losses,
            'occupancy_pct': occupancies
        })

        if verbose:
            print(f"\n✓ Dataset generated successfully!")
            print(f"  Total records: {len(df):,}")
            print(f"  Columns: {len(df.columns)}")

        return df


In [12]:
# Build the generator from the shared `parameters` configuration
generator = StreeSaktiDataGenerator(parameters)

# Produce the full dataset. At the configured record count this is slow,
# so let the cell run to completion.
df = generator.generate_complete_dataset(verbose=True)

# Preview: banner, first ten rows, then shape and per-column dtypes
print("\n" + "=" * 60, "SAMPLE DATA", "=" * 60, sep="\n")
print(df.head(10))
print(f"\nDataset shape: {df.shape}")
print(f"\nColumn data types:\n{df.dtypes}")



GENERATING SYNTHETIC DATA
Generating 1,000,000 trip records...
Date range: 2025-08-15 to 2025-12-31

[1/5] Generating timestamps...
[2/5] Generating route data...
[3/5] Generating passenger demographics...
[4/5] Calculating fares and occupancy...
[5/5] Combining all data...

✓ Dataset generated successfully!
  Total records: 1,000,000
  Columns: 14

SAMPLE DATA
         trip_id        date      time  hour day_of_week     bus_id  \
0  TRIP_00000001  2025-08-27  15:34:30    15   Wednesday  BUS_07143   
1  TRIP_00000002  2025-09-15  14:09:21    14      Monday  BUS_03866   
2  TRIP_00000003  2025-08-30  08:44:16     8    Saturday  BUS_03331   
3  TRIP_00000004  2025-10-08  20:01:27    20   Wednesday  BUS_04814   
4  TRIP_00000005  2025-11-12  16:53:19    16   Wednesday  BUS_06517   
5  TRIP_00000006  2025-12-29  17:12:35    17      Monday  BUS_07473   
6  TRIP_00000007  2025-10-19  21:50:54    21      Sunday  BUS_03644   
7  TRIP_00000008  2025-12-01  20:48:39    20      Monday  BUS_02234

In [13]:
class StreeSaktiDataValidator:
    """Checks a generated trip dataset against the targets in the parameter dict."""

    def __init__(self, params):
        """Keep a reference to the parameter dict holding the targets."""
        self.params = params

    def validate_dataset(self, df, verbose=True):
        """
        Compare dataset shares and mean occupancies with their targets.

        Share checks (female, Stree Shakti among females, each route category)
        pass when the actual percentage is within 1.5 points of the target.
        Mean peak occupancy passes in 85–100, mean off-peak occupancy in 40–75.

        Args:
            df: dataset with ``passenger_gender``, ``passenger_type``,
                ``route_category``, ``hour`` and ``occupancy_pct`` columns.
            verbose: when True, print a banner and one line per check.

        Returns:
            dict: check name -> ``{'actual', 'target', 'status'}`` where
            status is 'PASS' or 'FAIL'. The Stree Shakti check is omitted
            when the dataset has no female passengers.
        """

        def pct_true(mask):
            # Percentage of True entries in a boolean Series.
            return mask.sum() / len(mask) * 100

        def verdict(ok):
            return 'PASS' if ok else 'FAIL'

        if verbose:
            print(f"\n{'='*60}")
            print("VALIDATING SYNTHETIC DATA")
            print(f"{'='*60}\n")

        report = {}

        # Check 1: share of female passengers across all trips
        is_female = df['passenger_gender'] == 'F'
        female_share = pct_true(is_female)
        female_goal = self.params['female_percentage'] * 100
        report['female_percentage'] = {
            'actual': round(female_share, 1),
            'target': female_goal,
            'status': verdict(abs(female_share - female_goal) < 1.5)
        }

        # Check 2: share of Stree Shakti beneficiaries among female passengers
        women = df[is_female]
        if len(women) > 0:
            stree_share = pct_true(women['passenger_type'] == 'Stree_Shakti')
            stree_goal = self.params['stree_shakti_pct'] * 100
            report['stree_shakti_percentage'] = {
                'actual': round(stree_share, 1),
                'target': stree_goal,
                'status': verdict(abs(stree_share - stree_goal) < 1.5)
            }

        # Check 3: route category mix
        route_cfg = self.params['route_distribution']
        for category in ('urban', 'peri-urban', 'rural'):
            category_share = pct_true(df['route_category'] == category)
            category_goal = route_cfg[category]['weight'] * 100
            report[f'{category}_routes'] = {
                'actual': round(category_share, 1),
                'target': round(category_goal, 1),
                'status': verdict(abs(category_share - category_goal) < 1.5)
            }

        # Check 4: mean occupancy over trips in peak hours
        in_peak = df['hour'].isin(self.params['peak_hours'])
        peak_mean = df.loc[in_peak, 'occupancy_pct'].mean()
        report['peak_hour_occupancy'] = {
            'actual': round(peak_mean, 1),
            'target': '90-100%',
            'status': verdict(85 <= peak_mean <= 100)
        }

        # Check 5: mean occupancy over trips in off-peak hours
        in_off_peak = df['hour'].isin(self.params['off_peak_hours'])
        off_peak_mean = df.loc[in_off_peak, 'occupancy_pct'].mean()
        report['off_peak_occupancy'] = {
            'actual': round(off_peak_mean, 1),
            'target': '45-70%',
            'status': verdict(40 <= off_peak_mean <= 75)
        }

        # One summary line per check
        if verbose:
            for check_name, result in report.items():
                status = result['status']
                status_symbol = '✓' if status == 'PASS' else '✗'
                print(f"{status_symbol} {check_name}: {result['actual']} (target: {result['target']}) → {status}")

        return report

# Run every validation check against the generated dataset
validator = StreeSaktiDataValidator(parameters)
validation = validator.validate_dataset(df, verbose=True)

# Headline figures for a quick sanity check
print("\n" + "=" * 60, "QUICK STATISTICS", "=" * 60, sep="\n")
print(f"Total trips: {len(df):,}")
print(f"Female passengers: {df['passenger_gender'].eq('F').sum():,} ({df['passenger_gender'].eq('F').sum()/len(df)*100:.1f}%)")
print(f"Stree Shakti beneficiaries: {df['passenger_type'].eq('Stree_Shakti').sum():,}")
print(f"Total revenue loss: ₹{df['revenue_loss'].sum():,.0f}")
print(f"Average revenue loss per trip: ₹{df['revenue_loss'].mean():.0f}")

# Overall verdict: a single non-PASS check is enough to raise the warning
if any(v['status'] != 'PASS' for v in validation.values()):
    print("\n⚠ Some validations failed - review above")
else:
    print("\n✓✓✓ ALL VALIDATIONS PASSED! ✓✓✓")



VALIDATING SYNTHETIC DATA

✓ female_percentage: 65.1 (target: 65.0) → PASS
✓ stree_shakti_percentage: 60.0 (target: 60.0) → PASS
✓ urban_routes: 50.0 (target: 50.0) → PASS
✓ peri-urban_routes: 35.0 (target: 35.0) → PASS
✓ rural_routes: 15.0 (target: 15.0) → PASS
✓ peak_hour_occupancy: 94.7 (target: 90-100%) → PASS
✓ off_peak_occupancy: 57.5 (target: 45-70%) → PASS

QUICK STATISTICS
Total trips: 1,000,000
Female passengers: 650,862 (65.1%)
Stree Shakti beneficiaries: 390,434
Total revenue loss: ₹15,405,856
Average revenue loss per trip: ₹15

✓✓✓ ALL VALIDATIONS PASSED! ✓✓✓


In [14]:
# Make sure the output directory exists (no error if it is already there)
output_dir = Path('../data/synthetic')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the dataset as CSV, without the index column
csv_path = output_dir / 'stree_shakti_trips_raw.csv'
df.to_csv(csv_path, index=False)
print(f"✓ Data saved to: {csv_path}")

# Describe this run: when it happened, what was produced, and with which settings
metadata = dict(
    generation_date=datetime.now().isoformat(),
    total_records=len(df),
    date_range=dict(
        start=str(df['date'].min()),
        end=str(df['date'].max()),
    ),
    parameters_used=parameters,
    validation_results=validation,
)

# default=str stringifies any value the JSON encoder cannot handle natively
metadata_path = output_dir / 'generation_metadata.json'
with metadata_path.open('w') as f:
    json.dump(metadata, f, indent=2, default=str)

print(f"✓ Metadata saved to: {metadata_path}")
print(
    "\n✓✓✓ DATA GENERATION COMPLETE ✓✓✓",
    f"Files saved to: {output_dir}",
    sep="\n",
)


✓ Data saved to: ..\data\synthetic\stree_shakti_trips_raw.csv
✓ Metadata saved to: ..\data\synthetic\generation_metadata.json

✓✓✓ DATA GENERATION COMPLETE ✓✓✓
Files saved to: ..\data\synthetic
