In [1]:
# data_generation.py
"""
Generates a synthetic dataset 'resqfood_listings.csv' with donation listings.
Run: python data_generation.py
"""
import numpy as np
import pandas as pd
import datetime
import random

RANDOM_SEED = 42
# Seed both generators at module level too, so anything executed after this
# code still starts from a reproducible global RNG state.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

N = 5000  # number of listings
N_DONORS = 800  # size of the donor pool that listings are drawn from

# donors (restaurants/societies)
donor_types = ['Restaurant', 'Bakery', 'Caterer', 'HomeCook', 'Society', 'Event']
food_types = ['Cooked Meal', 'Fruits', 'Salads', 'Baked Goods', 'Desserts', 'Beverage']
units = ['kg', 'boxes', 'plates', 'packs']
cities = ['Bengaluru', 'Mumbai', 'Delhi', 'Hyderabad', 'Chennai', 'Pune']
packaging = ['Airtight', 'Open Tray', 'Covered Tray', 'Sealed Box', 'Cloth Wrap']
donor_reliability = ['Low', 'Medium', 'High']  # past history

base_time = datetime.datetime(2025, 9, 1, 7, 0, 0)

# Sampling weights, aligned index-by-index with the category lists above.
DONOR_TYPE_WEIGHTS = [0.35, 0.1, 0.1, 0.2, 0.15, 0.1]
FOOD_TYPE_WEIGHTS = [0.5, 0.1, 0.08, 0.15, 0.1, 0.07]
PACKAGING_WEIGHTS = [0.3, 0.2, 0.25, 0.18, 0.07]
DONOR_RELIABILITY_WEIGHTS = [0.25, 0.45, 0.3]

# Multipliers applied to the time-based freshness (1.0 means no penalty).
# Kept at module level so they are built once, not once per listing.
PACKAGING_FACTOR = {
    'Airtight': 1.0,
    'Sealed Box': 0.95,
    'Covered Tray': 0.85,
    'Open Tray': 0.6,
    'Cloth Wrap': 0.5,
}
FOOD_TYPE_FACTOR = {
    'Cooked Meal': 0.9,
    'Fruits': 0.95,
    'Salads': 0.7,
    'Baked Goods': 0.85,
    'Desserts': 0.9,
    'Beverage': 0.95,
}
DONOR_RELIABILITY_FACTOR = {'High': 1.0, 'Medium': 0.9, 'Low': 0.75}
# Simulated NGO coverage per city; feeds the pickup probability.
NGO_DENSITY = {
    'Bengaluru': 0.9,
    'Mumbai': 0.95,
    'Delhi': 0.9,
    'Hyderabad': 0.7,
    'Chennai': 0.6,
    'Pune': 0.65,
}

# Output schema, in CSV column order. Passing it to the DataFrame constructor
# keeps the columns present even when zero listings are requested.
LISTING_COLUMNS = [
    'listing_id',
    'donor_id',
    'donor_type',
    'food_type',
    'quantity',
    'unit',
    'city',
    'packaging',
    'time_posted',
    'time_since_cooked_mins',
    'donor_reliability',
    'distance_km',
    'ambient_temp_c',
    'freshness_score',
    'pickup_prob',
    'picked_up',
]


def _build_donor_profiles(n_donors):
    """Draw one fixed (donor_type, city, reliability) profile per donor id."""
    kinds = np.random.choice(donor_types, size=n_donors, p=DONOR_TYPE_WEIGHTS)
    homes = np.random.choice(cities, size=n_donors)
    histories = np.random.choice(
        donor_reliability, size=n_donors, p=DONOR_RELIABILITY_WEIGHTS
    )
    return {
        f"D{k:04d}": (str(kind), str(home), str(hist))
        for k, (kind, home, hist) in enumerate(zip(kinds, homes, histories), start=1)
    }


def generate_listings(n=N, seed=RANDOM_SEED, n_donors=N_DONORS):
    """Build a DataFrame of synthetic food-donation listings.

    Donor-level attributes (donor type, city, reliability) are drawn once per
    donor and reused by every listing of that donor, so a given ``donor_id``
    never shows contradictory attributes across rows.

    Args:
        n: Number of listings to generate; must be >= 0.
        seed: Seed applied to both ``numpy.random`` and ``random`` before
            generation. Pass ``None`` to keep the current global RNG state.
        n_donors: Number of distinct donors in the pool; must be >= 1.

    Returns:
        A DataFrame with ``n`` rows and the columns in ``LISTING_COLUMNS``.

    Raises:
        ValueError: If ``n`` is negative or ``n_donors`` is less than 1.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    if n_donors < 1:
        raise ValueError(f"n_donors must be at least 1, got {n_donors}")

    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    donor_profiles = _build_donor_profiles(n_donors)

    rows = []
    for i in range(n):
        listing_id = f"L{i+1:05d}"
        donor_id = f"D{random.randint(1, n_donors):04d}"
        # Donor type, city and history belong to the donor, not the listing.
        donor_type, city, donor_hist = donor_profiles[donor_id]

        food_type = str(np.random.choice(food_types, p=FOOD_TYPE_WEIGHTS))
        # Exponential quantities, floored at 0.5 so no listing is empty.
        quantity = max(0.5, float(np.round(np.random.exponential(scale=5.0), 1)))
        unit = str(np.random.choice(units))
        packaging_type = str(np.random.choice(packaging, p=PACKAGING_WEIGHTS))
        time_posted = base_time + datetime.timedelta(
            minutes=int(np.random.exponential(scale=3000))
        )

        # Minutes since the food was cooked; draws below 5 minutes are
        # replaced by a uniform value in [5, 40].
        time_since_cooked = int(np.random.normal(loc=90, scale=60))
        if time_since_cooked < 5:
            time_since_cooked = random.randint(5, 40)

        distance_km = np.clip(np.random.normal(loc=5, scale=4), 0.2, 40)
        # rough ambient temperature
        temp_c = np.clip(
            np.random.normal(loc=30 - 0.5 * (time_since_cooked / 60), scale=3),
            5,
            60,
        )

        # Construct freshness_score (synthetic ground truth for modeling):
        # linear decay to zero over 6 hours, scaled by packaging, food type,
        # donor reliability and distance, plus a little Gaussian noise.
        base_freshness = max(0, 1 - (time_since_cooked / 360))
        donor_reliability_factor = DONOR_RELIABILITY_FACTOR[donor_hist]
        distance_factor = max(0.2, 1 - (distance_km / 50))
        freshness_score = np.clip(
            base_freshness
            * PACKAGING_FACTOR[packaging_type]
            * FOOD_TYPE_FACTOR[food_type]
            * donor_reliability_factor
            * distance_factor
            + np.random.normal(0, 0.03),
            0,
            1,
        )

        # Pickup probability: driven by freshness, quantity, donor
        # reliability, city NGO density (simulated) and hour of posting.
        # Distance only acts indirectly, through freshness_score.
        qty_factor = min(1, quantity / 20)
        hour_factor = 0.9 if 8 <= time_posted.hour <= 20 else 0.6
        pickup_prob = np.clip(
            0.15
            + 0.6 * freshness_score
            + 0.1 * qty_factor
            + 0.05 * donor_reliability_factor
            + 0.05 * NGO_DENSITY[city]
            + 0.05 * hour_factor
            + np.random.normal(0, 0.05),
            0,
            1,
        )
        picked_up = np.random.rand() < pickup_prob

        rows.append({
            'listing_id': listing_id,
            'donor_id': donor_id,
            'donor_type': donor_type,
            'food_type': food_type,
            'quantity': quantity,
            'unit': unit,
            'city': city,
            'packaging': packaging_type,
            'time_posted': time_posted.strftime("%Y-%m-%d %H:%M:%S"),
            'time_since_cooked_mins': time_since_cooked,
            'donor_reliability': donor_hist,
            'distance_km': round(float(distance_km), 2),
            'ambient_temp_c': round(float(temp_c), 1),
            'freshness_score': round(float(freshness_score), 3),
            'pickup_prob': round(float(pickup_prob), 3),
            'picked_up': int(picked_up),
        })

    return pd.DataFrame(rows, columns=LISTING_COLUMNS)


# Runs for `python data_generation.py` and inside a notebook cell (where
# __name__ is also "__main__"), but not when this code is merely imported.
if __name__ == "__main__":
    df = generate_listings(N, seed=RANDOM_SEED)
    df.to_csv("resqfood_listings.csv", index=False)
    print("Generated resqfood_listings.csv with", len(df), "rows")


Generated resqfood_listings.csv with 5000 rows
