In [9]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Seed the stdlib RNG so that re-running the cell reproduces the same dataset
random.seed(42)

# Product catalogue as (name, MRP, category) rows. The dict built from it keeps
# this insertion order, which matters because products are later drawn by
# position from the list of keys.
_catalogue = [
    ("Sugar (1kg)", 58, "Grocery"),
    ("Saffola Oil (1L)", 176, "Grocery"),
    ("Rice (1kg)", 100, "Grocery"),
    ("Wheat (1kg)", 80, "Grocery"),
    ("Dal (500g)", 90, "Grocery"),
    ("Atta (1kg)", 70, "Grocery"),
    ("Salt (500g)", 20, "Grocery"),
    ("Tea (250g)", 150, "Grocery"),
    ("Coffee (200g)", 200, "Grocery"),
    ("Tomatoes (1kg)", 47, "Vegetables"),
    ("Parle-G", 45, "Biscuits"),
    ("Amul Milk (1L)", 56, "Dairy"),
    ("Apples (1kg)", 157, "Fruits"),
    ("Bananas (1 Dozen)", 61, "Fruits"),
    ("Bingo Mad Angles", 20, "Snacks"),
    ("Govardhan Paneer (200g)", 109, "Dairy"),
    ("Oreo Biscuit", 99, "Biscuits"),
    ("Oranges (1kg)", 87, "Fruits"),
]

# Map each product name to its fixed MRP and category
products = {
    name: {"MRP": mrp, "Category": category}
    for name, mrp, category in _catalogue
}

# Inclusive (min, max) shelf life in days for each category
shelf_life_ranges = dict([
    ("Grocery", (180, 365)),    # roughly 6 to 12 months
    ("Vegetables", (5, 10)),    # perishable
    ("Biscuits", (60, 180)),    # 2 to 6 months
    ("Dairy", (5, 15)),         # perishable
    ("Fruits", (7, 14)),        # perishable
    ("Snacks", (30, 180)),      # 1 to 6 months
])

# Value pools sampled uniformly for the categorical columns
locations = [
    "Baner",
    "Aundh",
    "Kothrud",
    "Viman Nagar",
    "Wakad",
]
festive_options = [
    "None",
    "Exam Season",
    "Summer",
    "Holi",
    "Diwali",
    "Monsoon",
    "Winter",
]
sentiments = [
    "Positive",
    "Negative",
    "Neutral",
]

# Function to generate a random datetime between two datetime objects
def random_order_time(start, end):
    """Return a random datetime in the window starting at ``start``.

    The result is ``start`` plus a whole number of seconds drawn uniformly
    from ``[0, int((end - start).total_seconds()))``, so ``end`` itself is
    never returned for a window of one second or more.

    Args:
        start: Beginning of the window (datetime).
        end: End of the window (datetime); must not be earlier than ``start``.

    Returns:
        A datetime within the window. If the window is shorter than one
        second (including ``start == end``), ``start`` is returned.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    if end < start:
        raise ValueError("end must not be earlier than start")

    span_seconds = int((end - start).total_seconds())
    # randrange(0) raises "empty range"; a zero-length window has exactly one
    # valid answer, so return it without consuming any randomness.
    if span_seconds == 0:
        return start

    return start + timedelta(seconds=random.randrange(span_seconds))

# Order timestamps are drawn from calendar year 2024:
# from midnight on 1 January up to 23:59:59 on 31 December.
start_time = datetime(year=2024, month=1, day=1)
end_time = datetime(year=2024, month=12, day=31, hour=23, minute=59, second=59)

# How many synthetic rows to generate
n_entries = 10_000

# Helper hoisted out of the row loop: it used to be re-defined as a closure on
# every iteration. It now receives the values it depends on explicitly.
def generate_platform_price(discounted_price, mrp):
    """Return one platform's selling price for a product.

    The price is ``discounted_price`` shifted by a uniform random amount in
    [-1, 1], clamped to the range [0, mrp - 0.01] so it always stays below
    MRP, and rounded to 2 decimal places.
    """
    price = discounted_price + random.uniform(-1, 1)
    # Ensure price is within [0, MRP)
    price = max(0, min(price, mrp - 0.01))
    return round(price, 2)


# Product names never change during generation, so build the list once instead
# of on every iteration.
product_names = list(products.keys())

# Generate n_entries rows. The order of the random draws below is kept stable
# so that a fixed seed always yields the same dataset.
data = []
for _ in range(n_entries):
    # Randomly select a product; its MRP and category remain constant for this row
    product = random.choice(product_names)
    MRP = products[product]["MRP"]
    category = products[product]["Category"]

    # Assign random location and festive impact
    location = random.choice(locations)
    festive = random.choice(festive_options)

    # Random discount (5% to 20%) consistently applied to MRP
    discount_pct = random.randint(5, 20)
    discounted_price = MRP * (1 - discount_pct / 100)

    # Each platform gets its own price near the discounted price, below MRP
    blinkit_price = generate_platform_price(discounted_price, MRP)
    zepto_price = generate_platform_price(discounted_price, MRP)
    instamart_price = generate_platform_price(discounted_price, MRP)

    # Margin (%)
    margin_pct = random.randint(10, 30)

    # Delivery distance in km, one decimal place
    delivery_distance = round(random.uniform(1.0, 5.0), 1)

    # Final price as an average of the three platform prices
    final_price = round((blinkit_price + zepto_price + instamart_price) / 3, 2)

    # Random sentiment
    sentiment = random.choice(sentiments)

    # Shelf life drawn from the inclusive (min, max) range of the product's category
    shelf_life = random.randint(*shelf_life_ranges[category])

    # Stock bounds: max_stock is always strictly greater than min_stock
    min_stock = random.randint(10, 50)
    max_stock = random.randint(min_stock + 1, 200)

    # Random order time within the configured window, stored as a formatted string
    order_time = random_order_time(start_time, end_time).strftime("%Y-%m-%d %H:%M:%S")

    # Build the data row
    row = {
        "Product Name": product,
        "Category": category,
        "Location": location,
        "MRP": MRP,
        "Blinkit Price": blinkit_price,
        "Zepto Price": zepto_price,
        "Instamart Price": instamart_price,
        "Discount (%)": discount_pct,
        "Margin (%)": margin_pct,
        "Festive/Seasonal Impact": festive,
        "Delivery Distance (km)": delivery_distance,
        "Shelf Life (days)": shelf_life,
        "Min Stock": min_stock,
        "Max Stock": max_stock,
        "Final Price": final_price,
        "Customer Sentiment": sentiment,
        "Time of Ordering": order_time
    }
    data.append(row)

# Create a DataFrame from the generated rows
df = pd.DataFrame(data)

# The summary printed below states that every platform price is below MRP.
# Verify that before writing the file, so a regression in the generator cannot
# silently produce a dataset that contradicts the message.
# An empty frame has no columns to check, so it is skipped.
if not df.empty:
    price_columns = ["Blinkit Price", "Zepto Price", "Instamart Price"]
    # Row-wise comparison of each price column against that row's MRP
    assert df[price_columns].lt(df["MRP"], axis=0).all().all(), \
        "found a platform price that is not below MRP"

# Export to CSV without the index column
df.to_csv("synthetic_dataset.csv", index=False)

print("Dataset generated with", len(df), "entries.")
print("All platform prices are below MRP, a visible discount is applied, additional fields (Margin, Shelf Life, Min/Max Stock, Time of Ordering) have been added.")


Dataset generated with 10000 entries.
All platform prices are below MRP, a visible discount is applied, additional fields (Margin, Shelf Life, Min/Max Stock, Time of Ordering) have been added.
