In [1]:
!pip install faker niafaker

Collecting faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Collecting niafaker
  Downloading niafaker-0.1.1-py3-none-any.whl.metadata (2.2 kB)
Collecting setuptools (from niafaker)
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m3.2 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading niafaker-0.1.1-py3-none-any.whl (19 kB)
Using cached setuptools-80.9.0-py3-none-any.whl (1.2 MB)
Installing collected packages: setuptools, faker, niafaker
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [niafaker]1/3[0m [faker]
[1A[2KSuccessfully installed faker-38.2.0 niafaker-0.1.1 setuptools-80.9.0


In [None]:
from faker import Faker
import niafaker
import pandas as pd
import random
import numpy as np
from datetime import date
import os
from pathlib import Path

# Resolve the project root. `__file__` only exists when this code runs as a
# script/module; inside a Jupyter kernel it is undefined, so fall back to the
# current working directory instead of raising NameError.
try:
    PROJECT_ROOT = Path(__file__).resolve().parent   # folder containing your .py files
except NameError:
    PROJECT_ROOT = Path.cwd()

RAW_DATA_DIR = PROJECT_ROOT / "raw_data"

# Seed Faker, the stdlib RNG and NumPy so a fresh-kernel run is repeatable.
# NOTE(review): niafaker's random source is not visible here — confirm whether
# it needs its own seed.
fake = Faker()
Faker.seed(42)
random.seed(42)
# DataFrame.sample() draws from NumPy's global RNG when no random_state is given.
np.random.seed(42)

# -----------------------------
# 1. Generate 62 unique businesses
# -----------------------------
business_types = ["Bakery", "Restaurant", "Fast Food", "Cafe"]

# One record per business; ids run BIZ-1000 .. BIZ-1061.
# Dict values are evaluated top to bottom, so the order of random draws per
# record is: company name, type, address, contact name, phone.
business_list = [
    {
        "business_id": f"BIZ-{1000 + offset}",
        "business_name": fake.company(),
        "business_type": random.choice(business_types),
        # The generated address may span several lines; flatten it to one line.
        "business_address": niafaker.generate_address('Nigeria', 'Abuja').replace("\n", ", "),
        "contact_name": niafaker.generate_name('Nigeria'),
        # "080" prefix followed by 8 random digits.
        "contact_phone": "080" + str(random.randint(10000000, 99999999)),
    }
    for offset in range(62)
]

# Tabular view of the same business records
business_df = pd.DataFrame(business_list)

# -----------------------------
# 2. Other helper lists
# -----------------------------
# Fixed vocabularies the order generator picks from.
flour_types = ["All-purpose", "Bread Flour", "Whole Wheat", "Pastry Flour"]
payment_methods = ["Bank Transfer", "Cash", "POS"]
order_statuses = ["Delivered", "Pending", "Cancelled"]

# Each rider is a (name, phone) tuple; the phone is "080" plus 8 random digits,
# drawn once here so a rider keeps the same number on every order.
riders = [
    (rider_name, "080" + str(random.randint(10000000, 99999999)))
    for rider_name in ("Emeka John", "Aisha Bello", "Tunde Oladipo", "Grace Onyema")
]

# -----------------------------
# 3. Generate 10,000 orders (Oct 2025)
# -----------------------------
# The order window never changes, so build it once rather than per iteration.
start_date = date(2025, 10, 1)
end_date = date(2025, 10, 31)

data = []

for i in range(10000):
    order_date = fake.date_between(start_date=start_date, end_date=end_date)
    delivery_date = order_date  # same-day delivery

    # Pick one of the 62 businesses straight from the record list. This uses
    # the stdlib RNG seeded above, whereas DataFrame.sample() goes through
    # NumPy's RNG and builds a one-row DataFrame for every order.
    biz = random.choice(business_list)

    quantity = random.randint(1, 40)
    price_per_bag = random.choice([9500, 9800, 10000, 10500])
    total_amount = quantity * price_per_bag

    rider = random.choice(riders)  # (name, phone) tuple

    data.append({
        # fake.unique guarantees no order id is issued twice by this Faker instance
        "order_id": f"ORD-{fake.unique.random_int(100000, 999999)}",
        "order_date": str(order_date),
        "delivery_date": str(delivery_date),

        # Business details (repeated for multiple orders)
        "business_id": biz["business_id"],
        "business_name": biz["business_name"],
        "business_type": biz["business_type"],
        "business_address": biz["business_address"],
        "contact_name": biz["contact_name"],
        "contact_phone": biz["contact_phone"],

        # Numeric fields are stored as text on purpose: every column is a string.
        "flour_type": random.choice(flour_types),
        "quantity_bags": str(quantity),
        "price_per_bag": str(price_per_bag),
        "total_amount": str(total_amount),
        "payment_method": random.choice(payment_methods),
        "order_status": random.choice(order_statuses),
        "rider_name": rider[0],
        "rider_phone": rider[1],
    })

df = pd.DataFrame(data)

# -----------------------------
# 4. Ensure all columns are strings
# -----------------------------
# Cast BEFORE injecting nulls: astype(str) applied afterwards would turn each
# missing value into literal text, leaving no real nulls in the CSV.
df = df.astype(str)

# -----------------------------
# 5. Introduce NULLS randomly (6 columns)
# -----------------------------
columns_with_nulls = random.sample(list(df.columns), 6)

for col in columns_with_nulls:
    # Blank out 5-10% of the rows in this column. The row pick is driven by a
    # random_state drawn from the seeded stdlib RNG so it is repeatable.
    null_indices = df.sample(
        frac=random.uniform(0.05, 0.10),
        random_state=random.randrange(2**32),
    ).index
    df.loc[null_indices, col] = None

# -----------------------------
# 6. Save CSV
# -----------------------------
# to_csv does not create missing folders, so make sure the target exists.
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(RAW_DATA_DIR/ "flour4four_orders_oct2025.csv", index=False)

# Preview the first rows only rather than rendering the whole frame.
df.head()