In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Fix both random sources used in this notebook -- the standard-library
# `random` module and NumPy's global generator -- so repeated runs from a
# fresh kernel produce the same synthetic data.
random.seed(42)
np.random.seed(42)


In [5]:
# -----------------------------
# Master Data
# -----------------------------
# Identifier lists are consecutive integers starting at 1.
customers = [*range(1, 1001)]  # 1,000 customer IDs
products = [*range(1, 121)]    # 120 product IDs
stores = [*range(1, 21)]       # 20 store IDs

# Daily calendar from 2021-01-01 through 2024-12-31, both ends included.
dates = pd.date_range(start="2021-01-01", end="2024-12-31", freq="D")


In [7]:
# -----------------------------
# Customers
# -----------------------------
# One row per customer ID. The random columns are sized from the master
# list instead of a hard-coded 1000, so changing `customers` cannot cause a
# length-mismatch error when the frame is built.
n_customers = len(customers)
customers_df = pd.DataFrame({
    "customer_id": customers,
    "gender": np.random.choice(["Male", "Female"], size=n_customers),
    # randint's upper bound is exclusive, so ages fall in 18..64.
    "age": np.random.randint(18, 65, size=n_customers),
    "city": np.random.choice(["Mumbai", "Delhi", "Pune", "Bangalore"], size=n_customers)
})


In [9]:
# -----------------------------
# Products
# -----------------------------
# One row per product ID with a randomly assigned category. The sample size
# follows the master list instead of a hard-coded 120, so the two can never
# drift apart.
n_products = len(products)
products_df = pd.DataFrame({
    "product_id": products,
    "category": np.random.choice(
        ["Electronics", "Clothing", "Grocery", "Home"], size=n_products
    )
})

In [11]:
# -----------------------------
# Stores
# -----------------------------
# One row per store ID with a random channel and region. Sample sizes are
# taken from the master list instead of a hard-coded 20, so the two can
# never drift apart.
n_stores = len(stores)
stores_df = pd.DataFrame({
    "store_id": stores,
    "store_type": np.random.choice(["Online", "Offline"], size=n_stores),
    "region": np.random.choice(["North", "South", "East", "West"], size=n_stores)
})

In [13]:
# -----------------------------
# Sales Transactions (KEY PART)
# -----------------------------
# Number of orders per customer: 1-5, weighted toward fewer orders.
order_count_options = [1, 2, 3, 4, 5]
order_count_weights = [0.4, 0.25, 0.2, 0.1, 0.05]
customer_order_map = {
    cust: np.random.choice(order_count_options, p=order_count_weights)
    for cust in customers
}

# Repeat each customer ID once per order so a single flat loop can number
# the orders consecutively from 1.
customer_per_order = [
    cust
    for cust, n_orders in customer_order_map.items()
    for _ in range(n_orders)
]

discount_options = [0, 0.05, 0.1, 0.15]
sales = []
for order_id, customer_id in enumerate(customer_per_order, start=1):
    # The draws below must stay in this order: each one consumes the seeded
    # `random` stream, so reordering them would change the generated data.
    date = random.choice(dates)
    product_id = random.choice(products)
    store_id = random.choice(stores)
    quantity = random.randint(1, 5)
    unit_price = random.randint(200, 5000)
    discount = random.choice(discount_options)

    # Revenue is the discounted line total; it is rounded to 2 decimals
    # only when stored in the row.
    revenue = quantity * unit_price * (1 - discount)
    sales.append([
        order_id, date, customer_id, product_id, store_id,
        quantity, unit_price, discount, round(revenue, 2)
    ])

# Leave the counter at the next unused order number.
order_id = len(sales) + 1

sales_columns = [
    "order_id", "date", "customer_id", "product_id",
    "store_id", "quantity", "unit_price", "discount", "revenue"
]
sales_df = pd.DataFrame(sales, columns=sales_columns)


In [15]:
# Preview the first five sales rows.
sales_df.head(n=5)

Unnamed: 0,order_id,date,customer_id,product_id,store_id,quantity,unit_price,discount,revenue
0,1,2024-08-02,1,15,1,3,2206,0.05,6287.1
1,2,2021-10-13,2,95,4,5,912,0.15,3876.0
2,3,2021-03-07,2,4,3,2,2105,0.0,4210.0
3,4,2024-02-24,3,26,18,4,2005,0.15,6817.0
4,5,2024-04-21,3,36,1,2,3662,0.1,6591.6


In [17]:
# Preview the first five store rows.
stores_df.head(n=5)

Unnamed: 0,store_id,store_type,region
0,1,Online,East
1,2,Offline,South
2,3,Offline,West
3,4,Offline,East
4,5,Online,North


In [19]:
# Preview the first five product rows.
products_df.head(n=5)

Unnamed: 0,product_id,category
0,1,Grocery
1,2,Clothing
2,3,Grocery
3,4,Clothing
4,5,Clothing


In [21]:
# Preview the first five customer rows.
customers_df.head(n=5)

Unnamed: 0,customer_id,gender,age,city
0,1,Male,34,Pune
1,2,Female,26,Delhi
2,3,Male,50,Mumbai
3,4,Male,37,Bangalore
4,5,Male,30,Delhi


In [23]:
# -----------------------------
# SAVE RAW FILES
# -----------------------------
# Output folder, stated once instead of being repeated in every call.
# NOTE: this is a machine-specific absolute path; point it at your own
# project checkout when running the notebook elsewhere.
RAW_DATA_DIR = Path(
    "C:/Users/Abhi/Desktop/Power_BI_Data_Analysis_Project/"
    "retail-demand-customer-analytics/data/raw"
)
# to_csv does not create missing folders, so make sure the target exists.
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

# File name -> frame, written in the same order as before.
raw_frames = {
    "customers_raw.csv": customers_df,
    "products_raw.csv": products_df,
    "stores_raw.csv": stores_df,
    "sales_raw.csv": sales_df,
}
for file_name, frame in raw_frames.items():
    # index=False keeps the positional row index out of the CSV.
    frame.to_csv(RAW_DATA_DIR / file_name, index=False)