In [1]:
# Cell 0 — Environment check: test that all imports work
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os

# Confirmation banner, followed by the installed pandas and numpy versions
# (read from each package's __version__ attribute).
print("✅ All libraries imported successfully!")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")

✅ All libraries imported successfully!
pandas version: 2.2.2
numpy version: 1.26.4


In [3]:
# Cell 1 — Imports and setup
# pandas: works with tables of data (like Excel but in Python)
# numpy: math operations, random numbers
# random: built-in Python randomness
# os: interact with your file system (create folders, check paths)
# datetime: work with dates

import pandas as pd
import numpy as np
import random
import os
from datetime import date, timedelta

# Set a "seed" so random numbers are the same every time you run
# Without this, you'd get different data each run — bad for reproducibility
# One named constant feeds both generators, so changing the seed in a single
# place keeps NumPy and the built-in `random` module in sync.
SEED = 42
np.random.seed(SEED)   # NumPy's global random state
random.seed(SEED)      # Python's built-in `random` module

print("Libraries loaded ✅")

Libraries loaded ✅


In [9]:
# Vendor master list. Each entry is a (vendor_number, vendor_name) pair;
# the table-generating functions in this notebook resolve a brand's vendor
# name by matching on vendor_number.
VENDORS = [
    (1001, "Samsung India"),
    (1002, "LG Electronics"),
    (1003, "Sony India"),
    (1004, "Bosch Home"),
    (1005, "Philips India"),
    (1006, "Havells India"),
    (1007, "Whirlpool India"),
    (1008, "Voltas Ltd"),
    (1009, "Blue Star Ltd"),
    (1010, "Godrej Appliances"),
]

# Product catalogue. Each entry is a 7-tuple:
#   (brand_id, description, vendor_number, size,
#    purchase_price, sales_price, classification)
# - vendor_number refers to an entry in VENDORS.
# - purchase_price is the price we pay; sales_price is the listed selling price.
# - In this data, classification always equals vendor_number - 1000.
BRANDS = [
    (5001, "Samsung 55inch QLED TV",   1001, "Unit", 38000, 65000, 1),
    (5002, "Samsung Galaxy S24",       1001, "Unit", 45000, 79999, 1),
    (5003, "LG Front Load Washer",     1002, "Unit", 22000, 38999, 2),
    (5004, "LG OLED 65inch TV",        1002, "Unit", 55000, 95000, 2),
    (5005, "Sony Bravia 4K 50inch",    1003, "Unit", 32000, 54999, 3),
    (5006, "Sony WH1000XM5 Headphone", 1003, "Unit",  8000, 16999, 3),
    (5007, "Bosch Dishwasher",         1004, "Unit", 18000, 32999, 4),
    (5008, "Philips Air Fryer",        1005, "Unit",  4500,  8999, 5),
    (5009, "Havells Ceiling Fan",      1006, "Unit",  1800,  3499, 6),
    (5010, "Whirlpool Refrigerator",   1007, "Unit", 24000, 42999, 7),
    (5011, "Voltas 1.5T Split AC",     1008, "Unit", 26000, 44999, 8),
    (5012, "Blue Star Water Purifier", 1009, "Unit",  8500, 15999, 9),
    (5013, "Godrej Microwave",         1010, "Unit",  5500, 10999, 10),
    (5014, "Samsung Galaxy Tab S9",    1001, "Unit", 35000, 59999, 1),
    (5015, "LG Window AC 1T",          1002, "Unit", 18000, 31999, 2),
]

# Store identifiers: the integers 1 through 10.
STORES     = list(range(1, 11))
# Date window (calendar year 2024) that random_date() samples from by default.
START_DATE = date(2024, 1, 1)
END_DATE   = date(2024, 12, 31)

# Sanity check: report how many entries each lookup list holds.
print(f"Vendors defined: {len(VENDORS)}")
print(f"Brands defined:  {len(BRANDS)}")
print(f"Stores:          {len(STORES)}")

Vendors defined: 10
Brands defined:  15
Stores:          10


In [11]:
# Cell 3 — Helper function: generate a random date between two dates
# We use this everywhere to stamp purchases and sales with realistic dates

def random_date(start=None, end=None):
    """Return a random date between start and end (both inclusive).

    Parameters
    ----------
    start : datetime.date, optional
        First selectable date. When omitted, the module-level START_DATE
        is used, looked up at call time.
    end : datetime.date, optional
        Last selectable date. When omitted, the module-level END_DATE
        is used, looked up at call time.

    Returns
    -------
    datetime.date
        `start` plus a uniformly drawn whole number of days.

    Raises
    ------
    ValueError
        If `end` is earlier than `start`.
    """
    # Resolve the defaults on every call instead of in the signature:
    # signature defaults are frozen when the `def` cell runs, so editing
    # START_DATE / END_DATE and re-running only the constants cell would
    # otherwise be silently ignored.
    if start is None:
        start = START_DATE
    if end is None:
        end = END_DATE
    if end < start:
        raise ValueError(f"end date {end} is earlier than start date {start}")

    days_between = (end - start).days        # length of the range in days
    # randint includes both bounds, so both `start` and `end` can be returned.
    random_days = random.randint(0, days_between)
    return start + timedelta(days=random_days)

# Smoke test: draw two dates with the default range to confirm the helper runs
for _draw in range(2):
    print("Random date test:", random_date())

Random date test: 2024-11-23
Random date test: 2024-02-27


In [13]:
# Cell 4 — Generate Purchase Prices table
# This is a "master" / "lookup" table — one row per brand
# It stores the official purchase price for each brand

def make_purchase_prices():
    """Build the purchase-price lookup table: one row per entry in BRANDS.

    Returns
    -------
    pandas.DataFrame
        Columns: Brand, Description, Size, Volume, Classification,
        PurchasePrice, VendorNumber, VendorName. The columns are present
        even when BRANDS is empty.

    Raises
    ------
    ValueError
        If a brand references a vendor number that is not in VENDORS.
    """
    columns = [
        "Brand", "Description", "Size", "Volume", "Classification",
        "PurchasePrice", "VendorNumber", "VendorName",
    ]

    # Build the vendor lookup once instead of scanning VENDORS per brand.
    # setdefault keeps the FIRST name for a repeated vendor number, matching
    # what a first-match scan over VENDORS would return.
    vendor_names = {}
    for vendor in VENDORS:
        vendor_names.setdefault(vendor[0], vendor[1])

    rows = []
    for brand_id, desc, vendor_num, size, pp, _sp, cls in BRANDS:
        if vendor_num not in vendor_names:
            raise ValueError(
                f"Brand {brand_id} references unknown vendor number {vendor_num}"
            )
        rows.append({
            "Brand":          brand_id,
            "Description":    desc,
            "Size":           size,
            # Same constant for every brand (originally noted as
            # "standardized to ml equivalent").
            "Volume":         750.0,
            "Classification": cls,
            "PurchasePrice":  pp,          # price WE pay
            "VendorNumber":   vendor_num,
            "VendorName":     vendor_names[vendor_num],
        })
    # Passing `columns` fixes the column order and keeps the schema when
    # there are no rows.
    return pd.DataFrame(rows, columns=columns)

# Build the per-brand purchase price lookup table and report its dimensions.
purchase_prices_df = make_purchase_prices()
print(f"Purchase Prices table shape: {purchase_prices_df.shape}")
# Last expression of the cell, so the preview is rendered as the cell output.
purchase_prices_df.head()  # .head() shows first 5 rows — always check your data!

Purchase Prices table shape: (15, 8)


Unnamed: 0,Brand,Description,Size,Volume,Classification,PurchasePrice,VendorNumber,VendorName
0,5001,Samsung 55inch QLED TV,Unit,750.0,1,38000,1001,Samsung India
1,5002,Samsung Galaxy S24,Unit,750.0,1,45000,1001,Samsung India
2,5003,LG Front Load Washer,Unit,750.0,2,22000,1002,LG Electronics
3,5004,LG OLED 65inch TV,Unit,750.0,2,55000,1002,LG Electronics
4,5005,Sony Bravia 4K 50inch,Unit,750.0,3,32000,1003,Sony India


In [15]:
# Cell 5 — Generate Purchases table (the big transactional table)
# Every time a STORE orders from a VENDOR, that's one purchase record
# This mimics how real ERP systems like SAP store purchase orders

def make_purchases(n=40000):
    """Generate `n` synthetic purchase-order rows.

    Each row picks a random brand from BRANDS and a random store from STORES,
    a quantity between 3 and 25, and a chain of dates
    (PO -> receiving -> invoice -> payment). Roughly 5% of rows get a purchase
    price of 0 to simulate data-entry errors. PO numbers start at 10000 and
    increase by one per row.

    The order of random draws per row is fixed (brand, store, quantity,
    zero-price roll, PO date, then the three date offsets), so a given seed
    always produces the same table.

    Parameters
    ----------
    n : int, default 40000
        Number of purchase rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per purchase order. The 16 columns are present even when
        `n` is 0.

    Raises
    ------
    ValueError
        If the selected brand references a vendor number not in VENDORS.
    """
    columns = [
        "InventoryId", "Store", "Brand", "Description", "Size",
        "VendorNumber", "VendorName", "PONumber", "PODate", "ReceivingDate",
        "InvoiceDate", "PayDate", "PurchasePrice", "Quantity", "Dollars",
        "Classification",
    ]

    # Build the vendor lookup once; the loop below would otherwise rescan
    # VENDORS on every iteration. setdefault keeps the first name for a
    # repeated vendor number, matching a first-match scan.
    vendor_names = {}
    for vendor in VENDORS:
        vendor_names.setdefault(vendor[0], vendor[1])

    rows = []
    # Purchase order numbers: 10000, 10001, ... one per generated row.
    for po_number in range(10000, 10000 + n):
        # Pick a random brand and a random store for this purchase
        brand_id, desc, vendor_num, size, pp, _sp, cls = random.choice(BRANDS)
        store = random.choice(STORES)

        # Small quantity per order
        quantity = random.randint(3, 25)

        # ~5% of records have zero price — simulating data entry errors
        actual_price = pp if random.random() > 0.05 else 0

        # Dates flow logically: PO created → received → invoice → payment
        po_date        = random_date()
        receiving_date = po_date + timedelta(days=random.randint(2, 14))
        invoice_date   = receiving_date + timedelta(days=random.randint(1, 7))
        pay_date       = invoice_date + timedelta(days=random.randint(15, 45))

        if vendor_num not in vendor_names:
            raise ValueError(
                f"Brand {brand_id} references unknown vendor number {vendor_num}"
            )

        rows.append({
            "InventoryId":    f"{store}-{brand_id}",
            "Store":          store,
            "Brand":          brand_id,
            "Description":    desc,
            "Size":           size,
            "VendorNumber":   vendor_num,
            "VendorName":     vendor_names[vendor_num],
            "PONumber":       po_number,
            "PODate":         po_date,
            "ReceivingDate":  receiving_date,
            "InvoiceDate":    invoice_date,
            "PayDate":        pay_date,
            "PurchasePrice":  actual_price,
            "Quantity":       quantity,
            "Dollars":        round(actual_price * quantity, 2),
            "Classification": cls,
        })

    # Passing `columns` keeps the schema when no rows were generated, so
    # downstream column access still works for n == 0.
    return pd.DataFrame(rows, columns=columns)

# Generate the 40,000-row purchases table, then report its dimensions and how
# many rows carry a purchase price of 0 (the simulated data-entry errors).
purchases_df = make_purchases(40000)
print(f"Purchases table shape: {purchases_df.shape}")
print(f"\nZero-price records (data errors): {(purchases_df['PurchasePrice'] == 0).sum()}")
# Last expression of the cell: preview of the first rows.
purchases_df.head()

Purchases table shape: (40000, 16)

Zero-price records (data errors): 1955


Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,PONumber,PODate,ReceivingDate,InvoiceDate,PayDate,PurchasePrice,Quantity,Dollars,Classification
0,5-5001,5,5001,Samsung 55inch QLED TV,Unit,1001,Samsung India,10000,2024-02-22,2024-03-05,2024-03-11,2024-04-23,38000,10,380000,1
1,2-5009,2,5009,Havells Ceiling Fan,Unit,1006,Havells India,10001,2024-01-16,2024-01-19,2024-01-21,2024-02-12,1800,21,37800,6
2,10-5009,10,5009,Havells Ceiling Fan,Unit,1006,Havells India,10002,2024-11-28,2024-12-11,2024-12-16,2025-01-13,1800,3,5400,6
3,8-5004,8,5004,LG OLED 65inch TV,Unit,1002,LG Electronics,10003,2024-01-04,2024-01-18,2024-01-25,2024-02-14,55000,21,1155000,2
4,7-5012,7,5012,Blue Star Water Purifier,Unit,1009,Blue Star Ltd,10004,2024-04-20,2024-05-04,2024-05-07,2024-05-25,8500,13,110500,9


In [17]:
# Cell 6 — Generate Vendor Invoice table
# The vendor sends us an invoice for each purchase order
# IMPORTANT: This adds a "Freight" (shipping cost) column not in purchases

def make_vendor_invoice(purchases_df):
    """Build the vendor invoice table: one row per unique PONumber.

    For each distinct PONumber the first matching purchase row is kept, and
    a random freight charge between 15 and 200 (rounded to 2 decimals) is
    attached. Freight values are drawn one per invoice, in row order, from
    the `random` module, so a given seed reproduces the same table.

    Parameters
    ----------
    purchases_df : pandas.DataFrame
        Must contain the columns VendorNumber, VendorName, InvoiceDate,
        PONumber, PODate, Quantity and Dollars. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns VendorNumber, VendorName, InvoiceDate, PONumber, PODate,
        Quantity, Dollars, Freight, with a fresh 0..n-1 index. The columns
        are present even when `purchases_df` has no rows.
    """
    invoice_columns = [
        "VendorNumber", "VendorName", "InvoiceDate", "PONumber",
        "PODate", "Quantity", "Dollars",
    ]

    # One invoice row per unique PO number (first occurrence wins)
    unique_orders = purchases_df.drop_duplicates("PONumber")

    # Shipping cost per order: one draw per invoice, in row order.
    freight = [
        round(random.uniform(15, 200), 2) for _ in range(len(unique_orders))
    ]

    # Select the columns directly instead of rebuilding rows via iterrows():
    # this preserves the input dtypes and keeps the schema for empty input.
    invoices = unique_orders.loc[:, invoice_columns].reset_index(drop=True)
    return invoices.assign(Freight=pd.Series(freight, dtype="float64"))

# Derive the invoice table from the purchases generated above and report its
# dimensions.
vendor_invoice_df = make_vendor_invoice(purchases_df)
print(f"Vendor Invoice shape: {vendor_invoice_df.shape}")
# Last expression of the cell: preview of the first rows.
vendor_invoice_df.head()

Vendor Invoice shape: (40000, 8)


Unnamed: 0,VendorNumber,VendorName,InvoiceDate,PONumber,PODate,Quantity,Dollars,Freight
0,1001,Samsung India,2024-03-11,10000,2024-02-22,10,380000,138.07
1,1006,Havells India,2024-01-21,10001,2024-01-16,21,37800,100.33
2,1006,Havells India,2024-12-16,10002,2024-11-28,3,5400,77.81
3,1002,LG Electronics,2024-01-25,10003,2024-01-04,21,1155000,175.37
4,1009,Blue Star Ltd,2024-05-07,10004,2024-04-20,13,110500,88.1


In [19]:
# Cell 7 — Generate Sales table
# Every time a CUSTOMER buys from a STORE, that's one sales record
# We generate 2.5x as many sales records as purchase records (100,000 vs 40,000),
# and every listed sales price is above its purchase price, so sales exceed purchase cost

def make_sales(n=100000):
    """Generate `n` synthetic sales rows.

    Each row picks a random brand from BRANDS and a random store from STORES,
    a quantity between 1 and 12, a selling price equal to the brand's listed
    sales price scaled by a uniform factor in [0.92, 1.08] (rounded to
    2 decimals), and a random sales date from random_date().

    The order of random draws per row is fixed (brand, store, quantity,
    price factor, sales date), so a given seed always produces the same table.

    Parameters
    ----------
    n : int, default 100000
        Number of sales rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per sale. The 11 columns are present even when `n` is 0.

    Raises
    ------
    ValueError
        If the selected brand references a vendor number not in VENDORS.
    """
    columns = [
        "InventoryId", "Store", "Brand", "Description", "Size",
        "VendorNumber", "VendorName", "SalesQuantity", "SalesDollars",
        "SalesPrice", "SalesDate",
    ]

    # Build the vendor lookup once; the loop below would otherwise rescan
    # VENDORS on every iteration. setdefault keeps the first name for a
    # repeated vendor number, matching a first-match scan.
    vendor_names = {}
    for vendor in VENDORS:
        vendor_names.setdefault(vendor[0], vendor[1])

    rows = []
    for _ in range(n):
        brand_id, desc, vendor_num, size, _pp, sp, cls = random.choice(BRANDS)
        store = random.choice(STORES)

        qty = random.randint(1, 12)
        # Selling price varies ±8% around listed price
        actual_sp = round(sp * random.uniform(0.92, 1.08), 2)

        if vendor_num not in vendor_names:
            raise ValueError(
                f"Brand {brand_id} references unknown vendor number {vendor_num}"
            )

        # Drawn last so the per-row sequence of random calls stays fixed.
        sales_date = random_date()

        rows.append({
            "InventoryId":   f"{store}-{brand_id}",
            "Store":         store,
            "Brand":         brand_id,
            "Description":   desc,
            "Size":          size,
            "VendorNumber":  vendor_num,
            "VendorName":    vendor_names[vendor_num],
            "SalesQuantity": qty,
            "SalesDollars":  round(actual_sp * qty, 2),
            "SalesPrice":    actual_sp,
            "SalesDate":     sales_date,
        })

    # Passing `columns` keeps the schema when no rows were generated.
    return pd.DataFrame(rows, columns=columns)

# Generate the 100,000-row sales table and report its dimensions.
sales_df = make_sales(100000)
print(f"Sales table shape: {sales_df.shape}")
# Last expression of the cell: preview of the first rows.
sales_df.head()

Sales table shape: (100000, 11)


Unnamed: 0,InventoryId,Store,Brand,Description,Size,VendorNumber,VendorName,SalesQuantity,SalesDollars,SalesPrice,SalesDate
0,3-5006,3,5006,Sony WH1000XM5 Headphone,Unit,1003,Sony India,12,194406.48,16200.54,2024-06-22
1,2-5006,2,5006,Sony WH1000XM5 Headphone,Unit,1003,Sony India,1,18340.4,18340.4,2024-06-07
2,1-5011,1,5011,Voltas 1.5T Split AC,Unit,1008,Voltas Ltd,4,169242.44,42310.61,2024-05-31
3,8-5012,8,5012,Blue Star Water Purifier,Unit,1009,Blue Star Ltd,11,186816.19,16983.29,2024-05-24
4,8-5013,8,5013,Godrej Microwave,Unit,1010,Godrej Appliances,9,92212.11,10245.79,2024-12-12
