In [1]:
# Cell 1 — Connect to SQLite database
# sqlite3 is built into Python — no install needed
# Think of inventory.db as a file that acts like a mini-database

import sqlite3
import pandas as pd
import os

# Open the SQLite file that sits one directory above this notebook.
# sqlite3.connect() creates inventory.db on the spot when it is not there yet.
conn = sqlite3.connect("../inventory.db")
print("✅ Connected to inventory.db")

✅ Connected to inventory.db


In [2]:
# Cell 2 — Import every CSV export into SQLite, one table per file
# DataFrame.to_sql() options that matter here:
#   if_exists='replace' → drop and rebuild the table when it is already there (handy while developing)
#   if_exists='append'  → would add the rows to the existing table instead (suited to incremental loads)
#   index=False         → keep the DataFrame's row index out of the table

# Target table name → path of the CSV that feeds it
tables = dict(
    purchase_prices="../data/purchase_prices.csv",
    purchases="../data/purchases.csv",
    vendor_invoice="../data/vendor_invoice.csv",
    sales="../data/sales.csv",
)

for table_name, filepath in tables.items():
    df = pd.read_csv(filepath, low_memory=False)
    df.to_sql(name=table_name, con=conn, index=False, if_exists='replace')
    print(f"  ✅ {table_name:20s} → {len(df):>7,} rows inserted")

# Make sure the writes are persisted before any later cell reads them back
conn.commit()
print("\nAll tables loaded into database ✅")

  ✅ purchase_prices      →      15 rows inserted
  ✅ purchases            →  40,000 rows inserted
  ✅ vendor_invoice       →  40,000 rows inserted
  ✅ sales                → 100,000 rows inserted

All tables loaded into database ✅


In [3]:
# Cell 3 — Sanity check: which tables does the database hold right now?
# SQLite keeps its schema catalogue in the built-in sqlite_master table

list_tables_sql = """
    SELECT name, type 
    FROM sqlite_master 
    WHERE type = 'table'
    ORDER BY name
"""
tables_in_db = pd.read_sql(list_tables_sql, conn)

# Heading and result go out in a single call, one per line
print("Tables in inventory.db:", tables_in_db, sep="\n")

Tables in inventory.db:
                   name   type
0       purchase_prices  table
1             purchases  table
2                 sales  table
3        vendor_invoice  table
4  vendor_sales_summary  table


In [4]:
# Cell 4 — First look at every table: total row count plus a five-row preview
# Getting familiar with the data comes before any transformation

table_names = ["purchase_prices", "purchases", "vendor_invoice", "sales"]

for table in table_names:
    # Row count comes back as a one-row frame with a single `total` column
    count = pd.read_sql(f"SELECT COUNT(*) as total FROM {table}", conn)

    # Three-line banner: rule, table name with formatted row count, rule
    print(
        f"\n{'='*55}",
        f"TABLE: {table.upper()}  |  Rows: {count['total'][0]:,}",
        '='*55,
        sep="\n",
    )

    # to_string() prints every column instead of eliding the middle ones
    sample = pd.read_sql(f"SELECT * FROM {table} LIMIT 5", conn)
    print(sample.to_string())


TABLE: PURCHASE_PRICES  |  Rows: 15
   Brand             Description  Size  Volume  Classification  PurchasePrice  VendorNumber      VendorName
0   5001  Samsung 55inch QLED TV  Unit   750.0               1          38000          1001   Samsung India
1   5002      Samsung Galaxy S24  Unit   750.0               1          45000          1001   Samsung India
2   5003    LG Front Load Washer  Unit   750.0               2          22000          1002  LG Electronics
3   5004       LG OLED 65inch TV  Unit   750.0               2          55000          1002  LG Electronics
4   5005   Sony Bravia 4K 50inch  Unit   750.0               3          32000          1003      Sony India

TABLE: PURCHASES  |  Rows: 40,000
  InventoryId  Store  Brand               Description  Size  VendorNumber      VendorName  PONumber      PODate ReceivingDate InvoiceDate     PayDate  PurchasePrice  Quantity  Dollars  Classification
0      5-5001      5   5001    Samsung 55inch QLED TV  Unit          1001   Sams

In [5]:
# Cell 5 — SQL basics: SELECT, WHERE, GROUP BY, ORDER BY
# Most day-to-day analyst queries are assembled from these four clauses

# 5a: plain SELECT — pick a handful of columns and cap the result at 10 rows
print("--- 5a: Basic SELECT ---")
basic_select_sql = """
    SELECT VendorNumber, VendorName, Brand, PurchasePrice
    FROM purchases
    LIMIT 10
"""
df = pd.read_sql(basic_select_sql, conn)
print(df)

--- 5a: Basic SELECT ---
   VendorNumber      VendorName  Brand  PurchasePrice
0          1001   Samsung India   5001          38000
1          1006   Havells India   5009           1800
2          1006   Havells India   5009           1800
3          1002  LG Electronics   5004          55000
4          1009   Blue Star Ltd   5012           8500
5          1001   Samsung India   5002          45000
6          1001   Samsung India   5001          38000
7          1001   Samsung India   5014          35000
8          1002  LG Electronics   5004          55000
9          1008      Voltas Ltd   5011          26000


In [6]:
# 5b: WHERE — keep only the rows that satisfy a condition
# The condition below drops the zero-price rows (data errors seeded on purpose)
print("--- 5b: WHERE filter ---")
priced_rows_sql = """
    SELECT VendorNumber, Brand, PurchasePrice, Quantity, Dollars
    FROM purchases
    WHERE PurchasePrice > 0
    LIMIT 10
"""
df = pd.read_sql(priced_rows_sql, conn)
print(df)

# Count the rows whose PurchasePrice is exactly 0
zero_price_sql = "SELECT COUNT(*) as cnt FROM purchases WHERE PurchasePrice = 0"
zero_count = pd.read_sql(zero_price_sql, conn)
print(f"\nZero-price records to filter: {zero_count['cnt'][0]:,}")

--- 5b: WHERE filter ---
   VendorNumber  Brand  PurchasePrice  Quantity  Dollars
0          1001   5001          38000        10   380000
1          1006   5009           1800        21    37800
2          1006   5009           1800         3     5400
3          1002   5004          55000        21  1155000
4          1009   5012           8500        13   110500
5          1001   5002          45000         6   270000
6          1001   5001          38000        20   760000
7          1001   5014          35000        14   490000
8          1002   5004          55000         5   275000
9          1008   5011          26000         8   208000

Zero-price records to filter: 1,955


In [7]:
# 5c: GROUP BY with aggregates
# Collapses the purchase rows into one summary row per vendor
# Aggregate functions available: SUM(), AVG(), COUNT(), MIN(), MAX()

print("--- 5c: GROUP BY — total purchases per vendor ---")
vendor_totals_sql = """
    SELECT 
        VendorNumber,
        VendorName,
        COUNT(*)           AS total_orders,
        SUM(Quantity)      AS total_units,
        SUM(Dollars)       AS total_spend,
        AVG(PurchasePrice) AS avg_price
    FROM purchases
    WHERE PurchasePrice > 0
    GROUP BY VendorNumber, VendorName
    ORDER BY total_spend DESC
"""
df = pd.read_sql(vendor_totals_sql, conn)

# to_string() prints the whole frame without eliding rows or columns
print(df.to_string())

--- 5c: GROUP BY — total purchases per vendor ---
   VendorNumber         VendorName  total_orders  total_units  total_spend     avg_price
0          1001      Samsung India          7645       106044   4169655000  39316.939176
1          1002     LG Electronics          7657       107461   3405718000  31717.252188
2          1003         Sony India          4908        68235   1375008000  20244.498778
3          1008         Voltas Ltd          2496        34564    898664000  26000.000000
4          1007    Whirlpool India          2553        35819    859656000  24000.000000
5          1004         Bosch Home          2620        36983    665694000  18000.000000
6          1009      Blue Star Ltd          2470        34550    293675000   8500.000000
7          1010  Godrej Appliances          2580        36263    199446500   5500.000000
8          1005      Philips India          2613        36760    165420000   4500.000000
9          1006      Havells India          2503        3495

In [8]:
# 5d: JOIN — pull columns from two tables into one result
# purchases is matched to purchase_prices on Brand so that Volume can be
# shown next to the per-product quantity and dollar totals

print("--- 5d: JOIN two tables ---")
purchases_with_volume_sql = """
    SELECT 
        p.VendorNumber,
        p.VendorName,
        p.Brand,
        p.Description,
        pp.Volume,
        SUM(p.Quantity) AS TotalQty,
        SUM(p.Dollars)  AS TotalDollars
    FROM purchases p
    JOIN purchase_prices pp ON p.Brand = pp.Brand
    WHERE p.PurchasePrice > 0
    GROUP BY p.VendorNumber, p.VendorName, p.Brand, p.Description, pp.Volume
    ORDER BY TotalDollars DESC
    LIMIT 10
"""
df = pd.read_sql(purchases_with_volume_sql, conn)

# Top 10 groups by total dollars, printed in full
print(df.to_string())

--- 5d: JOIN two tables ---
   VendorNumber       VendorName  Brand             Description  Volume  TotalQty  TotalDollars
0          1002   LG Electronics   5004       LG OLED 65inch TV   750.0     35928    1976040000
1          1001    Samsung India   5002      Samsung Galaxy S24   750.0     35172    1582740000
2          1001    Samsung India   5001  Samsung 55inch QLED TV   750.0     35465    1347670000
3          1001    Samsung India   5014   Samsung Galaxy Tab S9   750.0     35407    1239245000
4          1003       Sony India   5005   Sony Bravia 4K 50inch   750.0     34547    1105504000
5          1008       Voltas Ltd   5011    Voltas 1.5T Split AC   750.0     34564     898664000
6          1007  Whirlpool India   5010  Whirlpool Refrigerator   750.0     35819     859656000
7          1002   LG Electronics   5003    LG Front Load Washer   750.0     35521     781462000
8          1004       Bosch Home   5007        Bosch Dishwasher   750.0     36983     665694000
9          1