In [6]:
import pandas as pd
import sqlite3
import os

In [7]:
# File locations: input CSV and output SQLite database.
# Both are relative paths, so they resolve against the kernel's working directory.
CSV_PATH = "../data/SuperstoreData.csv"
DB_PATH = "../sql/superstore.db"

In [8]:
# Load the Superstore CSV (decoded as latin1) into `df`; cleaning happens in the next cell.
df = pd.read_csv(
    CSV_PATH,
    encoding="latin1",
)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [9]:
# Coerce the numeric measure columns that are present; unparseable values become NaN.
numeric_columns = [name for name in ("Sales", "Profit", "Discount", "Quantity") if name in df.columns]
for name in numeric_columns:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Parse both date columns as month/day/year; values that do not match become NaT.
for date_column in ("Order Date", "Ship Date"):
    df[date_column] = pd.to_datetime(df[date_column], format="%m/%d/%Y", errors="coerce")

# Keep only rows with a usable order date and sales amount, then derive the
# calendar fields and the order-to-ship lag (in whole days) from the parsed dates.
df = (
    df.dropna(subset=["Order Date", "Sales"])
    .assign(
        OrderYear=lambda frame: frame["Order Date"].dt.year,
        OrderMonth=lambda frame: frame["Order Date"].dt.month,
        # Year-month label, e.g. 2017-05
        OrderYearMonth=lambda frame: frame["Order Date"].dt.to_period("M").astype(str),
        ShipDays=lambda frame: (frame["Ship Date"] - frame["Order Date"]).dt.days,
    )
)

In [10]:
# Build the SQLite database: the main `orders` table, lookup indexes, and analytics views.

# sqlite3.connect() does not create missing parent directories, so make sure
# the target folder exists before opening the database file.
db_dir = os.path.dirname(DB_PATH)
if db_dir:
    os.makedirs(db_dir, exist_ok=True)

# Start from a fresh file so nothing from a previous run survives.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

# Indexes on the columns the views group by. Identifiers that contain a space
# are double-quoted (standard SQL); single quotes denote string literals.
index_statements = (
    'CREATE INDEX IF NOT EXISTS idx_orders_yearmonth ON orders(OrderYearMonth);',
    'CREATE INDEX IF NOT EXISTS idx_orders_customer  ON orders("Customer Name");',
    'CREATE INDEX IF NOT EXISTS idx_orders_region    ON orders(Region);',
    'CREATE INDEX IF NOT EXISTS idx_orders_category  ON orders(Category);',
)

# Analytics views.
# The `orders` table holds one row per order line (the same "Order ID" can
# appear on several rows), so order-level counts use COUNT(DISTINCT "Order ID");
# COUNT(*) would count line items instead. v_profit_vs_discount deliberately
# keeps COUNT(*) because its column is a row count.
# v_loss_orders_by_category counts orders that contain at least one
# loss-making line (Profit < 0) in the category.
views_sql = """
CREATE VIEW IF NOT EXISTS v_monthly_kpis AS
SELECT OrderYearMonth AS year_month,
       SUM(Sales)      AS total_sales,
       SUM(Profit)     AS total_profit,
       COUNT(DISTINCT "Order ID") AS total_orders,
       AVG(Discount)   AS avg_discount,
       AVG(ShipDays)   AS avg_ship_days
FROM orders
GROUP BY OrderYearMonth
ORDER BY OrderYearMonth;

CREATE VIEW IF NOT EXISTS v_top_customers AS
SELECT "Customer Name" AS customer_name,
       SUM(Sales)      AS total_sales,
       SUM(Profit)     AS total_profit,
       COUNT(DISTINCT "Order ID") AS order_count
FROM orders
GROUP BY "Customer Name"
ORDER BY total_sales DESC;

CREATE VIEW IF NOT EXISTS v_profit_vs_discount AS
SELECT Discount    AS discount_level,
       AVG(Profit) AS avg_profit,
       COUNT(*)    AS row_count
FROM orders
GROUP BY Discount
ORDER BY Discount;

CREATE VIEW IF NOT EXISTS v_region_performance AS
SELECT Region,
       SUM(Sales)  AS total_sales,
       SUM(Profit) AS total_profit,
       COUNT(DISTINCT "Order ID") AS order_count
FROM orders
GROUP BY Region
ORDER BY total_sales DESC;

CREATE VIEW IF NOT EXISTS v_loss_orders_by_category AS
SELECT Category,
       COUNT(DISTINCT "Order ID") AS loss_orders
FROM orders
WHERE Profit < 0
GROUP BY Category
ORDER BY loss_orders DESC;

CREATE VIEW IF NOT EXISTS v_product_profitability AS
SELECT "Product ID"   AS product_id,
       "Product Name" AS product_name,
       Category,
       "Sub-Category" AS sub_category,
       SUM(Sales)     AS total_sales,
       SUM(Profit)    AS total_profit,
       AVG(Discount)  AS avg_discount,
       COUNT(DISTINCT "Order ID") AS order_count
FROM orders
GROUP BY "Product ID","Product Name",Category,"Sub-Category"
ORDER BY total_profit DESC;

CREATE VIEW IF NOT EXISTS v_segment_region_matrix AS
SELECT Segment,
       Region,
       SUM(Sales)  AS total_sales,
       SUM(Profit) AS total_profit,
       COUNT(DISTINCT "Order ID") AS total_orders
FROM orders
GROUP BY Segment, Region
ORDER BY total_sales DESC;
"""

conn = sqlite3.connect(DB_PATH)
try:
    # Main table: one row per cleaned CSV record.
    df.to_sql("orders", conn, if_exists="replace", index=False)

    # `with conn:` commits on success and rolls back on error; it does not
    # close the connection, which is why the enclosing try/finally exists.
    with conn:
        for statement in index_statements:
            conn.execute(statement)

    with conn:
        conn.executescript(views_sql)
finally:
    # Always release the file handle, even if one of the steps above failed.
    conn.close()

print("✅ Built superstore.db with analytics views.")


✅ Built superstore.db with analytics views.
