In [1]:
# Step 1: Data understanding and profiling in Jupyter

import pandas as pd
import numpy as np

# 1) Load CSVs (use exact filenames as attached)
orders_path = "List of Orders.csv"
details_path = "Order Details.csv"
targets_path = "Sales target.csv"

orders = pd.read_csv(orders_path)
details = pd.read_csv(details_path)
targets = pd.read_csv(targets_path)

# Standardize column names (strip spaces, underscores)
orders.columns = [c.strip().replace(" ", "_") for c in orders.columns]
details.columns = [c.strip().replace(" ", "_") for c in details.columns]
targets.columns = [c.strip().replace(" ", "_") for c in targets.columns]

orders.head(), details.head(), targets.head()


(  Order_ID  Order_Date CustomerName           State       City
 0  B-25601  01-04-2018       Bharat         Gujarat  Ahmedabad
 1  B-25602  01-04-2018        Pearl     Maharashtra       Pune
 2  B-25603  03-04-2018        Jahan  Madhya Pradesh     Bhopal
 3  B-25604  03-04-2018       Divsha       Rajasthan     Jaipur
 4  B-25605  05-04-2018      Kasheen     West Bengal    Kolkata,
   Order_ID  Amount  Profit  Quantity     Category      Sub-Category
 0  B-25601  1275.0 -1148.0         7    Furniture         Bookcases
 1  B-25601    66.0   -12.0         5     Clothing             Stole
 2  B-25601     8.0    -2.0         3     Clothing       Hankerchief
 3  B-25601    80.0   -56.0         4  Electronics  Electronic Games
 4  B-25602   168.0  -111.0         2  Electronics            Phones,
   Month_of_Order_Date   Category   Target
 0              Apr-18  Furniture  10400.0
 1              May-18  Furniture  10500.0
 2              Jun-18  Furniture  10600.0
 3              Jul-18  Furn

In [2]:
# 2) Basic info and schema types
print("Orders info:")
print(orders.info())
print("\nOrder Details info:")
print(details.info())
print("\nSales Targets info:")
print(targets.info())


Orders info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order_ID      500 non-null    object
 1   Order_Date    500 non-null    object
 2   CustomerName  500 non-null    object
 3   State         500 non-null    object
 4   City          500 non-null    object
dtypes: object(5)
memory usage: 22.0+ KB
None

Order Details info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Order_ID      1500 non-null   object 
 1   Amount        1500 non-null   float64
 2   Profit        1500 non-null   float64
 3   Quantity      1500 non-null   int64  
 4   Category      1500 non-null   object 
 5   Sub-Category  1500 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 70.4+ KB
None

Sal

In [3]:
# 3) Parse dates and numeric fields
# Orders: Order_Date looks like dd-mm-YYYY or similar; coerce to datetime
orders["Order_Date"] = pd.to_datetime(orders["Order_Date"], dayfirst=True, errors="coerce")

# Details: Amount, Profit, Quantity numeric
for col in ["Amount", "Profit", "Quantity"]:
    # Some CSVs may store numbers as strings; coerce
    details[col] = pd.to_numeric(details[col], errors="coerce")

# Targets: Month_of_Order_Date like "Apr-18"; parse to period-month
# Create a Date column as end of month for easier joins
targets.rename(columns={"Month_of_Order_Date": "Month"}, inplace=True)
if "Month" not in targets.columns:
    # fallback if header appears slightly different in the file
    # e.g., "Month of Order Date"
    for c in targets.columns:
        if c.lower().startswith("month"):
            targets.rename(columns={c: "Month"}, inplace=True)
            break

targets["Target"] = pd.to_numeric(targets["Target"], errors="coerce")
targets["Month_Period"] = pd.to_datetime(targets["Month"], format="%b-%y", errors="coerce")
targets["Month_End"] = targets["Month_Period"] + pd.offsets.MonthEnd(0)

orders["Month_End"] = orders["Order_Date"] + pd.offsets.MonthEnd(0)


In [4]:
# 4) Quick ERD sketch (as text)
erd = """
Entities:
- Orders (Order_ID, Order_Date, CustomerName, State, City)
- Order_Details (Order_ID, Amount, Profit, Quantity, Category, Sub-Category)
- Sales_Targets (Month, Category, Target)

Relationships:
- Orders 1..* ↔ Order_Details many-to-1 via Order_ID
- Sales_Targets links by Month_End (from Orders) and Category
"""
print(erd)



Entities:
- Orders (Order_ID, Order_Date, CustomerName, State, City)
- Order_Details (Order_ID, Amount, Profit, Quantity, Category, Sub-Category)
- Sales_Targets (Month, Category, Target)

Relationships:
- Orders 1..* ↔ Order_Details many-to-1 via Order_ID
- Sales_Targets links by Month_End (from Orders) and Category



In [5]:
# 5) Row counts and duplicates
summary = {}

summary["orders_rows"] = len(orders)
summary["details_rows"] = len(details)
summary["targets_rows"] = len(targets)

summary["orders_orderid_nulls"] = orders["Order_ID"].isna().sum()
summary["details_orderid_nulls"] = details["Order_ID"].isna().sum()

summary["orders_duplicates_Order_ID"] = orders["Order_ID"].duplicated().sum()  # should be 0 if one row per order
summary["details_duplicates_full_row"] = details.duplicated().sum()

summary


{'orders_rows': 560,
 'details_rows': 1500,
 'targets_rows': 36,
 'orders_orderid_nulls': 60,
 'details_orderid_nulls': 0,
 'orders_duplicates_Order_ID': 59,
 'details_duplicates_full_row': 0}

In [6]:
# 6) Date ranges and missing checks
date_profile = {
    "orders_min_date": orders["Order_Date"].min(),
    "orders_max_date": orders["Order_Date"].max(),
    "orders_date_nulls": orders["Order_Date"].isna().sum(),
}

num_nulls_details = details[["Amount","Profit","Quantity"]].isna().sum().to_dict()

date_profile, num_nulls_details


({'orders_min_date': Timestamp('2018-04-01 00:00:00'),
  'orders_max_date': Timestamp('2019-03-31 00:00:00'),
  'orders_date_nulls': 60},
 {'Amount': 0, 'Profit': 0, 'Quantity': 0})

In [7]:
# 7) Key distributions and categorical summaries
cat_summary = {
    "details_categories": details["Category"].value_counts(dropna=False).to_dict(),
    "details_subcategories_top10": details["Sub-Category"].value_counts().head(10).to_dict(),
    "orders_states_top10": orders["State"].value_counts().head(10).to_dict(),
    "orders_cities_top10": orders["City"].value_counts().head(10).to_dict(),
}
cat_summary


{'details_categories': {'Clothing': 949, 'Electronics': 308, 'Furniture': 243},
 'details_subcategories_top10': {'Saree': 210,
  'Hankerchief': 198,
  'Stole': 192,
  'Phones': 83,
  'Bookcases': 79,
  'Electronic Games': 79,
  'T-shirt': 77,
  'Printers': 74,
  'Chairs': 74,
  'Furnishings': 73},
 'orders_states_top10': {'Madhya Pradesh': 101,
  'Maharashtra': 90,
  'Rajasthan': 32,
  'Gujarat': 27,
  'Punjab': 25,
  'Uttar Pradesh': 22,
  'Delhi': 22,
  'West Bengal': 22,
  'Karnataka': 21,
  'Kerala ': 16},
 'orders_cities_top10': {'Indore': 76,
  'Mumbai': 68,
  'Chandigarh': 30,
  'Delhi': 25,
  'Bhopal': 22,
  'Kolkata': 22,
  'Pune': 22,
  'Bangalore': 21,
  'Jaipur': 19,
  'Ahmedabad': 17}}

In [8]:
# 8) Join sanity checks: how many details match an order?
matched = details["Order_ID"].isin(orders["Order_ID"]).sum()
unmatched = len(details) - matched

matched, unmatched, round(100*matched/len(details),2)


(1500, 0, 100.0)

In [9]:
# 9) Basic KPI checks
# Order-level aggregates from details
agg_details = details.groupby("Order_ID", as_index=False).agg(
    lines=("Order_ID","size"),
    revenue=("Amount","sum"),
    profit=("Profit","sum"),
    items=("Quantity","sum")
)

# Join to orders to attach date and region
orders_agg = orders[["Order_ID","Order_Date","Month_End","State","City"]].merge(
    agg_details, on="Order_ID", how="left"
)

# Simple sanity: totals and timeframe coverage
kpis = {
    "total_revenue": float(agg_details["revenue"].sum()),
    "total_profit": float(agg_details["profit"].sum()),
    "total_orders": int(orders["Order_ID"].nunique()),
    "orders_with_lines": int(orders_agg["lines"].notna().sum()),
    "orders_no_lines": int(orders_agg["lines"].isna().sum()),
    "min_order_date": str(orders["Order_Date"].min().date()) if pd.notna(orders["Order_Date"].min()) else None,
    "max_order_date": str(orders["Order_Date"].max().date()) if pd.notna(orders["Order_Date"].max()) else None,
}
kpis


{'total_revenue': 431502.0,
 'total_profit': 23955.0,
 'total_orders': 500,
 'orders_with_lines': 500,
 'orders_no_lines': 60,
 'min_order_date': '2018-04-01',
 'max_order_date': '2019-03-31'}

In [10]:
# 10) Monthly revenue vs target readiness (aggregates only, no chart)
monthly_rev = orders_agg.groupby(["Month_End"], as_index=False)["revenue"].sum()

# Align to category-level by month for future target comparisons
details_with_month = details.merge(orders[["Order_ID","Month_End"]], on="Order_ID", how="left")
monthly_cat_rev = details_with_month.groupby(["Month_End","Category"], as_index=False)["Amount"].sum().rename(columns={"Amount":"Revenue"})

# Prepare targets keyed by Month_End and Category
targets_k = targets.rename(columns={"Category":"Category", "Target":"Target"})[["Month_End","Category","Target"]]

monthly_cat_ready = monthly_cat_rev.merge(targets_k, on=["Month_End","Category"], how="left")
monthly_cat_ready.head()


Unnamed: 0,Month_End,Category,Revenue,Target
0,2018-04-30,Clothing,13478.0,12000.0
1,2018-04-30,Electronics,11127.0,9000.0
2,2018-04-30,Furniture,8121.0,10400.0
3,2018-05-31,Clothing,9518.0,12000.0
4,2018-05-31,Electronics,12807.0,9000.0


In [11]:
# 11) Save lightweight profiling outputs (CSV) for documentation
orders_profile = orders.describe(include="all").transpose()
details_profile = details.describe(include="all").transpose()
targets_profile = targets.describe(include="all").transpose()

orders_profile.to_csv("docs_orders_profile.csv", index=True)
details_profile.to_csv("docs_details_profile.csv", index=True)
targets_profile.to_csv("docs_targets_profile.csv", index=True)

monthly_rev.to_csv("docs_monthly_revenue.csv", index=False)
monthly_cat_ready.to_csv("docs_monthly_category_revenue_target.csv", index=False)

print("Saved profiling and aggregates CSVs.")


Saved profiling and aggregates CSVs.
