In [None]:
# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy, missing-manager exceptions and a summary.

    Reads the "Enriched" sheet of ``input_file``, walks down the reporting
    lines starting at every row whose career band equals "MD", traces every
    non-MD employee upwards to find the first manager ID that has no employee
    row, and writes the sheets "Hierarchy Report", "Missing Managers" (only
    when non-empty) and a stacked "Summary" to ``output_file``.

    Args:
        input_file: Workbook that contains the "Enriched" sheet.
        output_file: Workbook to create.

    Returns:
        Tuple ``(hierarchy_df, exceptions_df)``.
    """

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def id_key(value):
        """Return a canonical string for an ID: "" for blanks, "123" for 123.0."""
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def recurse(manager_id, path, lineage=frozenset()):
        """Append every direct and indirect report of ``manager_id`` to ``hierarchy``.

        ``lineage`` holds the IDs already on the current path so that a
        circular reporting line cannot recurse forever.
        """
        lineage = lineage | {manager_id}
        manager_gcb = (
            df_lookup.loc[manager_id, GCB_COL]
            if manager_id in df_lookup.index else None
        )
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            if row[EMP_ID_COL] in lineage:
                continue  # circular reference back up the current path
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(row[EMP_ID_COL], record, lineage)

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_path = {"MD ID": md[EMP_ID_COL], "MD Name": md[EMP_NAME_COL]}

        # add MD itself
        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md[EMP_ID_COL],
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        # recurse into MD's reports
        recurse(md[EMP_ID_COL], md_path)

    # -------------------------------
    # Exceptions: trace up the chain until the first missing manager is found
    # -------------------------------
    # One lookup per employee ID (first row wins) instead of a frame scan per step.
    first_rows = enriched.drop_duplicates(subset=EMP_ID_COL)
    manager_of = dict(zip(first_rows[EMP_ID_COL], first_rows[MGR_ID_COL]))
    band_of = dict(zip(first_rows[EMP_ID_COL], first_rows[GCB_COL]))
    if MGR_NAME_COL in first_rows.columns:
        manager_name_of = dict(zip(first_rows[EMP_ID_COL], first_rows[MGR_NAME_COL]))
    else:
        manager_name_of = {}

    for _, row in enriched.iterrows():
        if row[GCB_COL] == "MD":
            continue  # MDs are top, never exceptions

        current_mgr_id = row[MGR_ID_COL]
        # Manager name as written on the row that references current_mgr_id.
        name_for_current = row.get(MGR_NAME_COL, None)
        missing_mgr_id = None
        missing_mgr_name = None
        visited = set()

        # climb upwards until we reach an MD, a blank manager or a missing manager
        while pd.notna(current_mgr_id):
            if current_mgr_id not in manager_of:
                missing_mgr_id = current_mgr_id
                missing_mgr_name = name_for_current
                break
            if current_mgr_id in visited or band_of[current_mgr_id] == "MD":
                break  # circular chain, or the chain is complete at an MD
            visited.add(current_mgr_id)

            # move to next manager in chain
            name_for_current = manager_name_of.get(current_mgr_id)
            current_mgr_id = manager_of[current_mgr_id]

        # "is not None" so that a falsy ID such as 0 is still reported
        if missing_mgr_id is not None:
            rec = row.to_dict()
            rec["Missing Manager ID"] = missing_mgr_id
            rec["Missing Manager Name"] = missing_mgr_name
            exceptions.append(rec)

    hierarchy_df = pd.DataFrame(hierarchy)
    if hierarchy_df.empty:
        # No MD rows: keep the expected columns so the summary code below
        # yields empty tables instead of raising KeyError.
        hierarchy_df = pd.DataFrame(columns=list(dict.fromkeys([
            *enriched.columns,
            "MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB",
        ])))
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    # overall counts
    summary = pd.DataFrame([{
        "Metric": "Total Employees", "Value": len(enriched)
    }, {
        "Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()
    }, {
        "Metric": "Missing Managers Count", "Value": len(exceptions_df)
    }])
    summary_tables.append(("Overall Summary", summary))

    # per-MD
    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # Employee-name map keyed by canonical ID string.
    manager_name_map = {}
    if EMP_ID_COL in enriched.columns and EMP_NAME_COL in enriched.columns:
        mgr_df = enriched[[EMP_ID_COL, EMP_NAME_COL]].drop_duplicates()
        manager_name_map = dict(zip(
            mgr_df[EMP_ID_COL].map(id_key),
            mgr_df[EMP_NAME_COL].astype(str).str.strip(),
        ))

    # Make sure Manager GCB column exists in hierarchy_df
    if "Manager GCB" not in hierarchy_df.columns:
        hierarchy_df["Manager GCB"] = pd.NA

    # Non-numeric bands (e.g. "MD") become NaN and never match 3 or 4.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Return the direct-report count per manager whose band equals ``band``."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        # "Manager ID" is float once the MD rows' None is mixed in; id_key
        # maps 5.0 back to "5" so it matches the employee-ID keys.
        counts["Manager ID"] = counts["Manager ID"].map(id_key)
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    summary_tables.append(("Direct Reports per GCB3", direct_reports_table(3)))
    summary_tables.append(("Direct Reports per GCB4", direct_reports_table(4)))

    # -------------------------------
    # WRITE OUTPUT (Hierarchy + Missing Managers + ONE Summary sheet stacked)
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Write all summaries stacked in one sheet with title rows
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2  # header row + data rows + one blank row

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [None]:
# -------------------------
# Phase 2: Build Hierarchy (updated missing-manager tracing)
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy with chain-aware missing-manager tracing.

    Reads the "Enriched" sheet of ``input_file``, normalizes the ID, name and
    career-band columns to strings, walks down from every "MD" row, records
    for each non-MD employee the first manager in the upward chain that has
    no employee row, and writes "Hierarchy Report", "Missing Managers" (only
    when non-empty) and a stacked "Summary" sheet to ``output_file``.

    Args:
        input_file: Workbook that contains the "Enriched" sheet.
        output_file: Workbook to create.

    Returns:
        Tuple ``(hierarchy_df, exceptions_df)``.
    """

    def as_text(value):
        """Return ``value`` as a stripped string: blanks -> "", 123.0 -> "123"."""
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def normalized(column):
        """Return ``column`` with every cell passed through ``as_text``."""
        # astype(str) keeps the .str accessor usable when the column is empty.
        return column.map(as_text).astype(str)

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    # NORMALIZE key columns to strings for reliable matching. A plain
    # astype(str) would turn blanks into "nan" and 123.0 into "123.0", which
    # never matches the employee ID "123".
    enriched[EMP_ID_COL] = normalized(enriched[EMP_ID_COL])
    if MGR_ID_COL not in enriched.columns:
        enriched[MGR_ID_COL] = ""
    else:
        enriched[MGR_ID_COL] = normalized(enriched[MGR_ID_COL])
    enriched[EMP_NAME_COL] = normalized(enriched[EMP_NAME_COL])
    if MGR_NAME_COL in enriched.columns:
        enriched[MGR_NAME_COL] = normalized(enriched[MGR_NAME_COL])
    if GCB_COL in enriched.columns:
        enriched[GCB_COL] = normalized(enriched[GCB_COL]).str.upper()

    # One row per employee ID so that .loc always returns a single row.
    df_lookup = enriched.drop_duplicates(subset=EMP_ID_COL).set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def recurse(manager_id, path, lineage=frozenset()):
        """Append every direct and indirect report of ``manager_id`` to ``hierarchy``.

        ``lineage`` holds the IDs already on the current path so that a
        circular reporting line cannot recurse forever.
        """
        manager_id = as_text(manager_id)
        if not manager_id:
            return  # a blank ID would match every employee without a manager
        lineage = lineage | {manager_id}
        manager_gcb = (
            df_lookup.loc[manager_id, GCB_COL]
            if manager_id in df_lookup.index else None
        )
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            if row[EMP_ID_COL] in lineage:
                continue  # circular reference back up the current path
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row.get(MGR_NAME_COL, ""),
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(row[EMP_ID_COL], record, lineage)

    # Find all MDs (normalized)
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_path = {"MD ID": md[EMP_ID_COL], "MD Name": md[EMP_NAME_COL]}

        # add MD itself (full columns)
        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md[EMP_ID_COL],
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": "",
            "Manager Name": "",
            "Manager GCB": "",
        })
        hierarchy.append(row_dict)

        # recurse into MD's reports
        recurse(md[EMP_ID_COL], md_path)

    # ------------------------------
    # Exceptions: improved missing-manager tracing
    # ------------------------------
    def find_first_missing_manager(emp_row):
        """Walk upwards from ``emp_row``'s manager.

        Returns ``(missing_id, missing_name, chain_str)`` for the first
        manager ID without an employee row (a circular chain is reported the
        same way, with an empty name), or ``(None, None, chain_str)`` when
        the chain reaches an MD or the employee has no manager.
        """
        visited = set()
        chain = []  # (id, name_or_empty) for every manager passed on the way up
        current_mgr = as_text(emp_row.get(MGR_ID_COL, ""))

        while current_mgr:
            # avoid infinite loops
            if current_mgr in visited:
                chain.append((current_mgr, "(circular)"))
                return current_mgr, "", " -> ".join(f"{i}({n})" for i, n in chain)
            visited.add(current_mgr)

            if current_mgr in df_lookup.index:
                mgr_row = df_lookup.loc[current_mgr]
                mgr_name = as_text(mgr_row.get(EMP_NAME_COL, ""))
                chain.append((current_mgr, mgr_name))
                # an MD closes the chain successfully
                if as_text(mgr_row.get(GCB_COL, "")).upper() == "MD":
                    return None, None, " -> ".join(f"{n or i}" for i, n in chain)
                # move up
                current_mgr = as_text(mgr_row.get(MGR_ID_COL, ""))
            else:
                # First manager without an employee row. Recover a name from
                # the rows that reference this ID, taking the most frequent
                # non-blank manager name.
                if MGR_NAME_COL in enriched.columns:
                    possible_names = enriched.loc[
                        enriched[MGR_ID_COL] == current_mgr, MGR_NAME_COL
                    ]
                    possible_names = possible_names[possible_names != ""]
                else:
                    possible_names = pd.Series([], dtype=object)
                missing_name = possible_names.mode().iloc[0] if not possible_names.empty else ""
                chain.append((current_mgr, missing_name))
                chain_str = " -> ".join(f"{n or i}" for i, n in chain) + " (MISSING)"
                return current_mgr, missing_name, chain_str

        # loop ended without reaching an MD or a missing manager
        return None, None, "No manager chain"

    for _, row in enriched.iterrows():
        if row[GCB_COL] == "MD":
            continue  # MDs are the roots of the hierarchy, never exceptions
        missing_id, missing_name, chain_str = find_first_missing_manager(row)
        if missing_id:  # found a missing manager somewhere above this employee
            row_dict = row.to_dict()
            row_dict["Missing Manager ID"] = missing_id
            row_dict["Missing Manager Name"] = missing_name
            row_dict["Missing Chain"] = chain_str
            exceptions.append(row_dict)

    hierarchy_df = pd.DataFrame(hierarchy)
    if hierarchy_df.empty:
        # No MD rows: keep the expected columns so the summary code below
        # yields empty tables instead of raising KeyError.
        hierarchy_df = pd.DataFrame(columns=list(dict.fromkeys([
            *enriched.columns,
            "MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB",
        ])))
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    # overall counts
    summary = pd.DataFrame([{
        "Metric": "Total Employees", "Value": len(enriched)
    }, {
        "Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()
    }, {
        "Metric": "Missing Managers Count", "Value": len(exceptions_df)
    }])
    summary_tables.append(("Overall Summary", summary))

    # per-MD
    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # Employee-name map; both columns were normalized above.
    mgr_df = enriched[[EMP_ID_COL, EMP_NAME_COL]].drop_duplicates()
    manager_name_map = dict(zip(mgr_df[EMP_ID_COL], mgr_df[EMP_NAME_COL]))

    # Ensure Manager GCB column exists
    if "Manager GCB" not in hierarchy_df.columns:
        hierarchy_df["Manager GCB"] = pd.NA

    # Non-numeric bands (e.g. "MD", "") become NaN and never match 3 or 4.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Return the direct-report count per manager whose band equals ``band``."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        counts["Manager ID"] = counts["Manager ID"].astype(str).str.strip()
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    summary_tables.append(("Direct Reports per GCB3", direct_reports_table(3)))
    summary_tables.append(("Direct Reports per GCB4", direct_reports_table(4)))

    # -------------------------------
    # WRITE OUTPUT
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Write summaries stacked in one sheet
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2  # header row + data rows + one blank row

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [None]:
# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy, missing-manager exceptions and a summary.

    Reads the "Enriched" sheet of ``input_file``, walks down the reporting
    lines starting at every row whose career band equals "MD", lists the
    non-MD employees whose manager ID is not an employee ID, and writes
    "Hierarchy Report", "Missing Managers" (only when non-empty) and a
    stacked "Summary" sheet to ``output_file``.

    Args:
        input_file: Workbook that contains the "Enriched" sheet.
        output_file: Workbook to create.

    Returns:
        Tuple ``(hierarchy_df, exceptions_df)``.
    """

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def id_key(value):
        """Return a canonical string for an ID: "" for blanks, "123" for 123.0."""
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def recurse(manager_id, path, lineage=frozenset()):
        """Append every direct and indirect report of ``manager_id`` to ``hierarchy``.

        ``lineage`` holds the IDs already on the current path so that a
        circular reporting line cannot recurse forever.
        """
        lineage = lineage | {manager_id}
        manager_gcb = (
            df_lookup.loc[manager_id, GCB_COL]
            if manager_id in df_lookup.index else None
        )
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            if row[EMP_ID_COL] in lineage:
                continue  # circular reference back up the current path
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(row[EMP_ID_COL], record, lineage)

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_path = {"MD ID": md[EMP_ID_COL], "MD Name": md[EMP_NAME_COL]}

        # add MD itself
        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md[EMP_ID_COL],
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        # recurse into MD's reports
        recurse(md[EMP_ID_COL], md_path)

    # Exceptions: employees whose manager ID not in employee list
    all_emp_ids = set(enriched[EMP_ID_COL])
    missing_mgr_ids = set(enriched[MGR_ID_COL]) - all_emp_ids
    for _, row in enriched[enriched[MGR_ID_COL].isin(missing_mgr_ids)].iterrows():
        if row[GCB_COL] != "MD":
            exceptions.append(row.to_dict())

    hierarchy_df = pd.DataFrame(hierarchy)
    if hierarchy_df.empty:
        # No MD rows: keep the expected columns so the summary code below
        # yields empty tables instead of raising KeyError.
        hierarchy_df = pd.DataFrame(columns=list(dict.fromkeys([
            *enriched.columns,
            "MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB",
        ])))
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    # overall counts
    summary = pd.DataFrame([{
        "Metric": "Total Employees", "Value": len(enriched)
    }, {
        "Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()
    }, {
        "Metric": "Missing Managers Count", "Value": len(exceptions_df)
    }])
    summary_tables.append(("Overall Summary", summary))

    # per-MD
    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # Employee-name map keyed by canonical ID string.
    manager_name_map = {}
    if EMP_ID_COL in enriched.columns and EMP_NAME_COL in enriched.columns:
        mgr_df = enriched[[EMP_ID_COL, EMP_NAME_COL]].drop_duplicates()
        manager_name_map = dict(zip(
            mgr_df[EMP_ID_COL].map(id_key),
            mgr_df[EMP_NAME_COL].astype(str).str.strip(),
        ))

    # Make sure Manager GCB column exists in hierarchy_df
    if "Manager GCB" not in hierarchy_df.columns:
        hierarchy_df["Manager GCB"] = pd.NA

    # Non-numeric bands (e.g. "MD") become NaN and never match 3 or 4.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Return the direct-report count per manager whose band equals ``band``."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        # "Manager ID" is float once the MD rows' None is mixed in; id_key
        # maps 5.0 back to "5" so it matches the employee-ID keys.
        counts["Manager ID"] = counts["Manager ID"].map(id_key)
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    summary_tables.append(("Direct Reports per GCB3", direct_reports_table(3)))
    summary_tables.append(("Direct Reports per GCB4", direct_reports_table(4)))

    # -------------------------------
    # WRITE OUTPUT (Hierarchy + Missing Managers + ONE Summary sheet stacked)
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Write all summaries stacked in one sheet with title rows
        start_row = 0
        for title, df in summary_tables:
            # header/title row
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2  # leave a gap

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [2]:
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_CONNECTOR, MSO_SHAPE
from pptx.enum.text import PP_ALIGN
from pptx.util import Inches, Pt

# Builds a single-slide "one pager": a title, three process-step boxes joined
# by connector lines, and a benefits box; then saves it next to the notebook.

# Create presentation
prs = Presentation()
slide_layout = prs.slide_layouts[6]  # blank slide
slide = prs.slides.add_slide(slide_layout)

# Title
left, top, width, height = Inches(0.5), Inches(0.2), Inches(9), Inches(1)
textbox = slide.shapes.add_textbox(left, top, width, height)
tf = textbox.text_frame
tf.text = "Cost Template Automation – Saving 144 Hours Annually"
p = tf.paragraphs[0]
p.font.size = Pt(28)
p.font.bold = True
p.font.color.rgb = RGBColor(0, 51, 102)

# Process flow boxes: (text, left edge in inches)
STEP_TOP_IN = 2
STEP_WIDTH_IN = 2.8
STEP_HEIGHT_IN = 1.5
process_steps = [
    ("Step 1: Data Preparation\nTool: CostTemplate_Data_Prep_Tool_KS", 0.5),
    ("Step 2: Report Generation\nTool: Cost_Templates_Gen_Tool_KS", 3.5),
    ("Step 3: Finalization\nTool: PasteAsValuesUtility", 6.5),
]

for text, left_in in process_steps:
    shape = slide.shapes.add_shape(
        MSO_SHAPE.ROUNDED_RECTANGLE,
        Inches(left_in), Inches(STEP_TOP_IN),
        Inches(STEP_WIDTH_IN), Inches(STEP_HEIGHT_IN),
    )
    shape.fill.solid()
    shape.fill.fore_color.rgb = RGBColor(91, 155, 213)
    shape.text = text
    for p in shape.text_frame.paragraphs:
        p.font.size = Pt(14)
        p.font.color.rgb = RGBColor(255, 255, 255)
        # The previous literal 1 is PP_ALIGN.LEFT; CENTER is what was intended.
        p.alignment = PP_ALIGN.CENTER

# Straight connector lines between consecutive steps, drawn at the vertical
# middle of the boxes from one box's right edge to the next box's left edge.
connector_y = Inches(STEP_TOP_IN + STEP_HEIGHT_IN / 2)
for (_, this_left), (_, next_left) in zip(process_steps, process_steps[1:]):
    slide.shapes.add_connector(
        MSO_CONNECTOR.STRAIGHT,
        Inches(this_left + STEP_WIDTH_IN), connector_y,
        Inches(next_left), connector_y,
    )

# Benefits box
shape = slide.shapes.add_shape(
    MSO_SHAPE.RECTANGLE,
    Inches(0.5), Inches(4), Inches(8.5), Inches(2)
)
shape.fill.solid()
shape.fill.fore_color.rgb = RGBColor(237, 125, 49)
shape.text = (
    "Benefits:\n"
    "• Saves 12 hours per month (~144 hours annually)\n"
    "• Eliminates manual TM1 refreshes & errors\n"
    "• Ensures consistent, validated dashboards\n"
    "• Ready-to-share lightweight reports"
)
for p in shape.text_frame.paragraphs:
    p.font.size = Pt(16)
    p.font.color.rgb = RGBColor(255, 255, 255)

# Save file
prs.save("Cost_Automation_OnePager.pptx")
print("File Saved")

File Saved


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
# Column headers shared by both phases (compared after header whitespace is stripped).
# Join key between the monthly file and the GHA sheet; also the hierarchy node ID.
EMP_ID_COL = "Employee ID"
# Selected from the GHA sheet in phase_one; used for the "MD Name" column in phase_two.
EMP_NAME_COL = "Employee Name"
# Selected from the GHA sheet in phase_one.
EMP_EMAIL_COL = "Employee Business Email Address"
# Links an employee row to its manager's EMP_ID_COL value.
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
# Copied into the "Manager Name" column of the hierarchy report.
MGR_NAME_COL = "Entity Manager Employee Name"
# Career band; rows whose value equals "MD" are the hierarchy roots.
GCB_COL = "Global Career Band"


# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase_one(gha_file, monthly_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    Left-joins the monthly rows with a subset of GHA columns on the employee
    ID, adds each manager's career band as "Manager GCB", and writes the
    result to the "Enriched" sheet of ``output_file``. Rows without a career
    band after the join are also written to "Missing in GHA" when any exist.

    Args:
        gha_file: Workbook with the sheet "Headcount - Employee Detail".
        monthly_file: Monthly CSV, read as ISO-8859-1.
        output_file: Workbook to create.

    Returns:
        The merged DataFrame.
    """

    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Select needed columns from GHA (add more here if needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    # Keep one GHA row per employee ID (the first). De-duplicating on whole
    # rows would leave repeated IDs behind, and each repeat would multiply
    # the monthly rows in the two left joins below.
    gha_subset = gha[gha_columns].drop_duplicates(subset=EMP_ID_COL)

    # Merge monthly + gha
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA")
    )

    # Add Manager GCB by merging again on Manager ID
    mgr_gcb = gha_subset[[EMP_ID_COL, GCB_COL]].rename(
        columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Manager GCB"}
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy, missing-manager exceptions and summaries.

    Reads the "Enriched" sheet of ``input_file``, walks down the reporting
    lines starting at every row whose career band equals "MD", lists the
    non-MD employees whose manager ID is not an employee ID, and writes
    "Hierarchy Report", "Missing Managers" (only when non-empty), "Summary",
    "MD Headcount", "GCB3 Reports" and "GCB4 Reports" to ``output_file``.

    Args:
        input_file: Workbook that contains the "Enriched" sheet.
        output_file: Workbook to create.

    Returns:
        Tuple ``(hierarchy_df, exceptions_df)``.
    """

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def recurse(manager_id, path, lineage=frozenset()):
        """Append every direct and indirect report of ``manager_id`` to ``hierarchy``.

        ``lineage`` holds the IDs already on the current path so that a
        circular reporting line cannot recurse forever.
        """
        lineage = lineage | {manager_id}
        manager_gcb = (
            df_lookup.loc[manager_id, GCB_COL]
            if manager_id in df_lookup.index else None
        )
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            if row[EMP_ID_COL] in lineage:
                continue  # circular reference back up the current path
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(row[EMP_ID_COL], record, lineage)

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_path = {"MD ID": md[EMP_ID_COL], "MD Name": md[EMP_NAME_COL]}

        # add MD itself
        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md[EMP_ID_COL],
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        # recurse into MD's reports
        recurse(md[EMP_ID_COL], md_path)

    # Exceptions: employees whose manager ID not in employee list
    all_emp_ids = set(enriched[EMP_ID_COL])
    missing_mgr_ids = set(enriched[MGR_ID_COL]) - all_emp_ids
    for _, row in enriched[enriched[MGR_ID_COL].isin(missing_mgr_ids)].iterrows():
        if row[GCB_COL] != "MD":
            exceptions.append(row.to_dict())

    hierarchy_df = pd.DataFrame(hierarchy)
    if hierarchy_df.empty:
        # No MD rows: keep the expected columns so the summary code below
        # yields empty tables instead of raising KeyError.
        hierarchy_df = pd.DataFrame(columns=list(dict.fromkeys([
            *enriched.columns,
            "MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB",
        ])))
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Summary sheet --------
    summary = {}
    summary["Total Employees"] = len(enriched)
    summary["Total in Hierarchy"] = hierarchy_df[EMP_ID_COL].nunique()
    summary["Missing Managers Count"] = len(exceptions_df)

    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]

    # Non-numeric bands (e.g. "MD") become NaN and never match 3 or 4.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Return one row per manager of band ``band`` with the number of direct reports.

        Rows are grouped by the *manager* of each hierarchy row; grouping the
        band-3/4 employees by their own name would only count how often each
        of them appears, not how many people report to them.
        """
        rows = hierarchy_df.loc[mgr_gcb_numeric == band]
        by_manager = rows.groupby("Manager ID")
        table = pd.DataFrame({
            f"GCB{band} Name": by_manager["Manager Name"].first(),
            "Direct Reports": by_manager[EMP_ID_COL].nunique(),
        })
        return table.reset_index(drop=True)

    per_gcb3 = direct_reports_table(3)
    per_gcb4 = direct_reports_table(4)

    # Save all outputs
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        pd.DataFrame([summary]).to_excel(writer, index=False, sheet_name="Summary")
        per_md.to_excel(writer, index=False, sheet_name="MD Headcount")
        per_gcb3.to_excel(writer, index=False, sheet_name="GCB3 Reports")
        per_gcb4.to_excel(writer, index=False, sheet_name="GCB4 Reports")

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Artefacts produced by the two phases.
    output_file_phase1 = "phase1_enriched.xlsx"
    final_output = "hierarchy_report.xlsx"
    # Raw inputs.
    csv_file = "Monthly.csv"       # monthly extract
    gha_file = "GHA.xlsx"          # GHA workbook

    # Phase 1 writes the enriched workbook that phase 2 reads back.
    print("Running Phase 1...")
    phase_one(
        gha_file=gha_file,
        monthly_file=csv_file,
        output_file=output_file_phase1,
    )

    print("Running Phase 2...")
    phase_two(input_file=output_file_phase1, output_file=final_output)

    print("✅ Processing complete!")


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
# Column headers used by the functions in this cell (compared after header whitespace is stripped).
# Join key between the monthly file and the GHA sheet; also the index of the hierarchy lookup.
EMP_ID_COL = "Employee ID"
# Selected from the GHA sheet during enrichment.
EMP_NAME_COL = "Employee Name"
# Selected from the GHA sheet during enrichment.
EMP_EMAIL_COL = "Employee Business Email Address"
# Join key used to attach the manager's career band to each row.
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
# NOTE(review): presumably the manager's display name in the monthly file — confirm against its use.
MGR_NAME_COL = "Entity Manager Employee Name"
# Career band; rows where it is blank after the join are reported as "Missing in GHA".
GCB_COL = "Global Career Band"

# -------------------------
# Phase 1: Enrichment
# -------------------------
def enrich_monthly_with_gha(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    The monthly rows are left-joined to the GHA "Headcount - Employee Detail"
    sheet on the employee ID, and a "Direct Manager GCB" column is added by
    looking up each row's manager ID in the same GHA data.

    Args:
        monthly_file: Path to the monthly CSV (read as ISO-8859-1).
        gha_file: Path to the GHA Excel workbook.
        output_file: Path of the workbook to write. It gets an "Enriched"
            sheet and, when any row has no career band after the merge, a
            "Missing in GHA" sheet.

    Returns:
        The merged DataFrame, one row per monthly row.
    """
    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names so header whitespace cannot break the lookups below
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Select needed columns from GHA (extend as needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    # The GHA data is used as a lookup table, so it must hold exactly one row
    # per employee ID: a left merge against repeated keys would multiply the
    # monthly rows. Rows without an ID are dropped as well, because pandas
    # matches missing keys to each other during a merge. When an ID occurs
    # more than once, the first occurrence wins.
    gha_subset = (
        gha[gha_columns]
        .dropna(subset=[EMP_ID_COL])
        .drop_duplicates(subset=EMP_ID_COL)
    )

    # Merge monthly + gha; clashing GHA column names receive a "_GHA" suffix
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Add Manager GCB by merging again on Manager ID
    mgr_gcb = gha_subset[[EMP_ID_COL, GCB_COL]].rename(
        columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Direct Manager GCB"}
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches (no career band came through the merge)
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def build_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build flattened hierarchy with MD → GCB3 → Managers → Employees.

    For every MD (career band equal to "MD") the report lists:

    * the MD's direct reports that are not GCB3 (blank "GCB3 Name"),
    * each GCB3 reporting to the MD,
    * everyone reporting to that GCB3, and
    * everyone reporting to those people.

    Each output row carries the MD name/ID, the GCB3 name and the direct
    manager name, followed by all columns of the employee's enriched row.
    The walk has a fixed depth: deeper levels, and the MDs' own rows, are
    not part of the report.

    Args:
        enriched_df: Phase 1 output. Must contain the employee ID/name,
            manager ID/name and career band columns named by the module
            constants.
        output_file: Path of the Excel workbook to write ("Hierarchy Report",
            optional "Missing Managers", and "Summary" sheets).

    Returns:
        Tuple ``(final_df, exceptions_df, summary_df)``.
    """
    context_cols = ["MD Name", "MD ID", "GCB3 Name", "Direct Manager Name"]

    # Group the rows by manager once instead of rescanning the whole frame
    # for every manager. Rows without a manager ID belong to no group.
    reports_by_mgr = {
        mgr_id: team
        for mgr_id, team in enriched_df.groupby(MGR_ID_COL, sort=False)
    }
    no_reports = enriched_df.iloc[0:0]

    def direct_reports(manager_id):
        """Return the rows whose manager ID equals ``manager_id``."""
        return reports_by_mgr.get(manager_id, no_reports)

    final_rows = []

    def add_row(md, gcb3_name, manager_name, person):
        """Append one report row: hierarchy context plus all person columns."""
        row = {
            "MD Name": md[EMP_NAME_COL],
            "MD ID": md[EMP_ID_COL],
            "GCB3 Name": gcb3_name,
            "Direct Manager Name": manager_name,
        }
        row.update(person.to_dict())   # keep all monthly+gha cols
        final_rows.append(row)

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    for _, md in mds.iterrows():
        under_md = direct_reports(md[EMP_ID_COL])

        # Direct reports that are not GCB3. They are listed for every MD;
        # previously they were emitted only when the MD had no GCB3 at all,
        # which silently dropped them for MDs that had both kinds of report.
        for _, emp in under_md[under_md[GCB_COL] != 3].iterrows():
            add_row(md, "", emp[MGR_NAME_COL], emp)

        for _, gcb3 in under_md[under_md[GCB_COL] == 3].iterrows():
            gcb3_name = gcb3[EMP_NAME_COL]

            # The GCB3's own row
            add_row(md, gcb3_name, gcb3[MGR_NAME_COL], gcb3)

            # Managers (GCB3/4) under this GCB3
            for _, rm in direct_reports(gcb3[EMP_ID_COL]).iterrows():
                add_row(md, gcb3_name, rm[MGR_NAME_COL], rm)

                # Employees under this RM
                for _, emp in direct_reports(rm[EMP_ID_COL]).iterrows():
                    add_row(md, gcb3_name, rm[EMP_NAME_COL], emp)

    # Exceptions: employees with missing managers (excluding MDs), i.e. the
    # manager ID does not occur in the employee ID column.
    all_emp_ids = set(enriched_df[EMP_ID_COL])
    all_mgr_ids = set(enriched_df[MGR_ID_COL])
    missing_mgr_ids = all_mgr_ids - all_emp_ids

    orphans = enriched_df[enriched_df[MGR_ID_COL].isin(missing_mgr_ids)]
    exceptions = [
        emp.to_dict()
        for _, emp in orphans.iterrows()
        if emp[GCB_COL] != "MD"
    ]

    # Convert to DataFrames
    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD or no reports: give the frame its usual columns so that the
        # sort and the per-MD/per-GCB3 summaries below do not raise KeyError.
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*context_cols, *enriched_df.columns]))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # Sorting
    final_df.sort_values(
        by=["MD Name", "GCB3 Name", "Direct Manager Name", EMP_NAME_COL],
        inplace=True,
        na_position="last"
    )

    # -------------------------
    # Build summary sheet
    # -------------------------
    summary_data = {
        "Metric": [
            "Total employees in enriched file",
            "Employees missing in GHA",
            "Employees with missing manager (non-MD)"
        ],
        "Count": [
            len(enriched_df),
            int(enriched_df[GCB_COL].isna().sum()),
            len(exceptions_df)
        ]
    }
    summary_df = pd.DataFrame(summary_data)

    # Distinct employees listed under each MD / each GCB3
    per_md = final_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index(name="Employees under MD")
    per_gcb3 = final_df.groupby("GCB3 Name")[EMP_ID_COL].nunique().reset_index(name="Employees under GCB3")

    # Save; the three summary tables are stacked on one sheet with blank
    # rows between them.
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")
        summary_df.to_excel(writer, index=False, sheet_name="Summary")
        per_md.to_excel(writer, index=False, sheet_name="Summary", startrow=len(summary_df)+2)
        per_gcb3.to_excel(writer, index=False, sheet_name="Summary", startrow=len(summary_df)+len(per_md)+5)

    print(f"✅ Phase 2 completed → {output_file}")
    return final_df, exceptions_df, summary_df


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
# Column names shared by both phases. They are matched against the input
# headers after surrounding whitespace has been stripped from those headers.
# Employee identifier: the merge key between the monthly file and the GHA data.
EMP_ID_COL = "Employee ID"
EMP_NAME_COL = "Employee Name"
EMP_EMAIL_COL = "Employee Business Email Address"
# Manager identifier: used to look up an employee's direct reports.
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
MGR_NAME_COL = "Entity Manager Employee Name"
# Career band: the hierarchy code compares it against "MD" and 3.
GCB_COL = "Global Career Band"

# -------------------------
# Phase 1: Enrichment
# -------------------------
def enrich_monthly_with_gha(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    The monthly rows are left-joined to the GHA "Headcount - Employee Detail"
    sheet on the employee ID, and a "Manager GCB" column is added by looking
    up each row's manager ID in the same GHA data.

    Args:
        monthly_file: Path to the monthly CSV (read as ISO-8859-1).
        gha_file: Path to the GHA Excel workbook.
        output_file: Path of the workbook to write. It gets an "Enriched"
            sheet and, when any row has no career band after the merge, a
            "Missing in GHA" sheet.

    Returns:
        The merged DataFrame, one row per monthly row.
    """
    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names so header whitespace cannot break the lookups below
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Select needed columns from GHA (add more here if needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    # The GHA data is used as a lookup table, so it must hold exactly one row
    # per employee ID: a left merge against repeated keys would multiply the
    # monthly rows. Rows without an ID are dropped as well, because pandas
    # matches missing keys to each other during a merge. When an ID occurs
    # more than once, the first occurrence wins.
    gha_subset = (
        gha[gha_columns]
        .dropna(subset=[EMP_ID_COL])
        .drop_duplicates(subset=EMP_ID_COL)
    )

    # Merge monthly + gha; clashing GHA column names receive a "_GHA" suffix
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Add Manager GCB by merging again on Manager ID
    mgr_gcb = gha_subset[[EMP_ID_COL, GCB_COL]].rename(
        columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Manager GCB"}
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches (no career band came through the merge)
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def build_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build flattened hierarchy with MD → GCB3 → Managers → Employees.

    For every MD (career band equal to "MD") the report lists:

    * the MD's direct reports that are not GCB3 (blank "GCB3 Name"),
    * each GCB3 reporting to the MD,
    * everyone reporting to that GCB3 (labelled with their own name and
      band as "Reporting Manager", so they group with their team), and
    * everyone reporting to those people.

    Each output row carries the MD name/ID, the GCB3 name and the reporting
    manager name/band, followed by all columns of the employee's enriched
    row. The walk has a fixed depth: deeper levels, and the MDs' own rows,
    are not part of the report.

    Args:
        enriched_df: Phase 1 output. Must contain the employee ID/name,
            manager ID/name and career band columns named by the module
            constants; "Manager GCB" is used when present.
        output_file: Path of the Excel workbook to write ("Hierarchy Report"
            and, if needed, "Missing Managers" sheets).

    Returns:
        Tuple ``(final_df, exceptions_df)``.
    """
    context_cols = [
        "MD Name",
        "MD ID",
        "GCB3 Name",
        "Reporting Manager Name",
        "Reporting Manager GCB",
    ]

    # Group the rows by manager once instead of rescanning the whole frame
    # for every manager. Rows without a manager ID belong to no group.
    reports_by_mgr = {
        mgr_id: team
        for mgr_id, team in enriched_df.groupby(MGR_ID_COL, sort=False)
    }
    no_reports = enriched_df.iloc[0:0]

    def direct_reports(manager_id):
        """Return the rows whose manager ID equals ``manager_id``."""
        return reports_by_mgr.get(manager_id, no_reports)

    final_rows = []

    def add_row(md, gcb3_name, manager_name, manager_gcb, person):
        """Append one report row: hierarchy context plus all person columns."""
        final_rows.append({
            "MD Name": md[EMP_NAME_COL],
            "MD ID": md[EMP_ID_COL],
            "GCB3 Name": gcb3_name,
            "Reporting Manager Name": manager_name,
            "Reporting Manager GCB": manager_gcb,
            **person.to_dict()
        })

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    for _, md in mds.iterrows():
        under_md = direct_reports(md[EMP_ID_COL])

        # Direct reports that are not GCB3. They are listed for every MD;
        # previously they were emitted only when the MD had no GCB3 at all,
        # which silently dropped them for MDs that had both kinds of report.
        for _, emp in under_md[under_md[GCB_COL] != 3].iterrows():
            add_row(md, "", emp[MGR_NAME_COL], emp.get("Manager GCB", ""), emp)

        for _, gcb3 in under_md[under_md[GCB_COL] == 3].iterrows():
            gcb3_name = gcb3[EMP_NAME_COL]

            # The GCB3's own row
            add_row(md, gcb3_name, gcb3[MGR_NAME_COL], gcb3.get("Manager GCB", ""), gcb3)

            # Managers (GCB3/4) under this GCB3
            for _, rm in direct_reports(gcb3[EMP_ID_COL]).iterrows():
                rm_name = rm[EMP_NAME_COL]
                rm_gcb = rm.get(GCB_COL, "")
                add_row(md, gcb3_name, rm_name, rm_gcb, rm)

                # Employees under this RM
                for _, emp in direct_reports(rm[EMP_ID_COL]).iterrows():
                    add_row(md, gcb3_name, rm_name, rm_gcb, emp)

    # Exceptions: employees with missing managers (excluding MDs), i.e. the
    # manager ID does not occur in the employee ID column.
    all_emp_ids = set(enriched_df[EMP_ID_COL])
    all_mgr_ids = set(enriched_df[MGR_ID_COL])
    missing_mgr_ids = all_mgr_ids - all_emp_ids

    orphans = enriched_df[enriched_df[MGR_ID_COL].isin(missing_mgr_ids)]
    exceptions = [
        emp.to_dict()
        for _, emp in orphans.iterrows()
        if emp[GCB_COL] != "MD"
    ]

    # Convert to DataFrames
    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD or no reports: give the frame its usual columns so that the
        # sort below does not raise KeyError.
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*context_cols, *enriched_df.columns]))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # Sorting
    final_df.sort_values(
        by=["MD Name", "GCB3 Name", "Reporting Manager Name", EMP_NAME_COL],
        inplace=True,
        na_position="last"
    )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 completed → {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations, given as relative paths.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame, which is handed straight to Phase 2.
    # Neither call passes output_file, so both write to their default paths.
    enriched = enrich_monthly_with_gha(monthly_file, gha_file)
    build_hierarchy(enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    The monthly rows are left-joined to the GHA "Headcount - Employee Detail"
    sheet on "Employee ID".

    Args:
        monthly_file: Path to the monthly CSV.
        gha_file: Path to the GHA Excel workbook.
        output_file: Path of the workbook to write. It gets an "Enriched"
            sheet and, when any row has no career band after the merge, a
            "Missing in GHA" sheet.

    Returns:
        The merged DataFrame, one row per monthly row.
    """
    # Read files
    monthly_df = pd.read_csv(monthly_file)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Clean column names so header whitespace cannot break the lookups below
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    # Select needed GHA columns
    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity"
    ]
    # The GHA data is used as a lookup table, so it must hold exactly one row
    # per employee ID: a left merge against repeated keys would multiply the
    # monthly rows. Rows without an ID are dropped as well, because pandas
    # matches missing keys to each other during a merge. When an ID occurs
    # more than once, the first occurrence wins.
    gha_df = (
        gha_df[gha_keep]
        .dropna(subset=["Employee ID"])
        .drop_duplicates(subset="Employee ID")
    )

    # Merge; clashing GHA column names receive a "_GHA" suffix
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA")
    )

    # Capture missing matches (no career band came through the merge)
    missing = merged_df[merged_df["Global Career Band"].isna()]

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build a flattened MD → GCB3 → manager → employee report.

    For every MD (career band equal to "MD") the report lists:

    * the MD's direct reports that are not GCB3 (blank GCB3 and reporting
      manager columns),
    * each GCB3 reporting to the MD (blank reporting manager columns),
    * everyone reporting to that GCB3 (labelled with their own name/ID/email
      as "Reporting Manager", so they group with their team), and
    * everyone reporting to those people.

    Each output row carries name/ID/email of the MD, the GCB3 and the
    reporting manager, followed by all columns of the employee's enriched
    row. The walk has a fixed depth: deeper levels, and the MDs' own rows,
    are not part of the report.

    Args:
        enriched_df: Phase 1 output. Must contain the "Employee ID",
            "Employee Name", "Employee Business Email Address",
            "Manager Employee ID" and "Global Career Band" columns.
        output_file: Path of the Excel workbook to write ("Hierarchy Report"
            and, if needed, "Missing Managers" sheets).

    Returns:
        Tuple ``(final_df, exceptions_df)``.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    # Group the rows by manager once instead of rescanning the whole frame
    # for every manager. Rows without a manager ID belong to no group.
    reports_by_mgr = {
        mgr_id: team
        for mgr_id, team in enriched_df.groupby(MGR_ID_COL, sort=False)
    }
    no_reports = enriched_df.iloc[0:0]

    def direct_reports(manager_id):
        """Return the rows whose manager ID equals ``manager_id``."""
        return reports_by_mgr.get(manager_id, no_reports)

    def identity(prefix, person):
        """Return the Name/ID/Email columns for one level (blank if None)."""
        if person is None:
            return {f"{prefix} Name": "", f"{prefix} ID": "", f"{prefix} Email": ""}
        return {
            f"{prefix} Name": person[NAME_COL],
            f"{prefix} ID": person[ID_COL],
            f"{prefix} Email": person[EMAIL_COL],
        }

    final_rows = []

    def add_row(md, gcb3, rm, person):
        """Append one report row: hierarchy context plus all person columns."""
        final_rows.append({
            **identity("MD", md),
            **identity("GCB3", gcb3),
            **identity("Reporting Manager", rm),
            **person.to_dict()
        })

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    for _, md in mds.iterrows():
        under_md = direct_reports(md[ID_COL])

        # Direct reports that are not GCB3. They are listed for every MD;
        # previously they were emitted only when the MD had no GCB3 at all,
        # which silently dropped them for MDs that had both kinds of report.
        for _, emp in under_md[under_md[GCB_COL] != 3].iterrows():
            add_row(md, None, None, emp)

        for _, gcb3 in under_md[under_md[GCB_COL] == 3].iterrows():
            # The GCB3's own row
            add_row(md, gcb3, None, gcb3)

            # Managers (GCB3/4) under this GCB3
            for _, rm in direct_reports(gcb3[ID_COL]).iterrows():
                add_row(md, gcb3, rm, rm)

                # Employees under this RM
                for _, emp in direct_reports(rm[ID_COL]).iterrows():
                    add_row(md, gcb3, rm, emp)

    # Exceptions: employees with missing managers (but not MDs), i.e. the
    # manager ID does not occur in the employee ID column.
    all_emp_ids = set(enriched_df[ID_COL])
    all_mgr_ids = set(enriched_df[MGR_ID_COL])
    missing_mgr_ids = all_mgr_ids - all_emp_ids

    orphans = enriched_df[enriched_df[MGR_ID_COL].isin(missing_mgr_ids)]
    exceptions = [
        emp.to_dict()
        for _, emp in orphans.iterrows()
        if emp[GCB_COL] != "MD"
    ]

    # Convert to DataFrames
    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD or no reports: give the frame its usual columns so that the
        # sort below does not raise KeyError.
        context_cols = [
            *identity("MD", None),
            *identity("GCB3", None),
            *identity("Reporting Manager", None),
        ]
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*context_cols, *enriched_df.columns]))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # Sorting
    final_df.sort_values(
        by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
        inplace=True,
        na_position="last"
    )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations, given as relative paths.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame, which is handed straight to Phase 2.
    # Neither call passes output_file, so both write to their default paths.
    enriched = phase1_enrich(monthly_file, gha_file)
    phase2_hierarchy(enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    The monthly rows are left-joined to the GHA "Headcount - Employee Detail"
    sheet on "Employee ID".

    Args:
        monthly_file: Path to the monthly CSV.
        gha_file: Path to the GHA Excel workbook.
        output_file: Path of the workbook to write. It gets an "Enriched"
            sheet and, when any row has no career band after the merge, a
            "Missing in GHA" sheet.

    Returns:
        The merged DataFrame, one row per monthly row.
    """
    # Read files
    monthly_df = pd.read_csv(monthly_file)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Clean column names so header whitespace cannot break the lookups below
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    # Select needed GHA columns
    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity"
    ]
    # The GHA data is used as a lookup table, so it must hold exactly one row
    # per employee ID: a left merge against repeated keys would multiply the
    # monthly rows. Rows without an ID are dropped as well, because pandas
    # matches missing keys to each other during a merge. When an ID occurs
    # more than once, the first occurrence wins.
    gha_df = (
        gha_df[gha_keep]
        .dropna(subset=["Employee ID"])
        .drop_duplicates(subset="Employee ID")
    )

    # Merge; clashing GHA column names receive a "_GHA" suffix
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA")
    )

    # Capture missing matches (no career band came through the merge)
    missing = merged_df[merged_df["Global Career Band"].isna()]

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build a flattened MD → GCB3 → manager → employee report.

    For every MD (career band equal to "MD") the report lists:

    * the MD's direct reports that are not GCB3 (blank GCB3 and reporting
      manager columns),
    * each GCB3 reporting to the MD (blank reporting manager columns),
    * everyone reporting to that GCB3 (labelled with their own name/ID/email
      as "Reporting Manager", so they group with their team), and
    * everyone reporting to those people.

    Each output row carries name/ID/email of the MD, the GCB3 and the
    reporting manager, followed by all columns of the employee's enriched
    row. The walk has a fixed depth: deeper levels, and the MDs' own rows,
    are not part of the report.

    Args:
        enriched_df: Phase 1 output. Must contain the "Employee ID",
            "Employee Name", "Employee Business Email Address",
            "Manager Employee ID" and "Global Career Band" columns.
        output_file: Path of the Excel workbook to write ("Hierarchy Report"
            and, if needed, "Missing Managers" sheets).

    Returns:
        Tuple ``(final_df, exceptions_df)``.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    # Group the rows by manager once instead of rescanning the whole frame
    # for every manager. Rows without a manager ID belong to no group.
    reports_by_mgr = {
        mgr_id: team
        for mgr_id, team in enriched_df.groupby(MGR_ID_COL, sort=False)
    }
    no_reports = enriched_df.iloc[0:0]

    def direct_reports(manager_id):
        """Return the rows whose manager ID equals ``manager_id``."""
        return reports_by_mgr.get(manager_id, no_reports)

    def identity(prefix, person):
        """Return the Name/ID/Email columns for one level (blank if None)."""
        if person is None:
            return {f"{prefix} Name": "", f"{prefix} ID": "", f"{prefix} Email": ""}
        return {
            f"{prefix} Name": person[NAME_COL],
            f"{prefix} ID": person[ID_COL],
            f"{prefix} Email": person[EMAIL_COL],
        }

    final_rows = []

    def add_row(md, gcb3, rm, person):
        """Append one report row: hierarchy context plus all person columns."""
        final_rows.append({
            **identity("MD", md),
            **identity("GCB3", gcb3),
            **identity("Reporting Manager", rm),
            **person.to_dict()
        })

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    for _, md in mds.iterrows():
        under_md = direct_reports(md[ID_COL])

        # Direct reports that are not GCB3. They are listed for every MD;
        # previously they were emitted only when the MD had no GCB3 at all,
        # which silently dropped them for MDs that had both kinds of report.
        for _, emp in under_md[under_md[GCB_COL] != 3].iterrows():
            add_row(md, None, None, emp)

        for _, gcb3 in under_md[under_md[GCB_COL] == 3].iterrows():
            # The GCB3's own row
            add_row(md, gcb3, None, gcb3)

            # Managers (GCB3/4) under this GCB3
            for _, rm in direct_reports(gcb3[ID_COL]).iterrows():
                add_row(md, gcb3, rm, rm)

                # Employees under this RM
                for _, emp in direct_reports(rm[ID_COL]).iterrows():
                    add_row(md, gcb3, rm, emp)

    # Exceptions: employees with missing managers (but not MDs), i.e. the
    # manager ID does not occur in the employee ID column.
    all_emp_ids = set(enriched_df[ID_COL])
    all_mgr_ids = set(enriched_df[MGR_ID_COL])
    missing_mgr_ids = all_mgr_ids - all_emp_ids

    orphans = enriched_df[enriched_df[MGR_ID_COL].isin(missing_mgr_ids)]
    exceptions = [
        emp.to_dict()
        for _, emp in orphans.iterrows()
        if emp[GCB_COL] != "MD"
    ]

    # Convert to DataFrames
    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD or no reports: give the frame its usual columns so that the
        # sort below does not raise KeyError.
        context_cols = [
            *identity("MD", None),
            *identity("GCB3", None),
            *identity("Reporting Manager", None),
        ]
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*context_cols, *enriched_df.columns]))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # Sorting
    final_df.sort_values(
        by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
        inplace=True,
        na_position="last"
    )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations, given as relative paths.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame, which is handed straight to Phase 2.
    # Neither call passes output_file, so both write to their default paths.
    enriched = phase1_enrich(monthly_file, gha_file)
    phase2_hierarchy(enriched)


In [17]:
import pandas as pd

# Demo roster used to exercise the manager-tracing logic.
# Field layout of every record in `data`.
columns = [
    "Employee ID", "Employee Name", "Global Career Band",
    "Entity Manager Employee ID", "Entity Manager Employee Name"
]

# One record per employee; None means no manager is recorded for that person.
data = [
    ["101", "Alice",    "5", "201", "Bob"],
    ["201", "Bob",      "4", "301", "Charlie"],
    ["301", "Charlie",  "3", "401", "Dana"],
    ["401", "Dana",     "2", None,  None],
    ["302", "Eva",      "5", "201", "Bob"],
    ["304", "Kirti",    "5", "305", "Inish"],
    ["307", "Anshul",   "4", "305", "Inish"],
    ["308", "Sowmya",   "5", "307", "Anshul"],
    ["305", "Inish",    "3", "306", "Vilma"],
    ["306", "Vilma",    "2", "504", "Abhishek"],
]

df = pd.DataFrame(data=data, columns=columns)

# Employee ID -> that employee's remaining fields. Built before the result
# columns are added, so each entry holds only the four input fields.
employee_lookup = df.set_index("Employee ID").to_dict(orient="index")

# Result columns for the nearest GCB 4 and GCB 3 managers start out blank
for blank_col in (
    "Entity Manager Employee ID_GCB 4",
    "Entity Manager Employee Name_GCB 4",
    "Entity Manager Employee ID_GCB 3",
    "Entity Manager Employee Name_GCB 3",
):
    df[blank_col] = ""

# 🔁 Updated logic: Climb upward until GCB 4 & 3 found
def trace_managers_gcb_4_and_3(start_id, lookup=None):
    """Walk up the management chain and find the nearest GCB 4 and GCB 3.

    Starting with ``start_id`` itself, each person on the chain is checked
    for career band "4" or "3"; the first hit for each band is kept. The walk
    stops when both are found, when an ID is empty or unknown, or when the
    chain loops back onto an ID that was already visited.

    Args:
        start_id: Employee ID at which to start (normally a manager's ID).
        lookup: Mapping of employee ID to a dict with the keys
            "Employee Name", "Global Career Band" and
            "Entity Manager Employee ID". Defaults to the module-level
            ``employee_lookup``.

    Returns:
        Tuple ``(gcb4, gcb3)``; each element is ``(employee_id, name)`` or
        ``None`` when no such manager was found on the chain.
    """
    if lookup is None:
        lookup = employee_lookup

    gcb4 = None
    gcb3 = None
    # IDs already seen. Without this guard a cyclic chain (A reports to B,
    # B reports to A) that contains no GCB 4/3 pair would never terminate.
    visited = set()

    current_id = start_id
    while current_id and current_id in lookup and current_id not in visited:
        visited.add(current_id)
        manager = lookup[current_id]
        gcb = str(manager.get("Global Career Band", "")).strip()

        if gcb4 is None and gcb == "4":
            gcb4 = (current_id, manager.get("Employee Name"))
        elif gcb3 is None and gcb == "3":
            gcb3 = (current_id, manager.get("Employee Name"))

        # Stop if both found
        if gcb4 and gcb3:
            break

        current_id = manager.get("Entity Manager Employee ID")

    return gcb4, gcb3

# Fill the GCB 4 / GCB 3 manager columns, one employee at a time
for idx, manager_id in df["Entity Manager Employee ID"].items():
    if pd.isna(manager_id):
        # No manager recorded: nothing to trace, the columns stay blank
        continue

    band4_hit, band3_hit = trace_managers_gcb_4_and_3(manager_id)

    if band4_hit:
        band4_id, band4_name = band4_hit
        df.loc[idx, "Entity Manager Employee ID_GCB 4"] = band4_id
        df.loc[idx, "Entity Manager Employee Name_GCB 4"] = band4_name

    if band3_hit:
        band3_id, band3_name = band3_hit
        df.loc[idx, "Entity Manager Employee ID_GCB 3"] = band3_id
        df.loc[idx, "Entity Manager Employee Name_GCB 3"] = band3_name

# Show the completed table without the index column
print(df.to_string(index=False))


Employee ID Employee Name Global Career Band Entity Manager Employee ID Entity Manager Employee Name Entity Manager Employee ID_GCB 4 Entity Manager Employee Name_GCB 4 Entity Manager Employee ID_GCB 3 Entity Manager Employee Name_GCB 3
        101         Alice                  5                        201                          Bob                              201                                Bob                              301                            Charlie
        201           Bob                  4                        301                      Charlie                                                                                                  301                            Charlie
        301       Charlie                  3                        401                         Dana                                                                                                                                        
        401          Dana                  2        

In [18]:
# Write the current contents of df to g1.xlsx, leaving out the index column
df.to_excel("g1.xlsx", index=False)

In [None]:
wor