In [None]:
import pandas as pd

# ---------- Config (change file names here) ----------
# These four names are passed to phase1_enrich / phase2_build_full by the
# "run pipeline" section at the bottom of this cell.
monthly_file = "monthly.csv"         # your monthly CSV
gha_file = "GHA.xlsx"                # your GHA Excel
gha_sheet = "Headcount - Employee Detail"   # worksheet read from the GHA workbook
output_file = "final_hierarchy_with_full_columns.xlsx"   # workbook written by phase 2

# ---------- Column constants (adjust if your data uses different names) ----------
# Column headers are compared after whitespace has been stripped from them.
EMP_ID_COL = "Employee ID"                      # merge key between monthly and GHA
EMP_NAME_COL = "Employee Name"
EMP_EMAIL_COL = "Employee Business Email Address"
MGR_ID_COL = "Entity Manager Employee ID"      # may or may not exist in monthly CSV
MGR_NAME_COL = "Entity Manager Employee Name"
GCB_COL = "Global Career Band"                  # phase 2 looks for the values "MD" and "3" here

# New clearer manager columns
# (output column names written by phase 1 / phase 2, not input headers)
IMMEDIATE_MANAGER_NAME = "Immediate Manager Name"
IMMEDIATE_MANAGER_GCB = "Immediate Manager GCB"

# ---------- Phase 1: enrich monthly with GHA ----------
def _clean_text_column(series, strip_decimal=False):
    """Return *series* as stripped text while keeping missing values missing.

    A bare ``astype(str)`` turns NaN into the literal string "nan", which then
    looks like a real ID / name / band to every later ``notna`` or ``fillna``.
    With ``strip_decimal=True`` a trailing ".0" is removed so that a key that
    pandas parsed as a float (123.0) still matches the same key parsed as an
    integer (123).
    """
    text = series.astype(str).str.strip()
    if strip_decimal:
        text = text.str.replace(r"\.0$", "", regex=True)
    # Put the missing markers back where the input had none.
    return text.where(series.notna())


def phase1_enrich(monthly_file, gha_file, gha_sheet="Headcount - Employee Detail",
                  output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with employee details from the GHA workbook.

    Steps:
      1. Read both inputs and strip whitespace from their column headers.
      2. Normalise the key columns (IDs, names, career band) to stripped text,
         leaving blanks as missing values rather than the string "nan".
      3. Left-merge the selected GHA columns onto the monthly rows by
         ``EMP_ID_COL`` (clashing GHA columns get the ``_GHA`` suffix).
      4. Make sure ``MGR_ID_COL`` exists (derived from the manager name when
         the monthly file has no manager ID column).
      5. Add ``IMMEDIATE_MANAGER_GCB``: looked up by manager name first, then
         by manager ID for rows still missing a value.

    Args:
        monthly_file: Path of the monthly CSV (read as ISO-8859-1).
        gha_file: Path of the GHA Excel workbook.
        gha_sheet: Worksheet of ``gha_file`` to read.
        output_file: Where the enriched frame is saved for inspection.

    Returns:
        The enriched DataFrame, one row per monthly row.

    Raises:
        KeyError: If the monthly file lacks ``EMP_ID_COL`` or the GHA sheet
            lacks one of the columns that are pulled in.
    """
    # Read monthly CSV (handle common encodings)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    monthly.columns = monthly.columns.str.strip()

    # Read GHA Excel
    gha = pd.read_excel(gha_file, sheet_name=gha_sheet)
    gha.columns = gha.columns.str.strip()

    # === Choose which GHA columns you want pulled. Add columns here as needed ===
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]

    # Fail early with a readable message instead of a bare KeyError deep in pandas.
    absent = [col for col in gha_columns if col not in gha.columns]
    if absent:
        raise KeyError(f"GHA sheet {gha_sheet!r} is missing required columns: {absent}")
    if EMP_ID_COL not in monthly.columns:
        raise KeyError(f"Monthly file is missing the merge key column {EMP_ID_COL!r}")

    # Normalize key columns in GHA (remove stray spaces, keep blanks as NaN)
    gha[EMP_ID_COL] = _clean_text_column(gha[EMP_ID_COL], strip_decimal=True)
    gha[EMP_NAME_COL] = _clean_text_column(gha[EMP_NAME_COL])
    gha[GCB_COL] = _clean_text_column(gha[GCB_COL], strip_decimal=True)

    # One GHA row per employee ID: rows without an ID would otherwise match
    # monthly rows without an ID (pandas merges NaN keys with each other), and
    # repeated IDs would multiply the monthly rows in the left merge.
    gha_subset = (
        gha[gha_columns]
        .dropna(subset=[EMP_ID_COL])
        .drop_duplicates(subset=[EMP_ID_COL])
    )

    # Same normalisation on the monthly side so the keys are comparable
    monthly[EMP_ID_COL] = _clean_text_column(monthly[EMP_ID_COL], strip_decimal=True)
    if MGR_NAME_COL in monthly.columns:
        monthly[MGR_NAME_COL] = _clean_text_column(monthly[MGR_NAME_COL])
    if MGR_ID_COL in monthly.columns:
        monthly[MGR_ID_COL] = _clean_text_column(monthly[MGR_ID_COL], strip_decimal=True)

    # 1) Merge employee-level GHA info (on Employee ID)
    enriched = pd.merge(
        monthly,
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Standardize GCB column in enriched (unmatched employees stay NaN)
    enriched[GCB_COL] = _clean_text_column(enriched[GCB_COL], strip_decimal=True).str.upper()

    # 2) Manager lookup maps from GHA, used when monthly only has manager names.
    #    When a name occurs more than once the last GHA row wins.
    named = gha.dropna(subset=[EMP_NAME_COL])
    mgr_name_to_id = dict(zip(named[EMP_NAME_COL], named[EMP_ID_COL]))
    mgr_name_to_gcb = dict(zip(named[EMP_NAME_COL], named[GCB_COL]))

    # 3) Use the monthly manager ID when present; otherwise derive it from the name
    if MGR_ID_COL not in enriched.columns:
        if MGR_NAME_COL in enriched.columns:
            enriched[MGR_ID_COL] = enriched[MGR_NAME_COL].map(mgr_name_to_id)
        else:
            # no manager name or manager id in monthly -> column exists but is empty
            enriched[MGR_ID_COL] = pd.NA

    # 4) Manager GCB: by manager name first ...
    if MGR_NAME_COL in enriched.columns:
        enriched[IMMEDIATE_MANAGER_GCB] = enriched[MGR_NAME_COL].map(mgr_name_to_gcb)
    else:
        enriched[IMMEDIATE_MANAGER_GCB] = pd.NA

    # ... then by manager ID for whatever is still missing
    id_to_gcb = dict(zip(gha_subset[EMP_ID_COL], gha_subset[GCB_COL]))
    enriched[IMMEDIATE_MANAGER_GCB] = enriched[IMMEDIATE_MANAGER_GCB].fillna(
        enriched[MGR_ID_COL].map(id_to_gcb)
    )

    # Save phase1 for inspection
    enriched.to_excel(output_file, index=False)
    print(f"Phase 1 saved -> {output_file}")
    return enriched


# ---------- Phase 2: build flattened hierarchy and include all columns ----------
def phase2_build_full(enriched_df, output_file=output_file):
    """Flatten the MD -> GCB3 -> manager -> employee hierarchy into one sheet.

    Every output row carries all columns of ``enriched_df`` plus the tags
    "MD Name", "MD ID", "GCB3 Name", "GCB3 ID", ``IMMEDIATE_MANAGER_NAME`` and
    ``IMMEDIATE_MANAGER_GCB``.  For each MD the rows are: the MD itself, each
    GCB3 reporting to the MD, the people reporting to each GCB3, the people
    reporting to those, and finally the MD's direct reports that are not GCB3.
    Deeper levels are not expanded.

    Args:
        enriched_df: Output of phase 1.  It is not modified; a copy is used.
        output_file: Workbook to write (sheets: Hierarchy, Missing Managers
            when non-empty, Summary, By MD, By GCB3, By Manager).

    Returns:
        ``(final_df, exceptions_df)`` where ``exceptions_df`` holds non-MD
        employees whose manager ID (or, when no manager IDs are available,
        manager name) is not an employee in ``enriched_df``.
    """
    hierarchy_tags = ["MD Name", "MD ID", "GCB3 Name", "GCB3 ID", IMMEDIATE_MANAGER_NAME, IMMEDIATE_MANAGER_GCB]

    # Work on a copy so the caller's frame is not rewritten as a side effect.
    enriched_df = enriched_df.copy()

    # Record which bands are really missing BEFORE converting to text:
    # afterwards NaN reads "NAN" and an isna() count would always be zero.
    raw_gcb = enriched_df[GCB_COL]
    gcb_text = raw_gcb.astype(str).str.strip().str.upper()
    gcb_missing = raw_gcb.isna() | gcb_text.isin(["", "NAN", "NONE", "<NA>"])
    enriched_df[GCB_COL] = gcb_text

    # Column order of the input, used to lay out the final sheet
    full_cols = list(enriched_df.columns)

    def make_row_with_hierarchy_tags(emp_series, md_row=None, gcb3_row=None, immediate_mgr_row=None):
        """Return emp_series as a dict with the hierarchy tag columns added.

        emp_series: Series of the person this output row is about
        md_row: Series of the MD above them (if available)
        gcb3_row: Series of the GCB3 above them (if available)
        immediate_mgr_row: Series of their direct manager in this context (if available)
        """
        row = emp_series.to_dict()  # start with full enriched columns

        row["MD Name"] = md_row[EMP_NAME_COL] if md_row is not None else ""
        row["MD ID"] = md_row[EMP_ID_COL] if md_row is not None else ""

        row["GCB3 Name"] = gcb3_row[EMP_NAME_COL] if gcb3_row is not None else ""
        row["GCB3 ID"] = gcb3_row[EMP_ID_COL] if gcb3_row is not None else ""

        if immediate_mgr_row is not None:
            row[IMMEDIATE_MANAGER_NAME] = immediate_mgr_row[EMP_NAME_COL]
            row[IMMEDIATE_MANAGER_GCB] = str(immediate_mgr_row.get(GCB_COL, "")).strip()
            # also provide manager id if we have it
            row[MGR_ID_COL] = immediate_mgr_row.get(EMP_ID_COL, "")
        else:
            # Fallback to what phase 1 stored on the employee row.  Phase 1 in
            # this cell names the column IMMEDIATE_MANAGER_GCB; "Manager GCB"
            # is only honoured for frames produced by the older phase 1.
            row[IMMEDIATE_MANAGER_NAME] = emp_series.get(MGR_NAME_COL, "")
            row[IMMEDIATE_MANAGER_GCB] = emp_series.get(
                IMMEDIATE_MANAGER_GCB, emp_series.get("Manager GCB", "")
            )

        return row

    def reports_of(manager_id):
        """Rows whose manager ID equals manager_id."""
        return enriched_df[enriched_df[MGR_ID_COL] == manager_id]

    final_rows = []

    # MDs are the roots.  An MD that is absent from enriched_df produces no rows,
    # so make sure the MD is part of the monthly/GHA data.
    for _, md in enriched_df[enriched_df[GCB_COL] == "MD"].iterrows():
        # MD first, so it appears at the top of its group
        final_rows.append(make_row_with_hierarchy_tags(md, md_row=md, gcb3_row=None, immediate_mgr_row=None))

        direct_to_md = reports_of(md[EMP_ID_COL])
        is_gcb3 = direct_to_md[GCB_COL] == "3"

        # 1) Each GCB3, the managers under it, and their employees
        for _, gcb3 in direct_to_md[is_gcb3].iterrows():
            final_rows.append(make_row_with_hierarchy_tags(gcb3, md_row=md, gcb3_row=gcb3, immediate_mgr_row=md))

            for _, mgr in reports_of(gcb3[EMP_ID_COL]).iterrows():
                final_rows.append(make_row_with_hierarchy_tags(mgr, md_row=md, gcb3_row=gcb3, immediate_mgr_row=gcb3))

                for _, emp in reports_of(mgr[EMP_ID_COL]).iterrows():
                    final_rows.append(make_row_with_hierarchy_tags(emp, md_row=md, gcb3_row=gcb3, immediate_mgr_row=mgr))

        # 2) Direct reports of the MD that are not GCB3
        for _, emp in direct_to_md[~is_gcb3].iterrows():
            final_rows.append(make_row_with_hierarchy_tags(emp, md_row=md, gcb3_row=None, immediate_mgr_row=md))

    # 3) Exceptions: a manager is referenced but is not an employee in the data.
    #    Prefer manager IDs; fall back to manager names when no ID is available.
    if MGR_ID_COL in enriched_df.columns and enriched_df[MGR_ID_COL].notna().any():
        manager_key = enriched_df[MGR_ID_COL]
        known_keys = set(enriched_df[EMP_ID_COL].astype(str))
    elif MGR_NAME_COL in enriched_df.columns:
        manager_key = enriched_df[MGR_NAME_COL]
        known_keys = set(enriched_df[EMP_NAME_COL].astype(str))
    else:
        manager_key = None  # no manager information at all -> nothing to report

    if manager_key is None:
        missing_mgr_mask = pd.Series(False, index=enriched_df.index)
    else:
        missing_mgr_mask = manager_key.notna() & ~manager_key.astype(str).isin(known_keys)

    # MDs are roots, so an unknown manager above them is not an exception
    not_md = enriched_df[GCB_COL] != "MD"
    exceptions_df = enriched_df[missing_mgr_mask & not_md].reset_index(drop=True)

    # Hierarchy tags first, then the original columns in their original order
    ordered_cols = hierarchy_tags + [c for c in full_cols if c not in hierarchy_tags]
    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD found: keep the expected columns so the group-bys below work
        final_df = pd.DataFrame(columns=ordered_cols)
    else:
        final_df = final_df[[c for c in ordered_cols if c in final_df.columns]]
        final_df = final_df.sort_values(
            by=["MD Name", "GCB3 Name", IMMEDIATE_MANAGER_NAME, EMP_NAME_COL],
            na_position="last",
        )

    # Summary counts
    total_employees = len(enriched_df)
    missing_gha = int(gcb_missing.sum())
    missing_mgrs = len(exceptions_df)

    md_counts = final_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index(name="Count")
    gcb3_counts = final_df.groupby("GCB3 Name")[EMP_ID_COL].nunique().reset_index(name="Count")
    mgr_counts = final_df.groupby(IMMEDIATE_MANAGER_NAME)[EMP_ID_COL].nunique().reset_index(name="Count")

    # Save everything to the workbook
    with pd.ExcelWriter(output_file, engine="openpyxl") as w:
        final_df.to_excel(w, sheet_name="Hierarchy", index=False)
        if not exceptions_df.empty:
            exceptions_df.to_excel(w, sheet_name="Missing Managers", index=False)
        summary = pd.DataFrame({
            "Metric": ["Total employees in monthly file", "Missing in GHA (no GCB)", "Employees w/ missing manager"],
            "Value": [total_employees, missing_gha, int(missing_mgrs)]
        })
        summary.to_excel(w, sheet_name="Summary", index=False)
        md_counts.to_excel(w, sheet_name="By MD", index=False)
        gcb3_counts.to_excel(w, sheet_name="By GCB3", index=False)
        mgr_counts.to_excel(w, sheet_name="By Manager", index=False)

    print("Phase 2 saved ->", output_file)
    return final_df, exceptions_df

# ---------- run pipeline ----------
if __name__ == "__main__":
    # Phase 1 merges the GHA details onto the monthly rows and returns the frame;
    # phase 2 turns that frame into the hierarchy workbook at output_file.
    enriched = phase1_enrich(monthly_file, gha_file, gha_sheet)
    final_df, exceptions_df = phase2_build_full(enriched, output_file)


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
# Column headers used by enrich_monthly_with_gha and build_hierarchy below.
EMP_ID_COL = "Employee ID"                  # merge key between monthly and GHA
EMP_NAME_COL = "Employee Name"
EMP_EMAIL_COL = "Employee Business Email Address"
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
MGR_NAME_COL = "Entity Manager Employee Name"
GCB_COL = "Global Career Band"              # build_hierarchy looks for "MD" and 3 here

# -------------------------
# Phase 1: Enrichment
# -------------------------
def enrich_monthly_with_gha(monthly_file, gha_file, output_file="phase1_enriched.xlsx",
                            gha_sheet="Headcount - Employee Detail"):
    """Enrich the monthly CSV with GHA details and each manager's career band.

    The selected GHA columns are left-merged onto the monthly rows by
    ``EMP_ID_COL``; a second left merge on ``MGR_ID_COL`` adds the manager's
    band as "Manager GCB".  Rows that found no band in GHA are additionally
    written to a "Missing in GHA" sheet.

    Args:
        monthly_file: Path of the monthly CSV (read as ISO-8859-1).
        gha_file: Path of the GHA Excel workbook.
        output_file: Workbook that receives the "Enriched" sheet.
        gha_sheet: Worksheet of ``gha_file`` to read.

    Returns:
        The merged DataFrame, one row per monthly row.

    Raises:
        KeyError: If the monthly file has no ``MGR_ID_COL`` column, or a
            selected column is missing from the GHA sheet.
    """
    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name=gha_sheet)

    # Strip column names
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    if MGR_ID_COL not in monthly.columns:
        raise KeyError(f"Monthly file is missing the manager ID column {MGR_ID_COL!r}")

    # Select needed columns from GHA (add more here if needed)
    wanted = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    # Keep exactly one GHA row per employee ID.  Exact-row de-duplication is not
    # enough: two rows for one ID that differ in any column would duplicate the
    # monthly rows in both merges below.  Rows without an ID are dropped because
    # pandas matches NaN keys with each other, which would hand a junk band to
    # every employee whose manager ID is blank.
    gha_subset = (
        gha[wanted]
        .dropna(subset=[EMP_ID_COL])
        .drop_duplicates(subset=[EMP_ID_COL])
    )

    # Merge monthly + gha
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Add Manager GCB by merging again, this time keyed on the manager's ID
    mgr_gcb = gha_subset[[EMP_ID_COL, GCB_COL]].rename(
        columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Manager GCB"}
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def build_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build flattened hierarchy with MD → GCB3 → Managers → Employees.

    Every output row is one person's full ``enriched_df`` row prefixed with the
    tags "MD Name", "MD ID", "GCB3 Name", "Reporting Manager Name" and
    "Reporting Manager GCB".  For each MD the rows are: every GCB3 reporting to
    the MD, the people reporting to each GCB3, the people reporting to those,
    and the MD's direct reports that are not GCB3.  The MD itself gets no row.

    Args:
        enriched_df: Output of phase 1 (not modified).
        output_file: Workbook to write ("Hierarchy Report" sheet plus
            "Missing Managers" when there are exceptions).

    Returns:
        ``(final_df, exceptions_df)``; ``exceptions_df`` holds the non-MD rows
        whose manager ID is not an employee ID in ``enriched_df``.
    """
    tag_cols = ["MD Name", "MD ID", "GCB3 Name", "Reporting Manager Name", "Reporting Manager GCB"]

    # Compare bands as text: the column mixes "MD" with numbers, and depending on
    # the source file a band arrives as 3, 3.0 or "3".
    band = (
        enriched_df[GCB_COL].astype(str).str.strip().str.upper()
        .str.replace(r"\.0$", "", regex=True)
    )
    is_md = band == "MD"
    is_gcb3 = band == "3"
    manager_ids = enriched_df[MGR_ID_COL]

    def tagged_row(person, md, gcb3_name, manager_name, manager_gcb):
        """Return person's row as a dict, prefixed with the hierarchy tags."""
        return {
            "MD Name": md[EMP_NAME_COL],
            "MD ID": md[EMP_ID_COL],
            "GCB3 Name": gcb3_name,
            "Reporting Manager Name": manager_name,
            "Reporting Manager GCB": manager_gcb,
            **person.to_dict(),
        }

    final_rows = []

    for _, md in enriched_df[is_md].iterrows():
        reports_to_md = manager_ids == md[EMP_ID_COL]

        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            gcb3_name = gcb3[EMP_NAME_COL]

            # GCB3's own row: its manager columns come from phase 1
            final_rows.append(
                tagged_row(gcb3, md, gcb3_name, gcb3[MGR_NAME_COL], gcb3.get("Manager GCB", ""))
            )

            # People reporting to this GCB3; each one also heads its own group
            for _, rm in enriched_df[manager_ids == gcb3[EMP_ID_COL]].iterrows():
                rm_name = rm[EMP_NAME_COL]
                rm_gcb = rm.get(GCB_COL, "")
                final_rows.append(tagged_row(rm, md, gcb3_name, rm_name, rm_gcb))

                for _, emp in enriched_df[manager_ids == rm[EMP_ID_COL]].iterrows():
                    final_rows.append(tagged_row(emp, md, gcb3_name, rm_name, rm_gcb))

        # Direct reports of the MD that are not GCB3.  These are listed whether or
        # not the MD also has GCB3s; they used to be dropped as soon as one existed.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(
                tagged_row(emp, md, "", emp[MGR_NAME_COL], emp.get("Manager GCB", ""))
            )

    # Exceptions: employees with missing managers (excluding MDs)
    all_emp_ids = set(enriched_df[EMP_ID_COL])
    missing_mgr_ids = set(manager_ids) - all_emp_ids
    exceptions_df = enriched_df[manager_ids.isin(missing_mgr_ids) & ~is_md].reset_index(drop=True)

    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD in the data: return/write an empty sheet with the expected
        # columns instead of failing on the sort keys.
        final_df = pd.DataFrame(
            columns=tag_cols + [c for c in enriched_df.columns if c not in tag_cols]
        )
    else:
        final_df = final_df.sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", EMP_NAME_COL],
            na_position="last",
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 completed → {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations for this run; both are handed straight to phase 1.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame; phase 2 writes the hierarchy workbook
    # to build_hierarchy's default output path.
    enriched = enrich_monthly_with_gha(monthly_file, gha_file)
    build_hierarchy(enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Left-merge selected GHA columns onto the monthly CSV by "Employee ID".

    Column headers of both inputs are stripped of surrounding whitespace first.
    The merged frame is written to the "Enriched" sheet of ``output_file``;
    rows without a "Global Career Band" after the merge also go to a
    "Missing in GHA" sheet.  Returns the merged DataFrame.
    """
    gha_columns = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]

    # Load both sources
    monthly = pd.read_csv(monthly_file)
    headcount = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Headers sometimes carry stray spaces
    for frame in (monthly, headcount):
        frame.columns = frame.columns.str.strip()

    # Only the listed GHA columns take part in the merge
    enriched = pd.merge(
        monthly,
        headcount[gha_columns],
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
    )

    # Employees for which GHA supplied no career band
    unmatched = enriched.loc[enriched["Global Career Band"].isna()]

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        enriched.to_excel(writer, index=False, sheet_name="Enriched")
        if not unmatched.empty:
            unmatched.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return enriched


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Flatten the MD → GCB3 → manager → employee hierarchy into one sheet.

    Every output row is one person's full ``enriched_df`` row prefixed with
    name / ID / email of the MD, the GCB3 and the reporting manager above them
    (empty strings where a level does not apply).  For each MD the rows are:
    every GCB3 reporting to the MD, the people reporting to each GCB3, the
    people reporting to those, and the MD's direct reports that are not GCB3.
    The MD itself gets no row.

    Args:
        enriched_df: Output of phase 1 (not modified).  Must contain the
            "Employee ID", "Employee Name", "Employee Business Email Address",
            "Manager Employee ID" and "Global Career Band" columns.
        output_file: Workbook to write ("Hierarchy Report" sheet plus
            "Missing Managers" when there are exceptions).

    Returns:
        ``(final_df, exceptions_df)``; ``exceptions_df`` holds the non-MD rows
        whose manager ID is not an employee ID in ``enriched_df``.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"
    TAG_COLS = [
        "MD Name", "MD ID", "MD Email",
        "GCB3 Name", "GCB3 ID", "GCB3 Email",
        "Reporting Manager Name", "Reporting Manager ID", "Reporting Manager Email",
    ]

    # Compare bands as text: the column mixes "MD" with numbers, and depending on
    # the source file a band arrives as 3, 3.0 or "3".
    band = (
        enriched_df[GCB_COL].astype(str).str.strip().str.upper()
        .str.replace(r"\.0$", "", regex=True)
    )
    is_md = band == "MD"
    is_gcb3 = band == "3"
    manager_ids = enriched_df[MGR_ID_COL]

    def contact(person):
        """(name, id, email) of person, or three empty strings for None."""
        if person is None:
            return ("", "", "")
        return (person[NAME_COL], person[ID_COL], person[EMAIL_COL])

    def tagged_row(person, md, gcb3=None, manager=None):
        """Return person's row as a dict, prefixed with the hierarchy tags."""
        tags = dict(zip(TAG_COLS, contact(md) + contact(gcb3) + contact(manager)))
        return {**tags, **person.to_dict()}

    final_rows = []

    for _, md in enriched_df[is_md].iterrows():
        reports_to_md = manager_ids == md[ID_COL]

        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            # GCB3's own row carries no reporting-manager tags
            final_rows.append(tagged_row(gcb3, md, gcb3))

            # People reporting to this GCB3; each one also heads its own group
            for _, rm in enriched_df[manager_ids == gcb3[ID_COL]].iterrows():
                final_rows.append(tagged_row(rm, md, gcb3, rm))

                for _, emp in enriched_df[manager_ids == rm[ID_COL]].iterrows():
                    final_rows.append(tagged_row(emp, md, gcb3, rm))

        # Direct reports of the MD that are not GCB3.  These are listed whether or
        # not the MD also has GCB3s; they used to be dropped as soon as one existed.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(tagged_row(emp, md))

    # Exceptions: employees with missing managers (but not MDs)
    all_emp_ids = set(enriched_df[ID_COL])
    missing_mgr_ids = set(manager_ids) - all_emp_ids
    exceptions_df = enriched_df[manager_ids.isin(missing_mgr_ids) & ~is_md].reset_index(drop=True)

    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD in the data: return/write an empty sheet with the expected
        # columns instead of failing on the sort keys.
        final_df = pd.DataFrame(
            columns=TAG_COLS + [c for c in enriched_df.columns if c not in TAG_COLS]
        )
    else:
        final_df = final_df.sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            na_position="last",
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations for this run; both are handed straight to phase 1.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame; phase 2 writes the hierarchy workbook
    # to phase2_hierarchy's default output path.
    enriched = phase1_enrich(monthly_file, gha_file)
    phase2_hierarchy(enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Left-merge selected GHA columns onto the monthly CSV by "Employee ID".

    Column headers of both inputs are stripped of surrounding whitespace first.
    The merged frame is written to the "Enriched" sheet of ``output_file``;
    rows without a "Global Career Band" after the merge also go to a
    "Missing in GHA" sheet.  Returns the merged DataFrame.
    """
    gha_columns = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]

    # Load both sources
    monthly = pd.read_csv(monthly_file)
    headcount = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Headers sometimes carry stray spaces
    for frame in (monthly, headcount):
        frame.columns = frame.columns.str.strip()

    # Only the listed GHA columns take part in the merge
    enriched = pd.merge(
        monthly,
        headcount[gha_columns],
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
    )

    # Employees for which GHA supplied no career band
    unmatched = enriched.loc[enriched["Global Career Band"].isna()]

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        enriched.to_excel(writer, index=False, sheet_name="Enriched")
        if not unmatched.empty:
            unmatched.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return enriched


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Flatten the MD → GCB3 → manager → employee hierarchy into one sheet.

    Every output row is one person's full ``enriched_df`` row prefixed with
    name / ID / email of the MD, the GCB3 and the reporting manager above them
    (empty strings where a level does not apply).  For each MD the rows are:
    every GCB3 reporting to the MD, the people reporting to each GCB3, the
    people reporting to those, and the MD's direct reports that are not GCB3.
    The MD itself gets no row.

    Args:
        enriched_df: Output of phase 1 (not modified).  Must contain the
            "Employee ID", "Employee Name", "Employee Business Email Address",
            "Manager Employee ID" and "Global Career Band" columns.
        output_file: Workbook to write ("Hierarchy Report" sheet plus
            "Missing Managers" when there are exceptions).

    Returns:
        ``(final_df, exceptions_df)``; ``exceptions_df`` holds the non-MD rows
        whose manager ID is not an employee ID in ``enriched_df``.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"
    TAG_COLS = [
        "MD Name", "MD ID", "MD Email",
        "GCB3 Name", "GCB3 ID", "GCB3 Email",
        "Reporting Manager Name", "Reporting Manager ID", "Reporting Manager Email",
    ]

    # Compare bands as text: the column mixes "MD" with numbers, and depending on
    # the source file a band arrives as 3, 3.0 or "3".
    band = (
        enriched_df[GCB_COL].astype(str).str.strip().str.upper()
        .str.replace(r"\.0$", "", regex=True)
    )
    is_md = band == "MD"
    is_gcb3 = band == "3"
    manager_ids = enriched_df[MGR_ID_COL]

    def contact(person):
        """(name, id, email) of person, or three empty strings for None."""
        if person is None:
            return ("", "", "")
        return (person[NAME_COL], person[ID_COL], person[EMAIL_COL])

    def tagged_row(person, md, gcb3=None, manager=None):
        """Return person's row as a dict, prefixed with the hierarchy tags."""
        tags = dict(zip(TAG_COLS, contact(md) + contact(gcb3) + contact(manager)))
        return {**tags, **person.to_dict()}

    final_rows = []

    for _, md in enriched_df[is_md].iterrows():
        reports_to_md = manager_ids == md[ID_COL]

        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            # GCB3's own row carries no reporting-manager tags
            final_rows.append(tagged_row(gcb3, md, gcb3))

            # People reporting to this GCB3; each one also heads its own group
            for _, rm in enriched_df[manager_ids == gcb3[ID_COL]].iterrows():
                final_rows.append(tagged_row(rm, md, gcb3, rm))

                for _, emp in enriched_df[manager_ids == rm[ID_COL]].iterrows():
                    final_rows.append(tagged_row(emp, md, gcb3, rm))

        # Direct reports of the MD that are not GCB3.  These are listed whether or
        # not the MD also has GCB3s; they used to be dropped as soon as one existed.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(tagged_row(emp, md))

    # Exceptions: employees with missing managers (but not MDs)
    all_emp_ids = set(enriched_df[ID_COL])
    missing_mgr_ids = set(manager_ids) - all_emp_ids
    exceptions_df = enriched_df[manager_ids.isin(missing_mgr_ids) & ~is_md].reset_index(drop=True)

    final_df = pd.DataFrame(final_rows)
    if final_df.empty:
        # No MD in the data: return/write an empty sheet with the expected
        # columns instead of failing on the sort keys.
        final_df = pd.DataFrame(
            columns=TAG_COLS + [c for c in enriched_df.columns if c not in TAG_COLS]
        )
    else:
        final_df = final_df.sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            na_position="last",
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations for this run; both are handed straight to phase 1.
    monthly_file = "monthly.csv"
    gha_file = "gha.xlsx"

    # Phase 1 returns the merged frame; phase 2 writes the hierarchy workbook
    # to phase2_hierarchy's default output path.
    enriched = phase1_enrich(monthly_file, gha_file)
    phase2_hierarchy(enriched)


In [17]:
import pandas as pd

# Sample rows: employee id, employee name, career band, manager id, manager name
data = [
    ["101", "Alice",    "5", "201", "Bob"],
    ["201", "Bob",      "4", "301", "Charlie"],
    ["301", "Charlie",  "3", "401", "Dana"],
    ["401", "Dana",     "2", None,  None],
    ["302", "Eva",      "5", "201", "Bob"],
    ["304", "Kirti",    "5", "305", "Inish"],
    ["307", "Anshul",   "4", "305", "Inish"],
    ["308", "Sowmya",   "5", "307", "Anshul"],
    ["305", "Inish",    "3", "306", "Vilma"],
    ["306", "Vilma",    "2", "504", "Abhishek"],
]

columns = [
    "Employee ID",
    "Employee Name",
    "Global Career Band",
    "Entity Manager Employee ID",
    "Entity Manager Employee Name",
]

df = pd.DataFrame(data=data, columns=columns)

# Employee ID -> {column: value} for that employee.  Built before the output
# columns are added, so the lookup only holds the five input columns.
employee_lookup = df.set_index("Employee ID").to_dict(orient="index")

# Output columns for the nearest GCB 4 / GCB 3 manager, blank until filled in
_ROLLUP_COLUMNS = (
    "Entity Manager Employee ID_GCB 4",
    "Entity Manager Employee Name_GCB 4",
    "Entity Manager Employee ID_GCB 3",
    "Entity Manager Employee Name_GCB 3",
)
for _rollup_column in _ROLLUP_COLUMNS:
    df[_rollup_column] = ""

# 🔁 Updated logic: Climb upward until GCB 4 & 3 found
def trace_managers_gcb_4_and_3(start_id, lookup=None):
    """Climb the reporting chain from start_id to the first GCB 4 and GCB 3.

    The walk starts AT ``start_id`` (so that person counts too) and follows
    "Entity Manager Employee ID" upwards.  It stops when both bands have been
    found, when the chain leaves the lookup (unknown or empty manager ID), or
    when it reaches someone it has already visited.

    Args:
        start_id: Employee ID to start from.
        lookup: Mapping of employee ID -> record dict with the keys
            "Employee Name", "Global Career Band" and
            "Entity Manager Employee ID".  Defaults to the module-level
            ``employee_lookup``.

    Returns:
        ``(gcb4, gcb3)``; each is an ``(employee_id, employee_name)`` tuple for
        the first person found with that band, or ``None`` if there is none.
    """
    if lookup is None:
        lookup = employee_lookup

    gcb4 = None
    gcb3 = None
    # IDs already walked: a reporting cycle (A -> B -> A) must end the climb
    # instead of looping forever.
    visited = set()

    current_id = start_id
    while current_id and current_id in lookup and current_id not in visited:
        visited.add(current_id)
        manager = lookup[current_id]
        gcb = str(manager.get("Global Career Band", "")).strip()

        if not gcb4 and gcb == "4":
            gcb4 = (current_id, manager.get("Employee Name"))
        elif not gcb3 and gcb == "3":
            gcb3 = (current_id, manager.get("Employee Name"))

        # Stop if both found
        if gcb4 and gcb3:
            break

        current_id = manager.get("Entity Manager Employee ID")

    return gcb4, gcb3

# Fill the GCB 4 / GCB 3 columns for every employee that has a manager ID
for row_label, manager_id in df["Entity Manager Employee ID"].items():
    if pd.isna(manager_id):
        continue  # nobody above this employee to trace

    # The trace starts at the employee's manager, not at the employee
    hits_by_level = dict(zip(("4", "3"), trace_managers_gcb_4_and_3(manager_id)))

    for level, hit in hits_by_level.items():
        if hit:
            found_id, found_name = hit
            df.loc[row_label, f"Entity Manager Employee ID_GCB {level}"] = found_id
            df.loc[row_label, f"Entity Manager Employee Name_GCB {level}"] = found_name

# ✅ Final output
print(df.to_string(index=False))


Employee ID Employee Name Global Career Band Entity Manager Employee ID Entity Manager Employee Name Entity Manager Employee ID_GCB 4 Entity Manager Employee Name_GCB 4 Entity Manager Employee ID_GCB 3 Entity Manager Employee Name_GCB 3
        101         Alice                  5                        201                          Bob                              201                                Bob                              301                            Charlie
        201           Bob                  4                        301                      Charlie                                                                                                  301                            Charlie
        301       Charlie                  3                        401                         Dana                                                                                                                                        
        401          Dana                  2        

In [18]:
# Save the annotated frame to g1.xlsx, leaving out the index column.
df.to_excel("g1.xlsx", index=False)

In [None]:
wor