In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly extract with employee details from the GHA workbook.

    Args:
        monthly_file: Path of the monthly CSV; read with ``pd.read_csv``.
        gha_file: Path of the GHA workbook; the sheet
            "Headcount - Employee Detail" is read from it.
        output_file: Path of the Excel workbook to write.

    Returns:
        The monthly rows left-joined to the selected GHA columns on
        "Employee ID". Columns that exist on both sides keep the monthly
        value under the plain name; the GHA value gets a "_GHA" suffix.

    Raises:
        KeyError: If a required column is absent from the GHA sheet.

    Side effects:
        Writes sheet "Enriched" and, when at least one monthly row has no
        GHA counterpart, sheet "Missing in GHA" to ``output_file``; prints a
        completion message.
    """
    monthly_df = pd.read_csv(monthly_file)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Headers exported with stray spaces would otherwise break the lookups below.
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]
    absent = [col for col in gha_keep if col not in gha_df.columns]
    if absent:
        raise KeyError(f"GHA sheet is missing required columns: {absent}")
    gha_df = gha_df[gha_keep]

    # NOTE(review): an Employee ID repeated in GHA yields one output row per
    # repetition - confirm the GHA sheet is unique per employee.
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
        indicator="_gha_match",
    )

    # Use the merge indicator rather than a null career band: a blank band on
    # a matched employee is not a missing match, and the band column may come
    # from the monthly file when both inputs carry it.
    unmatched = merged_df.pop("_gha_match") == "left_only"
    missing = merged_df[unmatched]

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Flatten the MD -> GCB3 -> reporting manager -> employee tree into rows.

    For every employee whose career band is "MD":

    * if no band-3 employee reports to the MD, each direct report becomes a
      row with blank GCB3 and reporting-manager columns;
    * otherwise each band-3 direct report becomes a row, followed by one row
      per person reporting to that GCB3 (labelled as their own reporting
      manager) and one row per person reporting to each of those.

    NOTE(review): when an MD has at least one band-3 report, the MD's other
    direct reports are not emitted, and nothing deeper than two levels below
    a GCB3 is emitted. This mirrors the existing behaviour - confirm it is
    intended.

    Args:
        enriched_df: Frame with the columns "Employee ID", "Employee Name",
            "Employee Business Email Address", "Manager Employee ID" and
            "Global Career Band".
        output_file: Path of the Excel workbook to write, or ``None`` to skip
            writing and printing and only return the frames.

    Returns:
        ``(final_df, exceptions_df)``: the flattened hierarchy sorted by MD,
        GCB3, reporting manager and employee name, and the non-MD rows whose
        manager ID does not occur as an Employee ID.
        NOTE(review): a blank manager ID never occurs as an Employee ID, so
        such rows appear to be reported as exceptions too - confirm.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    def _band_key(value):
        # Bands arrive as int, float or text depending on the source file;
        # fold 3, 3.0, "3" and " 3 " onto the same key.
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    def _row(md, gcb3, rm, person):
        # One output record: hierarchy labels first, then the person's own columns.
        return {
            "MD Name": md[NAME_COL],
            "MD ID": md[ID_COL],
            "MD Email": md[EMAIL_COL],
            "GCB3 Name": "" if gcb3 is None else gcb3[NAME_COL],
            "GCB3 ID": "" if gcb3 is None else gcb3[ID_COL],
            "GCB3 Email": "" if gcb3 is None else gcb3[EMAIL_COL],
            "Reporting Manager Name": "" if rm is None else rm[NAME_COL],
            "Reporting Manager ID": "" if rm is None else rm[ID_COL],
            "Reporting Manager Email": "" if rm is None else rm[EMAIL_COL],
            **person.to_dict(),
        }

    band = enriched_df[GCB_COL].map(_band_key)
    is_md = band == "MD"
    is_gcb3 = band == "3"
    mgr_ids = enriched_df[MGR_ID_COL]

    final_rows = []
    for _, md in enriched_df[is_md].iterrows():
        under_md = mgr_ids == md[ID_COL]
        gcb3s = enriched_df[under_md & is_gcb3]

        if gcb3s.empty:
            # No GCB3 layer: every direct report hangs straight off the MD.
            for _, emp in enriched_df[under_md].iterrows():
                final_rows.append(_row(md, None, None, emp))
            continue

        for _, gcb3 in gcb3s.iterrows():
            final_rows.append(_row(md, gcb3, None, gcb3))

            for _, rm in enriched_df[mgr_ids == gcb3[ID_COL]].iterrows():
                # The manager's own row names the manager in the RM columns.
                final_rows.append(_row(md, gcb3, rm, rm))

                for _, emp in enriched_df[mgr_ids == rm[ID_COL]].iterrows():
                    final_rows.append(_row(md, gcb3, rm, emp))

    # Exceptions: manager ID not present among the employee IDs (MDs exempt).
    orphan_mgr_ids = set(mgr_ids) - set(enriched_df[ID_COL])
    exceptions_df = enriched_df[mgr_ids.isin(orphan_mgr_ids) & ~is_md]
    exceptions_df = exceptions_df.reset_index(drop=True)

    final_df = pd.DataFrame(final_rows)
    if not final_df.empty:
        # With no rows the frame has no columns, and sorting by name would
        # raise KeyError.
        final_df.sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            inplace=True,
            na_position="last",
        )

    if output_file is not None:
        with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
            final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
            if not exceptions_df.empty:
                exceptions_df.to_excel(
                    writer, index=False, sheet_name="Missing Managers"
                )
        print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")

    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations, resolved relative to the working directory.
    monthly_file, gha_file = "monthly.csv", "gha.xlsx"

    # Phase 2 consumes the frame that phase 1 returns.
    enriched = phase1_enrich(monthly_file=monthly_file, gha_file=gha_file)
    phase2_hierarchy(enriched_df=enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly extract with employee details from the GHA workbook.

    Args:
        monthly_file: Path of the monthly CSV; read with ``pd.read_csv``.
        gha_file: Path of the GHA workbook; the sheet
            "Headcount - Employee Detail" is read from it.
        output_file: Path of the Excel workbook to write.

    Returns:
        The monthly rows left-joined to the selected GHA columns on
        "Employee ID". Columns that exist on both sides keep the monthly
        value under the plain name; the GHA value gets a "_GHA" suffix.

    Raises:
        KeyError: If a required column is absent from the GHA sheet.

    Side effects:
        Writes sheet "Enriched" and, when at least one monthly row has no
        GHA counterpart, sheet "Missing in GHA" to ``output_file``; prints a
        completion message.
    """
    monthly_df = pd.read_csv(monthly_file)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Headers exported with stray spaces would otherwise break the lookups below.
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]
    absent = [col for col in gha_keep if col not in gha_df.columns]
    if absent:
        raise KeyError(f"GHA sheet is missing required columns: {absent}")
    gha_df = gha_df[gha_keep]

    # NOTE(review): an Employee ID repeated in GHA yields one output row per
    # repetition - confirm the GHA sheet is unique per employee.
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
        indicator="_gha_match",
    )

    # Use the merge indicator rather than a null career band: a blank band on
    # a matched employee is not a missing match, and the band column may come
    # from the monthly file when both inputs carry it.
    unmatched = merged_df.pop("_gha_match") == "left_only"
    missing = merged_df[unmatched]

    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Flatten the MD -> GCB3 -> reporting manager -> employee tree into rows.

    For every employee whose career band is "MD":

    * if no band-3 employee reports to the MD, each direct report becomes a
      row with blank GCB3 and reporting-manager columns;
    * otherwise each band-3 direct report becomes a row, followed by one row
      per person reporting to that GCB3 (labelled as their own reporting
      manager) and one row per person reporting to each of those.

    NOTE(review): when an MD has at least one band-3 report, the MD's other
    direct reports are not emitted, and nothing deeper than two levels below
    a GCB3 is emitted. This mirrors the existing behaviour - confirm it is
    intended.

    Args:
        enriched_df: Frame with the columns "Employee ID", "Employee Name",
            "Employee Business Email Address", "Manager Employee ID" and
            "Global Career Band".
        output_file: Path of the Excel workbook to write, or ``None`` to skip
            writing and printing and only return the frames.

    Returns:
        ``(final_df, exceptions_df)``: the flattened hierarchy sorted by MD,
        GCB3, reporting manager and employee name, and the non-MD rows whose
        manager ID does not occur as an Employee ID.
        NOTE(review): a blank manager ID never occurs as an Employee ID, so
        such rows appear to be reported as exceptions too - confirm.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    def _band_key(value):
        # Bands arrive as int, float or text depending on the source file;
        # fold 3, 3.0, "3" and " 3 " onto the same key.
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    def _row(md, gcb3, rm, person):
        # One output record: hierarchy labels first, then the person's own columns.
        return {
            "MD Name": md[NAME_COL],
            "MD ID": md[ID_COL],
            "MD Email": md[EMAIL_COL],
            "GCB3 Name": "" if gcb3 is None else gcb3[NAME_COL],
            "GCB3 ID": "" if gcb3 is None else gcb3[ID_COL],
            "GCB3 Email": "" if gcb3 is None else gcb3[EMAIL_COL],
            "Reporting Manager Name": "" if rm is None else rm[NAME_COL],
            "Reporting Manager ID": "" if rm is None else rm[ID_COL],
            "Reporting Manager Email": "" if rm is None else rm[EMAIL_COL],
            **person.to_dict(),
        }

    band = enriched_df[GCB_COL].map(_band_key)
    is_md = band == "MD"
    is_gcb3 = band == "3"
    mgr_ids = enriched_df[MGR_ID_COL]

    final_rows = []
    for _, md in enriched_df[is_md].iterrows():
        under_md = mgr_ids == md[ID_COL]
        gcb3s = enriched_df[under_md & is_gcb3]

        if gcb3s.empty:
            # No GCB3 layer: every direct report hangs straight off the MD.
            for _, emp in enriched_df[under_md].iterrows():
                final_rows.append(_row(md, None, None, emp))
            continue

        for _, gcb3 in gcb3s.iterrows():
            final_rows.append(_row(md, gcb3, None, gcb3))

            for _, rm in enriched_df[mgr_ids == gcb3[ID_COL]].iterrows():
                # The manager's own row names the manager in the RM columns.
                final_rows.append(_row(md, gcb3, rm, rm))

                for _, emp in enriched_df[mgr_ids == rm[ID_COL]].iterrows():
                    final_rows.append(_row(md, gcb3, rm, emp))

    # Exceptions: manager ID not present among the employee IDs (MDs exempt).
    orphan_mgr_ids = set(mgr_ids) - set(enriched_df[ID_COL])
    exceptions_df = enriched_df[mgr_ids.isin(orphan_mgr_ids) & ~is_md]
    exceptions_df = exceptions_df.reset_index(drop=True)

    final_df = pd.DataFrame(final_rows)
    if not final_df.empty:
        # With no rows the frame has no columns, and sorting by name would
        # raise KeyError.
        final_df.sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            inplace=True,
            na_position="last",
        )

    if output_file is not None:
        with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
            final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
            if not exceptions_df.empty:
                exceptions_df.to_excel(
                    writer, index=False, sheet_name="Missing Managers"
                )
        print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")

    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input locations, resolved relative to the working directory.
    monthly_file, gha_file = "monthly.csv", "gha.xlsx"

    # Phase 2 consumes the frame that phase 1 returns.
    enriched = phase1_enrich(monthly_file=monthly_file, gha_file=gha_file)
    phase2_hierarchy(enriched_df=enriched)


In [17]:
import pandas as pd

# Column layout of the sample roster; the two manager columns point one level up.
columns = [
    "Employee ID",
    "Employee Name",
    "Global Career Band",
    "Entity Manager Employee ID",
    "Entity Manager Employee Name",
]

# One record per employee. Dana has no manager, and Vilma's manager (504) is
# not in the roster.
data = [
    ["101", "Alice", "5", "201", "Bob"],
    ["201", "Bob", "4", "301", "Charlie"],
    ["301", "Charlie", "3", "401", "Dana"],
    ["401", "Dana", "2", None, None],
    ["302", "Eva", "5", "201", "Bob"],
    ["304", "Kirti", "5", "305", "Inish"],
    ["307", "Anshul", "4", "305", "Inish"],
    ["308", "Sowmya", "5", "307", "Anshul"],
    ["305", "Inish", "3", "306", "Vilma"],
    ["306", "Vilma", "2", "504", "Abhishek"],
]

df = pd.DataFrame(data=data, columns=columns)

# Employee ID -> record dict, captured before the output columns are added.
employee_lookup = df.set_index("Employee ID").to_dict(orient="index")

# Blank placeholders for the nearest GCB 4 and GCB 3 manager of each row.
for _blank_column in (
    "Entity Manager Employee ID_GCB 4",
    "Entity Manager Employee Name_GCB 4",
    "Entity Manager Employee ID_GCB 3",
    "Entity Manager Employee Name_GCB 3",
):
    df[_blank_column] = ""

# 🔁 Updated logic: Climb upward until GCB 4 & 3 found
def trace_managers_gcb_4_and_3(start_id, lookup=None):
    """Walk up the manager chain and pick the first GCB 4 and first GCB 3.

    The walk starts at ``start_id`` itself (it is inspected too) and follows
    each record's "Entity Manager Employee ID". It stops once both bands have
    been found, when the next ID is empty or unknown, or when an ID repeats.

    Args:
        start_id: Employee ID at which to start climbing.
        lookup: Mapping of employee ID to a record dict with the keys
            "Employee Name", "Global Career Band" and
            "Entity Manager Employee ID". Defaults to the module-level
            ``employee_lookup``.

    Returns:
        ``(gcb4, gcb3)`` where each item is an ``(employee_id, name)`` tuple
        or ``None`` when no employee of that band was met on the way up.
    """
    if lookup is None:
        lookup = employee_lookup

    gcb4 = None
    gcb3 = None
    visited = set()

    current_id = start_id
    # The `visited` check ends the walk on a cyclic chain (A -> B -> A),
    # which would otherwise never terminate.
    while current_id and current_id in lookup and current_id not in visited:
        visited.add(current_id)
        manager = lookup[current_id]
        gcb = str(manager.get("Global Career Band", "")).strip()

        if gcb4 is None and gcb == "4":
            gcb4 = (current_id, manager.get("Employee Name"))
        elif gcb3 is None and gcb == "3":
            gcb3 = (current_id, manager.get("Employee Name"))

        # Nothing left to look for once both levels are known.
        if gcb4 is not None and gcb3 is not None:
            break

        current_id = manager.get("Entity Manager Employee ID")

    return gcb4, gcb3

# Fill the GCB 4 / GCB 3 columns for every row that has a manager
for i, mgr_id in df["Entity Manager Employee ID"].items():
    if pd.isna(mgr_id):
        # No manager recorded, so the placeholders stay blank.
        continue

    gcb4, gcb3 = trace_managers_gcb_4_and_3(mgr_id)

    if gcb4:
        gcb4_id, gcb4_name = gcb4
        df.at[i, "Entity Manager Employee ID_GCB 4"] = gcb4_id
        df.at[i, "Entity Manager Employee Name_GCB 4"] = gcb4_name

    if gcb3:
        gcb3_id, gcb3_name = gcb3
        df.at[i, "Entity Manager Employee ID_GCB 3"] = gcb3_id
        df.at[i, "Entity Manager Employee Name_GCB 3"] = gcb3_name

# ✅ Show the finished table without the index column
print(df.to_string(index=False))


Employee ID Employee Name Global Career Band Entity Manager Employee ID Entity Manager Employee Name Entity Manager Employee ID_GCB 4 Entity Manager Employee Name_GCB 4 Entity Manager Employee ID_GCB 3 Entity Manager Employee Name_GCB 3
        101         Alice                  5                        201                          Bob                              201                                Bob                              301                            Charlie
        201           Bob                  4                        301                      Charlie                                                                                                  301                            Charlie
        301       Charlie                  3                        401                         Dana                                                                                                                                        
        401          Dana                  2        

In [18]:
# Write the current `df` to g1.xlsx, leaving the index out of the sheet.
df.to_excel("g1.xlsx", index=False)

In [None]:
wor