In [None]:
import pandas as pd
from collections import defaultdict

def title_case(col):
    """Return ``col`` with underscores turned into spaces and each word capitalized.

    Runs of whitespace collapse to a single space; every word is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).
    """
    words = col.replace('_', ' ').split()
    capitalized = [w.capitalize() for w in words]
    return ' '.join(capitalized)

def append_md_info_with_trace(flattened_file_path, gha_file_path,
                              flattened_sheet="flattened",
                              gha_sheet="Headcount Employee - Detail"):
    """Annotate each flattened row with the first MD found up its manager chain.

    Reads the flattened workbook and the GHA workbook, walks each row's
    ``entity manager employee id`` upward through the GHA records and fills
    four columns: ``md id``, ``md name``, ``md found?`` ('Yes'/'No') and
    ``md trace comment`` (why no MD was found, blank on success).

    Args:
        flattened_file_path: Excel file holding the flattened data.
        gha_file_path: Excel file holding the GHA headcount detail.
        flattened_sheet: Sheet name to read from the flattened file.
        gha_sheet: Sheet name to read from the GHA file.

    Returns:
        The annotated DataFrame with title-cased column names. The same frame
        is written to ``phase2_flattened_with_md_trace.xlsx``.
    """
    def _clean_id(value):
        # Excel columns containing blanks are read as float, so a plain
        # astype(str) yields '10.0' / 'nan' and the IDs never match each other.
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # Load files
    flat_df = pd.read_excel(flattened_file_path, sheet_name=flattened_sheet)
    gha_df = pd.read_excel(gha_file_path, sheet_name=gha_sheet)

    # Normalize columns
    flat_df.columns = flat_df.columns.str.strip().str.lower()
    gha_df.columns = gha_df.columns.str.strip().str.lower()

    # Normalize IDs: blank -> '', whole-number floats lose their '.0'
    for frame in (flat_df, gha_df):
        for id_col in ('employee id', 'entity manager employee id'):
            frame[id_col] = frame[id_col].map(_clean_id)

    gha_df['global career band'] = gha_df['global career band'].fillna('').astype(str).str.upper().str.strip()

    # Build multi-record GHA lookup (an employee may appear on several rows)
    gha_multi_lookup = defaultdict(list)
    for record in gha_df.to_dict('records'):
        if record['employee id']:  # rows without an ID cannot be anyone's manager
            gha_multi_lookup[record['employee id']].append(record)

    # Prepare output columns
    flat_df['md id'] = ''
    flat_df['md name'] = ''
    flat_df['md found?'] = 'No'
    flat_df['md trace comment'] = ''

    # Traverse each employee
    for idx, row in flat_df.iterrows():
        current_mgr_id = row['entity manager employee id']
        if not current_mgr_id:
            flat_df.at[idx, 'md trace comment'] = "No Entity Manager ID on record"
            continue

        visited = set()
        while True:
            if not current_mgr_id:
                # Reached the top of the chain without meeting an MD.
                flat_df.at[idx, 'md trace comment'] = "Manager chain ended without reaching an MD"
                break
            if current_mgr_id in visited:
                # Guard against circular reporting lines.
                flat_df.at[idx, 'md trace comment'] = f"Cycle detected in manager chain at: {current_mgr_id}"
                break
            visited.add(current_mgr_id)

            mgr_records = gha_multi_lookup.get(current_mgr_id, [])
            if not mgr_records:
                flat_df.at[idx, 'md trace comment'] = f"Entity Manager ID missing in GHA: {current_mgr_id}"
                break

            # Try to find MD among duplicates
            md_record = next((r for r in mgr_records if r.get('global career band', '') == 'MD'), None)
            if md_record:
                flat_df.at[idx, 'md id'] = current_mgr_id
                flat_df.at[idx, 'md name'] = md_record.get('employee name', '')
                flat_df.at[idx, 'md found?'] = 'Yes'
                break

            # Continue upward using first available manager
            current_mgr_id = mgr_records[0].get('entity manager employee id', '')

    # Restore column names
    flat_df.columns = [title_case(col) for col in flat_df.columns]

    # Save output
    flat_df.to_excel("phase2_flattened_with_md_trace.xlsx", index=False)
    print("✅ MD tracing complete. Output saved as 'phase2_flattened_with_md_trace.xlsx'.")
    return flat_df

# Run the MD trace for the phase-1 enriched workbook against the GHA extract
# (both paths are relative to the working directory).
append_md_info_with_trace("phase1_enriched_monthly.xlsx", "Input/gha_sep15.xlsx")

In [None]:
# Updated: all phases
import pandas as pd

def enrich_gha_with_manager_gcb(gha_file_path, sheet_name="Headcount Employee - Detail"):
    """Add each employee's entity manager career band to the GHA data.

    Args:
        gha_file_path: Excel file holding the GHA headcount detail.
        sheet_name: Sheet to read.

    Returns:
        The GHA DataFrame (lower-cased column names) with normalized IDs, an
        upper-cased ``global career band`` and a new ``entity manager gcb``
        column (blank when the manager is unknown). The same frame is written
        to ``gha_enriched_with_mgr_gcb.xlsx``.
    """
    def _clean_id(value):
        # Columns with blanks are read as float: avoid '10.0' / 'nan' strings
        # that would never match the employee-id keys.
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # Load GHA data
    df = pd.read_excel(gha_file_path, sheet_name=sheet_name)
    df.columns = df.columns.str.strip().str.lower()

    # Normalize IDs
    for id_col in ('employee id', 'entity manager employee id'):
        df[id_col] = df[id_col].map(_clean_id)

    # Normalize GCBs; astype(str) keeps numeric bands, which .str.upper()
    # alone would silently turn into NaN.
    df['global career band'] = df['global career band'].fillna('').astype(str).str.upper().str.strip()

    # Build lookup for manager GCB (for duplicated employee IDs the last row wins)
    gcb_lookup = df.set_index('employee id')['global career band'].to_dict()
    gcb_lookup.pop('', None)  # a blank manager ID must not match blank-ID rows

    # Map Entity Manager GCB
    df['entity manager gcb'] = df['entity manager employee id'].map(gcb_lookup).fillna('')

    # Save enriched GHA
    df.to_excel("gha_enriched_with_mgr_gcb.xlsx", index=False)
    print("✅ Phase 1 complete: Entity Manager GCB added.")
    return df
# Enrich the GHA extract with each employee's entity-manager career band.
enrich_gha_with_manager_gcb("Input/gha_sep15.xlsx")

In [None]:
# Employee rows are duplicated when an employee's manager changes across months
import pandas as pd

def get_reporting_tree(manager_id, df_lookup):
    """Return the IDs of everyone reporting, directly or indirectly, to ``manager_id``.

    Args:
        manager_id: Employee ID at the root of the subtree.
        df_lookup: Mapping of employee ID -> record dict; each record's
            ``'entity manager employee id'`` entry names that employee's manager.

    Returns:
        A set of employee IDs. ``manager_id`` itself is never included, even
        when the data contains a self-managed employee or a reporting cycle.
    """
    # Index direct reports once instead of rescanning the whole lookup per node.
    direct_reports = {}
    for emp_id, record in df_lookup.items():
        direct_reports.setdefault(record.get('entity manager employee id'), []).append(emp_id)

    reports = set()
    pending = [manager_id]
    while pending:
        current = pending.pop()
        for emp_id in direct_reports.get(current, ()):
            # The membership test doubles as cycle protection; the recursive
            # version never terminated on a self-managed employee or a loop.
            if emp_id != manager_id and emp_id not in reports:
                reports.add(emp_id)
                pending.append(emp_id)
    return reports

def phase3_analysis(flattened_file_path):
    """Build the phase-3 hierarchy and monthly trend summaries.

    Reads the ``flattened`` sheet, collapses multiple rows per employee into
    one, then produces:

    * a hierarchy summary: for every MD, each 'GCB 3' employee under that MD
      with the size of their full reporting subtree;
    * a monthly trend: headcount, number of 'yes' workstyle flags, percentage
      and month-over-month change.

    Side effects: writes ``phase3_multi_row_log.xlsx`` (employees that had
    several rows) and ``phase3_trend_analysis.xlsx`` (both summaries).

    Args:
        flattened_file_path: Excel file containing a ``flattened`` sheet.

    Returns:
        The trend DataFrame, including a ``Month_dt`` datetime column, sorted
        chronologically. Empty (but with all columns) when no monthly columns
        are present.
    """
    def _plain(col):
        # Monthly columns appear either as "apr-25 workstyle met?" or as
        # "apr-25 workstyle_met?"; compare on the space-separated form.
        return col.replace('_', ' ')

    # Load flattened data
    df = pd.read_excel(flattened_file_path, sheet_name="flattened")
    df.columns = df.columns.str.strip().str.lower()

    # Identify monthly columns
    monthly_cols = [col for col in df.columns
                    if any(x in _plain(col) for x in ['workstyle met?', 'avg days'])]

    # Static columns to preserve
    static_cols = ['employee id', 'employee name', 'global career band', 'md id', 'md name', 'md found?', 'entity manager employee id']

    # Normalize monthly columns (blank string marks a missing value from here on)
    if monthly_cols:
        df[monthly_cols] = df[monthly_cols].fillna('').astype(str).apply(lambda x: x.str.strip().str.lower())

    # Log duplicates
    dupes = df['employee id'].value_counts()
    multi_row_employees = dupes[dupes > 1].index.tolist()
    print(f"🔍 Employees with multiple rows due to manager changes: {len(multi_row_employees)}")

    # Save log
    pd.DataFrame({'employee id': multi_row_employees}).to_excel("phase3_multi_row_log.xlsx", index=False)

    # Merge rows per employee: last non-null static value, first non-blank monthly value
    grouped = df.groupby('employee id')
    merged_rows = []
    for emp_id, group in grouped:
        merged = {'employee id': emp_id}
        for col in static_cols:
            if col in group.columns:
                merged[col] = group[col].dropna().iloc[-1] if not group[col].dropna().empty else ''
        for col in monthly_cols:
            values = group[col].replace('', pd.NA).dropna()
            merged[col] = values.iloc[0] if not values.empty else ''
        merged_rows.append(merged)

    df_merged = pd.DataFrame(merged_rows)

    # Build lookup
    df_lookup = df_merged.set_index('employee id').to_dict('index')

    # 1️⃣ Hierarchy Summary
    df_merged['global career band'] = df_merged['global career band'].fillna('').str.upper()
    df_merged['md name'] = df_merged['md name'].fillna('').str.strip()

    hierarchy_summary = []
    for md in df_merged['md name'].unique():
        if not md:
            # Blank means "no MD found"; it is not an MD of its own.
            continue
        md_df = df_merged[df_merged['md name'] == md]
        gcb3_df = md_df[md_df['global career band'] == 'GCB 3']
        for _, gcb3 in gcb3_df.iterrows():
            gcb3_id = gcb3['employee id']
            gcb3_name = gcb3['employee name']
            full_subtree = get_reporting_tree(gcb3_id, df_lookup)
            hierarchy_summary.append({
                'MD Name': md,
                'GCB3 ID': gcb3_id,
                'GCB3 Name': gcb3_name,
                'Total Reports Under GCB3': len(full_subtree)
            })
    hierarchy_df = pd.DataFrame(
        hierarchy_summary,
        columns=['MD Name', 'GCB3 ID', 'GCB3 Name', 'Total Reports Under GCB3'])

    # 2️⃣ Monthly Trend Summary
    trend_data = []
    for col in monthly_cols:
        if 'workstyle met?' not in _plain(col):
            continue
        month = col.split()[0]
        # The month and the metric are separated by a space in the column
        # name; only the metric part may use underscores.
        wanted = f"{month} avg days at any office per week (with shrinkage)"
        avg_col = next((c for c in monthly_cols if _plain(c) == wanted), None)
        if avg_col is None:
            continue
        # Values were stringified above, so "present" means non-blank.
        headcount = df_merged[avg_col].astype(str).str.strip().ne('').sum()
        met_count = df_merged[col].str.strip().str.lower().eq('yes').sum()
        pct = round((met_count / headcount) * 100, 2) if headcount else 0
        trend_data.append({
            'Month': month.title(),  # columns were lower-cased; restore 'Apr-25'
            'Headcount': headcount,
            'Workstyle Met': met_count,
            'Percentage': pct
        })

    # Explicit columns keep the frame usable when no month matched.
    trend_df = pd.DataFrame(trend_data, columns=['Month', 'Headcount', 'Workstyle Met', 'Percentage'])
    trend_df['Month_dt'] = pd.to_datetime(trend_df['Month'], format='%b-%y')
    trend_df = trend_df.sort_values('Month_dt')
    trend_df['MoM Change (%)'] = trend_df['Percentage'].diff().round(2)

    # Save outputs
    with pd.ExcelWriter("phase3_trend_analysis.xlsx", engine='openpyxl') as writer:
        hierarchy_df.to_excel(writer, sheet_name="Hierarchy Summary", index=False)
        trend_df.drop(columns='Month_dt').to_excel(writer, sheet_name="Monthly Trend", index=False)

    print("✅ Phase 3 summary complete. Use draw_phase3_graph(trend_df) to visualize.")
    return trend_df

import matplotlib.pyplot as plt

def draw_phase3_graph(trend_df):
    """Plot the 'Percentage' column against 'Month_dt' and save it as a PNG.

    Args:
        trend_df: DataFrame with ``Month_dt`` (x axis) and ``Percentage``
            (y axis) columns.

    The chart is written to ``workstyle_met_trend.png`` and then shown.
    """
    fig = plt.figure(figsize=(10, 5))
    ax = fig.gca()
    ax.plot(trend_df['Month_dt'], trend_df['Percentage'], marker='o', color='blue')
    ax.set_title('Workstyle Met % Trend Over Time')
    ax.set_xlabel('Month')
    ax.set_ylabel('% Workstyle Met')
    ax.grid(True)
    fig.tight_layout()
    fig.savefig("workstyle_met_trend.png")
    plt.show()
    print("📈 Graph saved as 'workstyle_met_trend.png'.")
    
# NOTE(review): phase3_analysis and draw_phase3_graph are already defined above in
# this cell. These imports need same-named modules on the import path and, when
# they resolve, rebind both names to the imported objects — confirm they are intended.
from phase3_analysis import phase3_analysis
from draw_phase3_graph import draw_phase3_graph

# Build the phase-3 summaries, then chart the returned trend table.
trend_df = phase3_analysis("phase2_flattened_with_md.xlsx")
draw_phase3_graph(trend_df)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def get_reporting_tree(manager_id, df_lookup):
    """Collect every direct and indirect report of ``manager_id``.

    Args:
        manager_id: Employee ID whose subtree is wanted.
        df_lookup: Mapping of employee ID -> record dict; the record's
            ``'entity manager employee id'`` value is that employee's manager.

    Returns:
        Set of employee IDs below ``manager_id``. The manager is never part of
        the result, and self-managed employees or circular reporting lines do
        not cause endless recursion.
    """
    # Group employees by manager once so each node is expanded in O(children).
    reports_by_manager = {}
    for emp_id, record in df_lookup.items():
        reports_by_manager.setdefault(record.get('entity manager employee id'), []).append(emp_id)

    found = set()
    to_expand = [manager_id]
    while to_expand:
        current = to_expand.pop()
        for emp_id in reports_by_manager.get(current, ()):
            # Skipping already-seen IDs (and the root) is what breaks cycles.
            if emp_id != manager_id and emp_id not in found:
                found.add(emp_id)
                to_expand.append(emp_id)
    return found

def phase3_analysis(flattened_file_path):
    """Build the phase-3 hierarchy summary, monthly trend table and trend chart.

    Reads the ``flattened`` sheet and produces:

    * a hierarchy summary: for every MD, each 'GCB 3' employee under that MD
      with the size of their full reporting subtree;
    * a monthly trend: headcount, number of 'yes' workstyle flags, percentage
      and month-over-month change, plotted to ``workstyle_met_trend.png``.

    Both tables are written to ``phase3_trend_analysis.xlsx``.

    Args:
        flattened_file_path: Excel file containing a ``flattened`` sheet.

    Returns:
        None.
    """
    def _plain(col):
        # Monthly columns appear either as "apr-25 workstyle met?" or as
        # "apr-25 workstyle_met?"; compare on the space-separated form.
        return col.replace('_', ' ')

    # Load flattened data
    df = pd.read_excel(flattened_file_path, sheet_name="flattened")
    df.columns = df.columns.str.strip().str.lower()

    # Normalize key columns
    df['employee id'] = df['employee id'].astype(str).str.strip()
    df['entity manager employee id'] = df['entity manager employee id'].astype(str).str.strip()
    df['md name'] = df['md name'].fillna('').astype(str).str.strip()
    df['global career band'] = df['global career band'].fillna('').astype(str).str.upper()

    # One record per employee: to_dict('index') rejects a non-unique index and
    # an employee can appear on several rows. The last row wins.
    people = df.drop_duplicates(subset='employee id', keep='last')
    df_lookup = people.set_index('employee id').to_dict('index')

    # 1️⃣ Hierarchy Summary: MD → GCB3 → full subtree count
    hierarchy_summary = []
    for md in people['md name'].unique():
        if not md:
            # Blank means "no MD found"; it is not an MD of its own.
            continue
        md_df = people[people['md name'] == md]
        gcb3_df = md_df[md_df['global career band'] == 'GCB 3']
        for _, gcb3 in gcb3_df.iterrows():
            gcb3_id = gcb3['employee id']
            gcb3_name = gcb3['employee name']
            full_subtree = get_reporting_tree(gcb3_id, df_lookup)
            hierarchy_summary.append({
                'MD Name': md,
                'GCB3 ID': gcb3_id,
                'GCB3 Name': gcb3_name,
                'Total Reports Under GCB3': len(full_subtree)
            })
    hierarchy_df = pd.DataFrame(
        hierarchy_summary,
        columns=['MD Name', 'GCB3 ID', 'GCB3 Name', 'Total Reports Under GCB3'])

    # 2️⃣ Monthly Headcount & Workstyle Met
    month_cols = [col for col in df.columns if 'workstyle met?' in _plain(col)]
    trend_data = []
    for col in month_cols:
        month = col.split()[0]  # e.g. 'apr-25' (column names were lower-cased)
        # The month and the metric are separated by a space in the column
        # name; only the metric part may use underscores.
        wanted = f"{month} avg days at any office per week (with shrinkage)"
        avg_col = next((c for c in df.columns if _plain(c) == wanted), None)
        if avg_col is None:
            continue
        headcount = df[avg_col].notna().sum()
        met_count = df[col].fillna('').astype(str).str.strip().str.lower().eq('yes').sum()
        pct = round((met_count / headcount) * 100, 2) if headcount else 0
        trend_data.append({
            'Month': month.title(),
            'Headcount': headcount,
            'Workstyle Met': met_count,
            'Percentage': pct
        })
    # Explicit columns keep the frame usable when no month matched.
    trend_df = pd.DataFrame(trend_data, columns=['Month', 'Headcount', 'Workstyle Met', 'Percentage'])
    trend_df['Month_dt'] = pd.to_datetime(trend_df['Month'], format='%b-%y')
    trend_df = trend_df.sort_values('Month_dt')
    trend_df['MoM Change (%)'] = trend_df['Percentage'].diff().round(2)

    # 3️⃣ Plot Trend
    plt.figure(figsize=(10, 5))
    plt.plot(trend_df['Month_dt'], trend_df['Percentage'], marker='o', color='blue')
    plt.title('Workstyle Met % Trend Over Time')
    plt.xlabel('Month')
    plt.ylabel('% Workstyle Met')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("workstyle_met_trend.png")
    plt.show()

    # Save outputs
    with pd.ExcelWriter("phase3_trend_analysis.xlsx", engine='openpyxl') as writer:
        hierarchy_df.to_excel(writer, sheet_name="Hierarchy Summary", index=False)
        trend_df.drop(columns='Month_dt').to_excel(writer, sheet_name="Monthly Trend", index=False)

    print("✅ Phase 3 complete. Graph saved as 'workstyle_met_trend.png'.")
    
# Run phase 3 on the MD-annotated flattened workbook. phase3_analysis takes only
# the flattened file path; the earlier two-argument form is kept for reference:
# phase3_analysis("phase2_flattened_with_md.xlsx", "Input/gha_sep15.xlsx")
phase3_analysis("phase2_flattened_with_md.xlsx")

In [None]:
# standalone function to find md in gha
import pandas as pd

def trace_md_from_gha(gha_file_path, sheet_name="Headcount Employee - Detail"):
    """Find, for every GHA employee, the first MD up their entity-manager chain.

    Args:
        gha_file_path: Excel file holding the GHA headcount detail.
        sheet_name: Sheet to read.

    Returns:
        ``(df_with_md, df_missing_md)``:

        * ``df_with_md`` – rows for which an MD was found, with ``md id``,
          ``md name`` and ``md found?`` ('Yes') filled in;
        * ``df_missing_md`` – rows without an MD, plus ``last reachable
          manager id``, ``last reachable manager name`` and ``missing reason``.

    Raises:
        KeyError: if the sheet lacks one of the required columns.
    """
    def _clean_id(value):
        # Columns with blanks are read as float: avoid '10.0' / 'nan' strings
        # that would never match the employee-id keys.
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # Load GHA data
    df = pd.read_excel(gha_file_path, sheet_name=sheet_name)
    df.columns = df.columns.str.strip().str.lower()

    # Normalize IDs (blank -> '')
    for id_col in ('employee id', 'entity manager employee id'):
        df[id_col] = df[id_col].map(_clean_id)

    # Select required columns
    required_cols = [
        'employee id', 'employee name', 'global career band',
        'employee business email address', 'legal entity',
        'entity manager employee name', 'functional manager employee name',
        'employee category', 'bf level1 name', 'bf level2 name',
        'bf level3 name', 'bf level4 name', 'bf level5 name',
        'entity manager employee id'
    ]
    df = df[required_cols].copy()

    # Build lookup dictionary. to_dict('index') needs unique keys, so keep the
    # last row per employee ID and ignore rows without an ID.
    lookup_df = df[df['employee id'] != ''].drop_duplicates(subset='employee id', keep='last')
    gha_lookup = lookup_df.set_index('employee id').to_dict('index')

    # Identify all MDs
    md_ids = set(df[df['global career band'].astype(str).str.upper().str.strip() == 'MD']['employee id'])

    # Prepare output columns
    df['md id'] = ''
    df['md name'] = ''
    df['md found?'] = 'No'

    missing_records = []

    for idx, row in df.iterrows():
        current_mgr_id = row.get('entity manager employee id')
        visited = set()  # guards against circular reporting lines
        last_known_mgr_id = None
        last_known_mgr_name = None
        md_found = False

        while current_mgr_id and current_mgr_id not in visited:
            visited.add(current_mgr_id)
            mgr_record = gha_lookup.get(current_mgr_id)

            if mgr_record is None:
                break

            last_known_mgr_id = current_mgr_id
            last_known_mgr_name = mgr_record.get('employee name', '')

            if current_mgr_id in md_ids:
                df.at[idx, 'md id'] = current_mgr_id
                df.at[idx, 'md name'] = last_known_mgr_name
                df.at[idx, 'md found?'] = 'Yes'
                md_found = True
                break

            current_mgr_id = mgr_record.get('entity manager employee id')

        if not md_found:
            missing_row = row.to_dict()
            missing_row.update({
                'last reachable manager id': last_known_mgr_id,
                'last reachable manager name': last_known_mgr_name,
                'missing reason': 'No MD found in chain'
            })
            missing_records.append(missing_row)

    # Return both DataFrames
    df_with_md = df[df['md found?'] == 'Yes'].copy()
    df_missing_md = pd.DataFrame(missing_records)

    return df_with_md, df_missing_md
# Trace MDs across the GHA extract: df_md holds rows with an MD, df_missing the rest.
df_md, df_missing = trace_md_from_gha("Input/gha_sep15.xlsx")

# Save to Excel, one sheet per outcome
with pd.ExcelWriter("gha_md_trace.xlsx", engine='openpyxl') as writer:
    df_md.to_excel(writer, sheet_name="md found", index=False)
    df_missing.to_excel(writer, sheet_name="md missing", index=False)
    
def neutralize_column_names(df):
    """Title-case the column names of ``df`` in place and return the same frame.

    Underscores become spaces, repeated whitespace collapses, and each word is
    capitalized (first letter upper-case, the rest lower-case).
    """
    renamed = []
    for col in df.columns:
        words = col.replace('_', ' ').split()
        renamed.append(' '.join(w.capitalize() for w in words))
    df.columns = renamed
    return df

# After loading or transforming your DataFrame
# NOTE(review): no top-level `df` is assigned in this cell (only df_md and
# df_missing are) — this relies on a `df` left in the kernel by another cell;
# confirm which frame is meant.
df = neutralize_column_names(df)

# Then save or display
df.to_excel("cleaned_output.xlsx", index=False)

In [None]:
import pandas as pd

def trace_md_and_flag_missing(flattened_file_path, gha_file_path):
    """Add MD information to the flattened data and list rows with no MD.

    For each flattened row the entity-manager chain is followed through the
    GHA data until a manager whose career band is 'MD' is found.

    Writes ``phase2_flattened_with_md.xlsx`` with two sheets:

    * ``flattened`` – the input rows plus ``md id``, ``md name``, ``md found?``;
    * ``missing md chain`` – rows without an MD, plus the last manager that
      could be resolved and a ``missing reason``.

    Args:
        flattened_file_path: Excel file containing a ``flattened`` sheet.
        gha_file_path: Excel file containing the GHA headcount detail sheet.

    Returns:
        None.
    """
    def _clean_id(value):
        # Columns with blanks are read as float: avoid '10.0' / 'nan' strings
        # that would never match the employee-id keys.
        if pd.isna(value):
            return ''
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    # Load flattened data
    flat_df = pd.read_excel(flattened_file_path, sheet_name="flattened")
    flat_df.columns = flat_df.columns.str.strip().str.lower()

    # Load GHA data
    gha_df = pd.read_excel(gha_file_path, sheet_name="Headcount Employee - Detail")
    gha_df.columns = gha_df.columns.str.strip().str.lower()

    # Normalize IDs to string (blank -> '')
    for frame in (flat_df, gha_df):
        for id_col in ('employee id', 'entity manager employee id'):
            frame[id_col] = frame[id_col].map(_clean_id)

    # Deduplicate only after normalizing: IDs that differ just by whitespace
    # or dtype would otherwise survive and break the unique-index lookup below.
    gha_df = gha_df[gha_df['employee id'] != ''].drop_duplicates(subset='employee id', keep='last')

    # Build lookup dictionary
    gha_lookup = gha_df.set_index('employee id').to_dict('index')

    # Identify all MDs
    md_df = gha_df[gha_df['global career band'].astype(str).str.strip().str.upper() == 'MD']
    md_ids = set(md_df['employee id'])

    # Prepare output columns
    flat_df['md id'] = ''
    flat_df['md name'] = ''
    flat_df['md found?'] = 'No'

    missing_records = []

    for idx, row in flat_df.iterrows():
        current_mgr_id = row.get('entity manager employee id')
        visited = set()  # guards against circular reporting lines
        last_known_mgr_id = None
        last_known_mgr_name = None
        md_found = False

        while current_mgr_id and current_mgr_id not in visited:
            visited.add(current_mgr_id)
            mgr_record = gha_lookup.get(current_mgr_id)

            if mgr_record is None:
                break

            last_known_mgr_id = current_mgr_id
            last_known_mgr_name = mgr_record.get('employee name', '')

            if current_mgr_id in md_ids:
                flat_df.at[idx, 'md id'] = current_mgr_id
                flat_df.at[idx, 'md name'] = last_known_mgr_name
                flat_df.at[idx, 'md found?'] = 'Yes'
                md_found = True
                break

            current_mgr_id = mgr_record.get('entity manager employee id')

        if not md_found:
            missing_row = row.to_dict()
            missing_row.update({
                'last reachable manager id': last_known_mgr_id,
                'last reachable manager name': last_known_mgr_name,
                'missing reason': 'No MD found in chain'
            })
            missing_records.append(missing_row)

    # Save both sheets
    with pd.ExcelWriter("phase2_flattened_with_md.xlsx", engine='openpyxl') as writer:
        flat_df.to_excel(writer, sheet_name="flattened", index=False)
        pd.DataFrame(missing_records).to_excel(writer, sheet_name="missing md chain", index=False)

    print(f"✅ MD tracing complete: {flat_df['md found?'].value_counts().to_dict()}")
# Annotate the flattened workbook with MD details and log rows without an MD.
trace_md_and_flag_missing("phase2_flattened.xlsx", "Input/gha_sep15.xlsx")

In [None]:
## Fine for MD finding
import pandas as pd

def append_md_info(flattened_file_path, gha_file_path):
    """Attach the nearest MD (ID and name) to every row of the flattened data.

    Each row's entity manager is looked up in the de-duplicated GHA data and
    the chain is followed upward until an employee whose career band is 'MD'
    is reached, the chain breaks, or a manager repeats.

    The result (input columns plus ``md id``, ``md name``, ``md found?``) is
    written to the ``flattened`` sheet of ``phase2_flattened_with_md.xlsx``.

    Args:
        flattened_file_path: Excel file containing a ``flattened`` sheet.
        gha_file_path: Excel file containing the GHA headcount detail sheet.
    """
    # Flattened data
    flat_df = pd.read_excel(flattened_file_path, sheet_name="flattened")
    flat_df.columns = flat_df.columns.str.strip().str.lower()

    # GHA data, one row per employee (last occurrence wins)
    gha_df = pd.read_excel(gha_file_path, sheet_name="Headcount Employee - Detail")
    gha_df.columns = gha_df.columns.str.strip().str.lower()
    gha_df = gha_df.drop_duplicates(subset='employee id', keep='last')

    # employee id -> full GHA record
    gha_lookup = gha_df.set_index('employee id').to_dict('index')

    # IDs of everyone whose band is MD
    is_md = gha_df['global career band'].str.strip().str.upper() == 'MD'
    md_ids = set(gha_df.loc[is_md, 'employee id'])

    # Output columns start out as "not found"
    flat_df['md id'] = ''
    flat_df['md name'] = ''
    flat_df['md found?'] = 'No'

    for idx, row in flat_df.iterrows():
        mgr_id = row.get('entity manager employee id')
        seen = set()

        while True:
            # Stop on a missing manager ID or when the chain loops back.
            if pd.isna(mgr_id) or mgr_id in seen:
                break
            seen.add(mgr_id)

            mgr_record = gha_lookup.get(mgr_id)
            if mgr_record is None:
                break

            if mgr_id in md_ids:
                flat_df.at[idx, 'md id'] = mgr_id
                flat_df.at[idx, 'md name'] = mgr_record.get('employee name', '')
                flat_df.at[idx, 'md found?'] = 'Yes'
                break

            mgr_id = mgr_record.get('entity manager employee id')

    # Save updated file
    flat_df.to_excel("phase2_flattened_with_md.xlsx", sheet_name="flattened", index=False)
    print(f"✅ MD tracing complete: {flat_df['md found?'].value_counts().to_dict()}")
    
# Add MD ID / name columns to the flattened workbook using the GHA extract.
append_md_info("phase2_flattened.xlsx", "Input/gha_sep15.xlsx")

In [None]:
# perfect
import pandas as pd

def flatten_monthly_data(enriched_file_path):
    """Pivot the enriched monthly rows into one wide row per employee.

    Reads the ``enriched_data`` sheet, pivots the two monthly metrics on the
    ``month`` column (first value per employee and month) and orders the
    resulting columns chronologically, grouped by month. Columns are named
    ``"<Month> <metric_with_underscores>"``.

    The result is written to the ``flattened`` sheet of
    ``phase2_flattened.xlsx``.

    Args:
        enriched_file_path: Excel file containing an ``enriched_data`` sheet.
    """
    df = pd.read_excel(enriched_file_path, sheet_name="enriched_data")
    df.columns = df.columns.str.strip().str.lower()

    # Month labels such as 'Feb-25', plus a parsed date used only for sorting
    df['month'] = df['month'].str.strip().str.title()
    df['month_dt'] = pd.to_datetime(df['month'], format='%b-%y')

    # Columns that identify an employee (pivot index)
    static_cols = [
        'employee id', 'employee name', 'entity manager employee id',
        'entity manager employee name', 'global career band'
    ]

    # Per-month metrics (pivot values)
    dynamic_metrics = [
        'avg days at any office per week (with shrinkage)',
        'workstyle met?'
    ]

    pivot_df = df.pivot_table(
        index=static_cols,
        columns='month',
        values=dynamic_metrics,
        aggfunc='first'
    )

    # Month labels in chronological order
    month_order = df[['month', 'month_dt']].drop_duplicates().sort_values('month_dt')
    sorted_months = month_order['month'].tolist()

    # Column order: [Feb-25 avg, Feb-25 workstyle, Mar-25 avg, …]
    ordered_cols = [
        (metric.lower(), month)
        for month in sorted_months
        for metric in dynamic_metrics
        if (metric.lower(), month) in pivot_df.columns
    ]

    # Reorder, flatten the two-level header and bring the index back as columns
    pivot_df = pivot_df[ordered_cols]
    pivot_df.columns = [
        f"{month} {metric.replace(' ', '_')}"
        for metric, month in pivot_df.columns
    ]
    pivot_df = pivot_df.reset_index()

    # Save to Excel
    pivot_df.to_excel("phase2_flattened.xlsx", sheet_name="flattened", index=False)
    print(f"✅ Flattened file created with grouped monthly columns.")

In [None]:
import pandas as pd

def build_hierarchy_to_muthu(enriched_file_path, gha_file_path, md_name="muthu"):
    """Report which enriched rows roll up to one specific MD.

    The target MD is the GHA employee whose career band is 'MD' and whose name
    equals ``md_name`` (case-insensitive, surrounding whitespace ignored). Each
    enriched row's entity-manager chain is followed through the de-duplicated
    GHA data:

    * rows whose chain reaches the target MD go to the ``hierarchy report``
      sheet, with ``md id``, ``md name`` and the immediate manager's details;
    * rows whose chain hits a manager ID absent from GHA go to the
      ``missing mgr in gha`` sheet;
    * rows whose chain ends or loops without reaching the target MD are
      omitted from both sheets.

    Output is written to ``phase2_hierarchy.xlsx``.

    Args:
        enriched_file_path: Excel file containing an ``enriched_data`` sheet.
        gha_file_path: Excel file containing the GHA headcount detail sheet.
        md_name: Employee name of the target MD. Defaults to ``"muthu"``.

    Returns:
        None. Nothing is written when the target MD is not found in GHA.
    """
    # Load enriched data
    enriched_df = pd.read_excel(enriched_file_path, sheet_name="enriched_data")
    enriched_df.columns = enriched_df.columns.str.strip().str.lower()

    # Load GHA data
    gha_df = pd.read_excel(gha_file_path, sheet_name="Headcount Employee - Detail")
    gha_df.columns = gha_df.columns.str.strip().str.lower()

    # Deduplicate GHA
    gha_df = gha_df.drop_duplicates(subset='employee id', keep='last')

    # Build lookup dictionary and valid ID set
    gha_lookup = gha_df.set_index('employee id').to_dict('index')
    valid_mgr_ids = set(gha_df['employee id'])

    # Identify the target MD's employee ID
    wanted_name = md_name.strip().lower()
    muthu_df = gha_df[
        (gha_df['global career band'].str.strip().str.upper() == 'MD') &
        (gha_df['employee name'].str.strip().str.lower() == wanted_name)
    ]
    if muthu_df.empty:
        print(f"❗ {md_name.strip().title()} not found in GHA file.")
        return
    muthu_id = muthu_df['employee id'].iloc[0]
    muthu_name = muthu_df['employee name'].iloc[0]

    hierarchy_records = []
    missing_chain_records = []

    for _, row in enriched_df.iterrows():
        current_mgr_id = row.get('entity manager employee id')
        visited = set()  # guards against circular reporting lines
        reached_muthu = False
        missing_mgr_id = None
        missing_mgr_name = None

        # Capture immediate manager info from enriched row
        immediate_mgr_id = current_mgr_id
        immediate_mgr_name = row.get('entity manager employee name')
        immediate_mgr_gcb = gha_lookup.get(immediate_mgr_id, {}).get('global career band', '')

        while pd.notna(current_mgr_id) and current_mgr_id not in visited:
            visited.add(current_mgr_id)

            if current_mgr_id not in valid_mgr_ids:
                # First missing manager in chain; its name is only known when
                # it is the row's own immediate manager.
                missing_mgr_id = current_mgr_id
                missing_mgr_name = (
                    immediate_mgr_name if current_mgr_id == immediate_mgr_id else ''
                )
                break

            if current_mgr_id == muthu_id:
                reached_muthu = True
                break

            current_mgr_id = gha_lookup[current_mgr_id].get('entity manager employee id')

        if reached_muthu:
            enriched_row = row.to_dict()
            enriched_row.update({
                'md id': muthu_id,
                'md name': muthu_name,
                'entity manager employee id': immediate_mgr_id,
                'entity manager employee name': immediate_mgr_name,
                'entity manager gcb': immediate_mgr_gcb
            })
            hierarchy_records.append(enriched_row)
        elif missing_mgr_id is not None:
            # Identity check rather than truthiness so a falsy ID (e.g. 0) is kept.
            missing_row = row.to_dict()
            missing_row.update({
                'missing manager id': missing_mgr_id,
                'missing manager name': missing_mgr_name,
                'missing reason': 'Manager not found in GHA'
            })
            missing_chain_records.append(missing_row)

    # Save to Excel
    hierarchy_df = pd.DataFrame(hierarchy_records)
    missing_df = pd.DataFrame(missing_chain_records)

    with pd.ExcelWriter("phase2_hierarchy.xlsx", engine='openpyxl') as writer:
        hierarchy_df.to_excel(writer, sheet_name="hierarchy report", index=False)
        missing_df.to_excel(writer, sheet_name="missing mgr in gha", index=False)

    print("✅ Phase 2 complete: 'phase2_hierarchy.xlsx' created.")

In [None]:
import pandas as pd

def enrich_monthly_data(monthly_csv_path, gha_excel_path):
    """Left-join GHA attributes onto the monthly tracker rows.

    Writes ``phase1_enriched.xlsx`` with two sheets:

    * ``enriched_data`` – every monthly row, with GHA columns where available;
    * ``missing in gha`` – monthly rows whose employee ID does not occur in
      the GHA data.

    Args:
        monthly_csv_path: CSV file with the monthly tracker.
        gha_excel_path: Excel file containing the GHA headcount detail sheet.

    Returns:
        None.

    Raises:
        KeyError: if the GHA sheet lacks one of the required columns.
    """
    # Load monthly tracker CSV
    monthly_df = pd.read_csv(monthly_csv_path)

    # Load GHA master Excel sheet
    gha_df = pd.read_excel(gha_excel_path, sheet_name="Headcount Employee - Detail")

    # Standardize column names
    monthly_df.columns = monthly_df.columns.str.strip().str.lower()
    gha_df.columns = gha_df.columns.str.strip().str.lower()

    # Deduplicate GHA data by Employee ID (keep last occurrence)
    gha_df = gha_df.drop_duplicates(subset='employee id', keep='last')

    # Define columns to bring from GHA
    gha_columns = [
        'employee id', 'global career band',
        'bf level 1 name', 'bf level 2 name', 'bf level 3 name',
        'bf level 4 name', 'bf level 5 name',
        'legal entity name', 'entity manager employee id', 'entity manager employee name'
    ]

    # Filter GHA to required columns
    gha_filtered = gha_df[gha_columns]

    # Merge monthly data with GHA data (all monthly rows are kept)
    enriched_df = monthly_df.merge(
        gha_filtered,
        on='employee id',
        how='left',
        suffixes=('', '_gha')
    )

    # Identify rows whose employee is absent from GHA. Test ID membership
    # rather than a blank career band: an employee present in GHA with an
    # empty band is not "missing in gha".
    in_gha = enriched_df['employee id'].isin(gha_filtered['employee id'])
    missing_df = enriched_df[~in_gha].copy()

    # Save to Excel with two sheets
    with pd.ExcelWriter("phase1_enriched.xlsx", engine='openpyxl') as writer:
        enriched_df.to_excel(writer, sheet_name="enriched_data", index=False)
        missing_df.to_excel(writer, sheet_name="missing in gha", index=False)

    print("✅ Phase 1 complete: 'phase1_enriched.xlsx' created with enriched and missing sheets.")
# Phase 1: enrich the monthly tracker CSV with GHA attributes.
enrich_monthly_data("Input/monthlysep.csv", "Input/gha_sep15.xlsx")

In [None]:
import pandas as pd

def enrich_employee_data(monthly_csv_path, gha_file_path):
    """Merge GHA details into the monthly data and list what could not be matched.

    Writes ``phase1_enriched.xlsx`` with two sheets:

    * ``Enriched`` – every monthly row with the GHA columns joined on
      ``employee id`` plus a boolean ``manager_missing`` flag;
    * ``Missing Info`` – rows with no career band after the join
      ('Employee not found in GHA') followed by rows whose entity manager ID
      is not a GHA employee ID ('Manager not found in GHA').

    Args:
        monthly_csv_path: CSV file with the monthly employee data.
        gha_file_path: Excel file containing the GHA sheet.
    """
    monthly_df = pd.read_csv(monthly_csv_path)
    gha_df = pd.read_excel(gha_file_path, sheet_name="Headcount – Employee Detail")

    # Same header clean-up for both inputs
    for frame in (monthly_df, gha_df):
        frame.columns = frame.columns.str.strip().str.lower()

    # One GHA row per employee; the last occurrence wins
    gha_df = gha_df.drop_duplicates(subset='employee id', keep='last')

    # Left join so every monthly row survives
    gha_subset = gha_df[['employee id', 'employee name', 'entity manager employee id', 'global career band', 'bf levels', 'email']]
    enriched_df = monthly_df.merge(
        gha_subset,
        on='employee id',
        how='left',
        suffixes=('', '_gha')
    )

    # Employees with no career band after the join (taken before the
    # manager_missing flag is added, so this frame lacks that column)
    no_band = enriched_df['global career band'].isna()
    missing_employee_df = enriched_df.loc[no_band].copy()
    missing_employee_df['missing_reason'] = 'Employee not found in GHA'

    # Managers whose ID is not a GHA employee ID
    gha_ids = set(gha_df['employee id'])
    enriched_df['manager_missing'] = ~enriched_df['entity manager employee id'].isin(gha_ids)
    missing_manager_df = enriched_df.loc[enriched_df['manager_missing']].copy()
    missing_manager_df['missing_reason'] = 'Manager not found in GHA'

    # Employee problems first, then manager problems
    missing_combined_df = pd.concat([missing_employee_df, missing_manager_df], ignore_index=True)

    with pd.ExcelWriter("phase1_enriched.xlsx", engine='openpyxl') as writer:
        enriched_df.to_excel(writer, sheet_name="Enriched", index=False)
        missing_combined_df.to_excel(writer, sheet_name="Missing Info", index=False)

In [None]:
import pandas as pd

def generate_phase2_summary_df(monthly_df, hierarchy_df, missing_df):
    """Build a two-column (Metric, Value) summary of the phase-2 outputs.

    Args:
        monthly_df: Monthly tracker rows; only its length is used.
        hierarchy_df: Hierarchy report. The optional columns ``BF Level 1`` …
            ``BF Level 5``, ``Work Location Country/Territory Name`` and
            ``Global Career Band`` each add metrics when present.
        missing_df: Rows with missing information; only its length is used.

    Returns:
        DataFrame with one row per metric, in this order: total headcount,
        missing records, available headcount, per-value counts for each BF
        level column, headcount outside Poland, MD count and GCB 3 count.
    """
    summary_data = []

    # 1. Total Employees in Monthly.csv
    total_headcount = len(monthly_df)
    summary_data.append({"Metric": "Total Headcount", "Value": total_headcount})

    # 2. Missing records in GHA
    missing_count = len(missing_df)
    summary_data.append({
        "Metric": "Total Records with Missing Information",
        "Value": missing_count
    })

    # 3. Total in Hierarchy sheet
    total_available = len(hierarchy_df)
    summary_data.append({"Metric": "Total Available Headcount", "Value": total_available})

    # 4–8. Total per BF Level 1–5
    for level in range(1, 6):
        col_name = f"BF Level {level}"
        if col_name in hierarchy_df.columns:
            counts = hierarchy_df[col_name].value_counts()
            for k, v in counts.items():
                summary_data.append({
                    "Metric": f"Total Headcount for {col_name} = {k}",
                    "Value": v
                })

    # 9. Finance except Poland
    if "Work Location Country/Territory Name" in hierarchy_df.columns:
        poland_count = (hierarchy_df["Work Location Country/Territory Name"] == "Poland").sum()
        finance_except_poland = total_available - poland_count
        summary_data.append({
            "Metric": "Total Headcount for Finance except Poland",
            "Value": finance_except_poland
        })

    if "Global Career Band" in hierarchy_df.columns:
        # Compare on a normalized band so ' md', 'GCB 3', '3' and numeric 3
        # are all counted; the raw equality tests missed these variants.
        bands = hierarchy_df["Global Career Band"].astype(str).str.strip().str.upper()

        # 10. MD count
        md_count = (bands == "MD").sum()
        summary_data.append({"Metric": "Total Number of MDs", "Value": md_count})

        # 11. GCB 3 count (accepts '3', '3.0', 'GCB3' and 'GCB 3')
        gcb3_count = bands.str.match(r'^(?:GCB\s*)?3(?:\.0)?$').sum()
        summary_data.append({"Metric": "Total Number of GCB 3s", "Value": gcb3_count})

    # Final DF
    summary_df = pd.DataFrame(summary_data)
    return summary_df



# NOTE(review): monthly_df, hierarchy_df and missing_df are not assigned at top
# level anywhere in the visible notebook — this cell relies on them already
# existing in the kernel; confirm where they are meant to come from.
summary_df = generate_phase2_summary_df(monthly_df, hierarchy_df, missing_df)
# Write the hierarchy report, the missing-manager rows and the summary to one workbook.
with pd.ExcelWriter("phase2_output.xlsx", engine="openpyxl") as writer:
    hierarchy_df.to_excel(writer, sheet_name="Hierarchy Report", index=False)
    missing_df.to_excel(writer, sheet_name="Missing Managers", index=False)
    summary_df.to_excel(writer, sheet_name="Summary", index=False)


In [None]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import os

# -----------------------------
# Recursive slicing function
# -----------------------------
def get_hierarchy_slice(df, manager_id):
    """Return all employees (direct/indirect) under manager_id in order.

    The result is a depth-first, pre-order walk: each report is followed
    immediately by that report's own sub-tree, and siblings keep the row order
    they have in ``df``. The manager's own record is NOT included, even when
    the data contains a circular reporting chain that leads back to the manager.

    Args:
        df: DataFrame with at least the columns 'Employee ID' and
            'Entity Manager Employee ID'. IDs are compared as stripped strings.
        manager_id: ID of the manager whose organisation should be returned.

    Returns:
        A DataFrame with one row per distinct employee ID found under the
        manager (first occurrence wins), or an empty DataFrame when the manager
        has no reports.
    """
    root_id = str(manager_id).strip()

    # Normalise both ID columns once instead of on every recursive call.
    manager_keys = df['Entity Manager Employee ID'].astype(str).str.strip().tolist()
    employee_keys = df['Employee ID'].astype(str).str.strip().tolist()

    # manager key -> row positions of that manager's direct reports, in df order
    reports_of = {}
    for pos, mgr_key in enumerate(manager_keys):
        reports_of.setdefault(mgr_key, []).append(pos)

    result = []
    # Seeding with the root keeps the manager out of the slice when a cycle
    # (A -> B -> A) points back at it.
    visited = {root_id}

    def recurse(mid):
        for pos in reports_of.get(mid, ()):
            eid = employee_keys[pos]
            if eid not in visited:
                visited.add(eid)
                result.append(df.iloc[pos].to_dict())
                recurse(eid)

    recurse(root_id)
    return pd.DataFrame(result)


# -----------------------------
# GUI Application
# -----------------------------
class Phase3GUI:
    """Tkinter window that exports hierarchy slices from a Hierarchy Report workbook.

    The user picks the report file and an output folder, loads the manager
    list, and then writes either one slice (selected manager) or one slice per
    employee at a chosen GCB level. Every exported workbook contains a
    "Hierarchy Slice" sheet and a "Summary" sheet.
    """

    def __init__(self, master):
        """Create all widgets on ``master`` and initialise empty state."""
        self.master = master
        master.title("Phase 3: Hierarchy Slice Utility")
        master.geometry("700x400")

        # Filled by load_managers(); df stays None until a report was loaded.
        self.df = None
        self.name_to_ids = {}
        self.output_folder = ""

        # ---------------- Input File ----------------
        tk.Label(master, text="Hierarchy Report File:").grid(row=0, column=0, sticky="w", padx=10, pady=5)
        self.input_file_var = tk.StringVar()
        tk.Entry(master, textvariable=self.input_file_var, width=50).grid(row=0, column=1, padx=5)
        tk.Button(master, text="Browse", command=self.browse_input_file).grid(row=0, column=2, padx=5)

        # ---------------- Output Folder ----------------
        tk.Label(master, text="Output Folder:").grid(row=1, column=0, sticky="w", padx=10, pady=5)
        self.output_folder_var = tk.StringVar()
        tk.Entry(master, textvariable=self.output_folder_var, width=50).grid(row=1, column=1, padx=5)
        tk.Button(master, text="Browse", command=self.browse_output_folder).grid(row=1, column=2, padx=5)

        # ---------------- Manager Selection ----------------
        tk.Label(master, text="Select Manager Name:").grid(row=2, column=0, sticky="w", padx=10, pady=5)
        self.manager_cb = ttk.Combobox(master, width=47, state="readonly")
        self.manager_cb.grid(row=2, column=1, padx=5)
        tk.Button(master, text="Load Managers", command=self.load_managers).grid(row=2, column=2, padx=5)

        # ---------------- GCB Level Selection ----------------
        tk.Label(master, text="Select GCB Level (for Generate All):").grid(row=3, column=0, sticky="w", padx=10, pady=5)
        self.gcb_level_cb = ttk.Combobox(master, values=[3,4,5,6,7,8], width=10, state="readonly")
        self.gcb_level_cb.grid(row=3, column=1, sticky="w", padx=5)

        # ---------------- Buttons ----------------
        tk.Button(master, text="Generate Slice for Selected Manager", command=self.slice_selected_manager).grid(row=4, column=0, columnspan=2, pady=10)
        tk.Button(master, text="Generate Slices for All at GCB Level", command=self.slice_all_at_level).grid(row=5, column=0, columnspan=2, pady=10)

        # ---------------- Status ----------------
        self.status_var = tk.StringVar()
        tk.Label(master, textvariable=self.status_var, fg="blue").grid(row=6, column=0, columnspan=3, pady=10)


    # ---------------- Internal helpers ----------------
    @staticmethod
    def _safe_filename(name):
        """Replace characters that are invalid in file names with '_'."""
        invalid = '\\/:*?"<>|'
        return "".join("_" if ch in invalid else ch for ch in str(name))

    def _require_data(self):
        """Return True when a report is loaded; otherwise show an error and return False."""
        if getattr(self, "df", None) is None:
            messagebox.showerror("Error", "Please load managers first (click 'Load Managers')")
            return False
        return True

    @staticmethod
    def _write_slice(file_path, sliced_df, summary):
        """Write the slice and its summary rows into one workbook at ``file_path``."""
        with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
            sliced_df.to_excel(writer, index=False, sheet_name="Hierarchy Slice")
            pd.DataFrame(summary).to_excel(writer, index=False, sheet_name="Summary")

    # ---------------- Browse Functions ----------------
    def browse_input_file(self):
        """Ask for the report workbook and store the chosen path."""
        filename = filedialog.askopenfilename(filetypes=[("Excel files","*.xlsx")])
        if filename:
            self.input_file_var.set(filename)

    def browse_output_folder(self):
        """Ask for the output directory and store the chosen path."""
        folder = filedialog.askdirectory()
        if folder:
            self.output_folder_var.set(folder)
            self.output_folder = folder

    # ---------------- Load Manager Names ----------------
    def load_managers(self):
        """Read the report, normalise key columns and fill the manager combobox."""
        file = self.input_file_var.get()
        if not file or not os.path.exists(file):
            messagebox.showerror("Error", "Please select a valid Hierarchy Report file")
            return

        try:
            df = pd.read_excel(file, sheet_name="Hierarchy Report")
        except Exception as exc:
            # Missing sheet, unreadable or locked file: tell the user instead of
            # letting the exception disappear inside the Tk callback.
            messagebox.showerror("Error", f"Could not read the Hierarchy Report sheet:\n{exc}")
            return
        df.columns = df.columns.str.strip()

        # These three columns are needed right below; fail with a clear message.
        missing = [c for c in ('Manager Name', 'Employee Name', 'Employee ID') if c not in df.columns]
        if missing:
            messagebox.showerror("Error", "Hierarchy Report is missing column(s): " + ", ".join(missing))
            return

        # Normalize ID / band columns to stripped strings so lookups match
        for col in ('Employee ID', 'Entity Manager Employee ID', 'Global Career Band'):
            if col in df.columns:
                df[col] = df[col].astype(str).str.strip()

        # Publish the frame only after it passed validation.
        self.df = df

        # Only names from the "Manager Name" column are offered for selection
        managers = sorted(self.df['Manager Name'].dropna().unique().tolist())
        self.manager_cb['values'] = managers

        # Employee name -> list of distinct IDs (order preserved); used to
        # resolve the selected name back to an ID.
        self.name_to_ids = self.df.groupby('Employee Name')['Employee ID'].apply(lambda s: list(dict.fromkeys(s))).to_dict()
        self.status_var.set(f"Loaded {len(managers)} managers.")


    # ---------------- Slice Selected Manager ----------------
    def slice_selected_manager(self):
        """Export the slice for the manager picked in the combobox.

        When several employee IDs share the selected name, a small dialog asks
        which ID to use before the slice is generated.
        """
        name = self.manager_cb.get()
        out_folder = self.output_folder_var.get()
        if not name:
            messagebox.showerror("Error", "Please select a manager name")
            return
        if not out_folder:
            messagebox.showerror("Error", "Please select output folder")
            return

        ids = self.name_to_ids.get(name, [])
        if len(ids) > 1:
            id_select_win = tk.Toplevel(self.master)
            id_select_win.title(f"Select Manager ID for {name}")
            tk.Label(id_select_win, text=f"Select Manager ID for {name}:").pack(pady=5)
            selected_id_var = tk.StringVar()
            cb = ttk.Combobox(id_select_win, values=ids, textvariable=selected_id_var, state="readonly")
            cb.pack(pady=5)
            def confirm_id():
                manager_id = selected_id_var.get()
                if manager_id:
                    self.generate_slice(manager_id, name, out_folder)
                    id_select_win.destroy()
            tk.Button(id_select_win, text="Confirm", command=confirm_id).pack(pady=5)
            return
        elif len(ids) == 0:
            messagebox.showerror("Error", f"No manager ID found for {name}")
            return
        else:
            manager_id = ids[0]
            self.generate_slice(manager_id, name, out_folder)

    # ---------------- Slice All at GCB Level ----------------
    def slice_all_at_level(self):
        """Export one workbook per employee whose GCB equals the selected level.

        Employees without any reports are skipped. A single message box with
        the number of files written is shown at the end.
        """
        level = self.gcb_level_cb.get()
        out_folder = self.output_folder_var.get()
        if not level:
            messagebox.showerror("Error", "Please select a GCB level")
            return
        if not out_folder:
            messagebox.showerror("Error", "Please select output folder")
            return
        # Without this guard self.df is None and the lookup below raises TypeError.
        if not self._require_data():
            return
        level = int(level)
        # Non-numeric bands such as "MD" become NaN and never match a level.
        gcb_numeric = pd.to_numeric(self.df['Global Career Band'], errors='coerce')
        managers = self.df[gcb_numeric == level][['Employee ID','Employee Name']].drop_duplicates()

        saved = 0
        for _, row in managers.iterrows():
            mid = str(row['Employee ID'])
            mname = row['Employee Name']
            sliced_df = get_hierarchy_slice(self.df, mid)
            if sliced_df.empty:
                continue
            summary = self.build_summary(sliced_df, mname, mid, level)

            # A name containing e.g. "/" would otherwise point into a
            # non-existent sub-folder and abort the whole batch.
            file_name = self._safe_filename(f"{mname}_{mid}_slice.xlsx")
            self._write_slice(os.path.join(out_folder, file_name), sliced_df, summary)
            saved += 1

        messagebox.showinfo("Done", f"All slices for GCB Level {level} saved in:\n{out_folder}\nFiles created: {saved}")

    # ---------------- Generate Slice (single) ----------------
    def generate_slice(self, manager_id, manager_name, out_folder):
        """Write the slice workbook for one manager and report the outcome.

        Args:
            manager_id: Employee ID of the manager.
            manager_name: Display name; used in messages and in the file name.
            out_folder: Directory that receives ``<name>_<id>_slice.xlsx``.
        """
        if not self._require_data():
            return
        sliced_df = get_hierarchy_slice(self.df, manager_id)
        if sliced_df.empty:
            messagebox.showinfo("Info", f"No records found under {manager_name}")
            return

        # The level combobox also decides which extra summary metric is shown.
        gcb_level = self.gcb_level_cb.get()
        gcb_level = int(gcb_level) if gcb_level else None
        summary = self.build_summary(sliced_df, manager_name, manager_id, gcb_level)

        safe_name = str(manager_name).replace(" ", "_")
        file_name = self._safe_filename(f"{safe_name}_{manager_id}_slice.xlsx")
        file_path = os.path.join(out_folder, file_name)
        self._write_slice(file_path, sliced_df, summary)

        self.status_var.set(f"Slice for {manager_name} saved → {file_path}")
        messagebox.showinfo("Success", f"Slice for {manager_name} saved successfully!")

    # ---------------- Summary helper ----------------
    def build_summary(self, sliced_df, manager_name, manager_id, gcb_level):
        """Return the Summary sheet rows as a list of {"Metric", "Value"} dicts.

        Args:
            sliced_df: Slice with 'Employee ID' and 'Global Career Band' columns.
            manager_name: Name written to the "Manager Name" row.
            manager_id: ID written to the "Manager ID" row.
            gcb_level: 3 adds the GCB4 count, 4 adds the employee count again
                under a GCB4 label, any other value adds nothing.
        """
        gcb_numeric = pd.to_numeric(sliced_df['Global Career Band'], errors='coerce')

        total_emp = sliced_df['Employee ID'].nunique()
        total_gcb4 = sliced_df[gcb_numeric == 4]['Employee ID'].nunique()

        summary = [
            {"Metric": "Manager Name", "Value": manager_name},
            {"Metric": "Manager ID", "Value": manager_id},
            {"Metric": "Total Employees (direct+indirect)", "Value": total_emp},
        ]

        # Show only the count that is relevant for the manager's level
        if gcb_level == 3:
            summary.append({"Metric": "Total GCB4 reporting", "Value": total_gcb4})
        elif gcb_level == 4:
            # NOTE(review): the label says "Direct" but the value counts direct
            # and indirect employees — confirm which one is intended.
            summary.append({"Metric": "Direct Employees under GCB4", "Value": total_emp})

        return summary


# -----------------------------
# Main
# -----------------------------
# Launch the GUI when the cell/script is executed directly. The guard is also
# true inside a Jupyter kernel, so running this cell opens the window.
if __name__ == "__main__":
    root = tk.Tk()
    app = Phase3GUI(root)
    # Blocks until the window is closed.
    root.mainloop()


In [None]:
import os
import pandas as pd
import tkinter as tk
from tkinter import filedialog, messagebox, ttk


# -------------------------
# Helper functions
# -------------------------

def get_all_reports(df, manager_name):
    """Get all reports under a manager recursively.

    Rows are matched by name: the direct reports of a manager are the rows
    whose "Manager Name" equals that manager's name, and the walk continues
    with each report's own "Employee Name". The result lists a manager's direct
    reports first, followed by the sub-tree of each report in row order.

    Args:
        df: Hierarchy frame with the columns "Manager Name" and "Employee Name".
        manager_name: Name of the manager at the top of the slice.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with every direct and indirect
        report; empty, with the columns of ``df``, when there are none.
    """
    # Names already expanded; protects against circular reporting chains and
    # against expanding the same name twice.
    seen = {manager_name}
    frames = []

    def collect(name):
        reports = df[df["Manager Name"] == name]
        # Always keep the first frame so that an empty result still has columns.
        if not frames or not reports.empty:
            frames.append(reports)
        for report_name in reports["Employee Name"]:
            # Descend with the report's OWN name. Using the report's manager
            # name here would ask for the same manager again and never end.
            if pd.isna(report_name) or report_name in seen:
                continue
            seen.add(report_name)
            collect(report_name)

    collect(manager_name)
    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)


def generate_summary(sliced_df, manager_name, gcb_level):
    """Count the slice rows per Global Career Band.

    Returns a frame with the columns "Global Career Band", "Count", "Manager"
    and "Level"; the last two repeat ``manager_name`` and ``gcb_level`` on
    every row.
    """
    band_counts = sliced_df.groupby("Global Career Band").size()
    table = band_counts.reset_index(name="Count")
    return table.assign(Manager=manager_name, Level=gcb_level)


def generate_slice(input_file, output_dir, manager_name, gcb_level, suppress_popup=False):
    """Write ``<manager_name>_slice.xlsx`` with a manager's reports and a summary.

    Args:
        input_file: Workbook containing a "Hierarchy Report" sheet.
        output_dir: Directory that receives the output workbook.
        manager_name: Must occur in the "Entity Manager Employee name" column.
        gcb_level: Level label copied into the summary sheet.
        suppress_popup: When True no message boxes are shown (batch mode).

    Returns:
        None. An unknown manager name ends the call without writing a file.
    """
    df = pd.read_excel(input_file, sheet_name="Hierarchy Report")

    # Only names that actually manage someone are accepted
    managers = df["Entity Manager Employee name"].dropna().unique().tolist()
    if manager_name not in managers:
        if not suppress_popup:
            messagebox.showerror("Error", f"{manager_name} is not a valid manager.")
        return

    # Get reports
    sliced_df = get_all_reports(df, manager_name).copy()
    print(f"{manager_name} → {len(sliced_df)} rows")  # debug

    # Remove the manager's own record. The row to drop is the one where the
    # manager is the EMPLOYEE; filtering on the manager-name column instead
    # would throw away every direct report.
    sliced_df = sliced_df[sliced_df["Employee Name"] != manager_name]

    # Summary
    summary_df = generate_summary(sliced_df, manager_name, gcb_level)

    # Save both sheets into one workbook
    out_file = os.path.join(output_dir, f"{manager_name}_slice.xlsx")
    with pd.ExcelWriter(out_file, engine="openpyxl") as writer:
        sliced_df.to_excel(writer, sheet_name="Slice", index=False)
        summary_df.to_excel(writer, sheet_name="Summary", index=False)

    if not suppress_popup:
        messagebox.showinfo("Success", f"Report saved to {out_file}")


def generate_all(input_file, output_dir, gcb_level):
    """Generate one slice workbook per manager name found for a GCB level.

    Reads the "Hierarchy Report" sheet, keeps the rows whose
    "Global Career Band" equals ``gcb_level`` and calls ``generate_slice`` (with
    pop-ups suppressed) for every distinct, non-null
    "Entity Manager Employee name" among them. A warning is shown when no name
    qualifies, otherwise a single success message at the end.
    """
    report = pd.read_excel(input_file, sheet_name="Hierarchy Report")

    # NOTE(review): this picks the managers OF the employees at the level, not
    # the employees at the level themselves — confirm that this is intended.
    at_level = report["Global Career Band"] == gcb_level
    names = report.loc[at_level, "Entity Manager Employee name"].dropna().unique().tolist()

    if len(names) == 0:
        messagebox.showwarning("Warning", f"No managers found for {gcb_level}")
        return

    for name in names:
        generate_slice(input_file, output_dir, name, gcb_level, suppress_popup=True)

    messagebox.showinfo("Success", f"All {gcb_level} slices saved to {output_dir}")


# -------------------------
# GUI
# -------------------------

class ReportGUI:
    """Small Tk form that drives ``generate_slice`` and ``generate_all``.

    It collects an input workbook, an output folder, a manager name and a GCB
    level, validates that the needed fields are filled and then hands the
    values to the module-level generator functions.
    """

    def __init__(self, root):
        """Create the Tk variables and lay out all widgets on ``root``."""
        self.root = root
        self.root.title("Phase 3 Hierarchy Slicer")

        # Tk variables behind the four inputs
        self.input_file = tk.StringVar()
        self.output_dir = tk.StringVar()
        self.manager_name = tk.StringVar()
        self.gcb_level = tk.StringVar()

        # Rows 0-1: caption, path entry and Browse button
        path_rows = (
            ("Input Excel File:", self.input_file, self.browse_input),
            ("Output Folder:", self.output_dir, self.browse_output),
        )
        for row_no, (caption, variable, on_browse) in enumerate(path_rows):
            tk.Label(root, text=caption).grid(row=row_no, column=0, sticky="w")
            tk.Entry(root, textvariable=variable, width=50).grid(row=row_no, column=1)
            tk.Button(root, text="Browse", command=on_browse).grid(row=row_no, column=2)

        # Row 2: manager picker (values are filled by browse_input)
        tk.Label(root, text="Manager Name:").grid(row=2, column=0, sticky="w")
        self.manager_combo = ttk.Combobox(root, textvariable=self.manager_name, width=47)
        self.manager_combo.grid(row=2, column=1, columnspan=2)

        # Row 3: GCB level picker
        tk.Label(root, text="GCB Level:").grid(row=3, column=0, sticky="w")
        level_choices = ["MD", "GCB3", "GCB4", "GCB5", "GCB6", "GCB7", "GCB8"]
        self.gcb_combo = ttk.Combobox(
            root, textvariable=self.gcb_level, values=level_choices, width=20
        )
        self.gcb_combo.grid(row=3, column=1, sticky="w")

        # Row 4: action buttons
        slice_button = tk.Button(
            root, text="Generate Slice (Selected Manager)", command=self.run_slice
        )
        slice_button.grid(row=4, column=0, pady=10)

        all_button = tk.Button(
            root, text="Generate All (Selected GCB Level)", command=self.run_all
        )
        all_button.grid(row=4, column=1, pady=10)

    def _missing_input_message(self, require_manager):
        """Return the error text for the first unfilled field, or None if complete."""
        if not (self.input_file.get() and self.output_dir.get()):
            return "Please select input file and output folder"
        if require_manager and not self.manager_name.get():
            return "Please select a manager name"
        if not self.gcb_level.get():
            return "Please select a GCB level"
        return None

    def browse_input(self):
        """Pick the workbook and load its manager names into the combobox."""
        chosen = filedialog.askopenfilename(
            filetypes=[("Excel Files", "*.xlsx *.xls")])
        if not chosen:
            return
        self.input_file.set(chosen)
        # Load managers into combobox; a read problem is shown, not raised
        try:
            report = pd.read_excel(chosen, sheet_name="Hierarchy Report")
            names = report["Entity Manager Employee name"].dropna().unique().tolist()
            self.manager_combo["values"] = sorted(names)
        except Exception as e:
            messagebox.showerror("Error", f"Could not load managers: {e}")

    def browse_output(self):
        """Pick the output directory."""
        chosen = filedialog.askdirectory()
        if chosen:
            self.output_dir.set(chosen)

    def run_slice(self):
        """Validate the form and generate the slice for the selected manager."""
        problem = self._missing_input_message(require_manager=True)
        if problem is not None:
            messagebox.showerror("Error", problem)
            return

        generate_slice(
            self.input_file.get(),
            self.output_dir.get(),
            self.manager_name.get(),
            self.gcb_level.get(),
        )

    def run_all(self):
        """Validate the form and generate slices for the selected GCB level."""
        problem = self._missing_input_message(require_manager=False)
        if problem is not None:
            messagebox.showerror("Error", problem)
            return

        generate_all(
            self.input_file.get(),
            self.output_dir.get(),
            self.gcb_level.get(),
        )


# -------------------------
# Run
# -------------------------

# Launch the GUI when the cell/script is executed directly. The guard is also
# true inside a Jupyter kernel, so running this cell opens the window.
if __name__ == "__main__":
    root = tk.Tk()
    app = ReportGUI(root)
    # Blocks until the window is closed.
    root.mainloop()


In [None]:
import pandas as pd
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import os

# -----------------------------
# Recursive slicing function
# -----------------------------
def get_hierarchy_slice(df, manager_id):
    """Return the manager's row followed by all direct/indirect reports, in order.

    The walk is depth-first and pre-order: every report is followed by its own
    sub-tree, siblings keep their row order in ``df``. Each employee ID is
    emitted once, so a self-managed top manager or a circular reporting chain
    ends the walk instead of recursing without end.

    IDs are matched on a normalised key, so ``1``, ``1.0`` and ``"1"`` refer to
    the same employee. This matters because an ID column that contains blanks
    is read as float, and an ID picked from a combobox arrives as text.

    Args:
        df: DataFrame with the columns 'Employee ID' and
            'Entity Manager Employee ID'.
        manager_id: ID of the manager at the top of the slice.

    Returns:
        A DataFrame whose first row is the manager (first matching row) and
        whose remaining rows are the reports; empty when the manager ID does
        not occur in 'Employee ID'.
    """
    def id_key(value):
        # Whole-number floats (1.0) collapse to the same key as ints and text.
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    root_id = id_key(manager_id)
    employee_keys = [id_key(v) for v in df['Employee ID'].tolist()]
    manager_keys = [id_key(v) for v in df['Entity Manager Employee ID'].tolist()]

    if root_id not in employee_keys:
        return pd.DataFrame([])

    # manager key -> row positions of the direct reports, in df order
    reports_of = {}
    for pos, mgr_key in enumerate(manager_keys):
        reports_of.setdefault(mgr_key, []).append(pos)

    # Add manager row itself
    result = [df.iloc[employee_keys.index(root_id)]]
    visited = {root_id}

    def recurse(mid):
        for pos in reports_of.get(mid, ()):
            eid = employee_keys[pos]
            if eid in visited:
                continue
            visited.add(eid)
            result.append(df.iloc[pos])
            recurse(eid)

    recurse(root_id)
    return pd.DataFrame(result)

# -----------------------------
# GUI Application
# -----------------------------
class Phase3GUI:
    """Tkinter window that exports hierarchy slices from a Hierarchy Report workbook.

    After the report is loaded the user can export the slice of one selected
    manager or of every employee at a chosen GCB level. Each workbook holds a
    "Hierarchy Slice" sheet and a stacked "Summary" sheet.
    """

    def __init__(self, master):
        """Create all widgets on ``master`` and initialise empty state."""
        self.master = master
        master.title("Phase 3: Hierarchy Slice Utility")
        master.geometry("700x400")

        # Filled by load_managers(). Defined here so that the slice buttons can
        # detect "nothing loaded yet" instead of failing with AttributeError.
        self.df = None
        self.name_to_ids = {}

        # ---------------- Input File ----------------
        tk.Label(master, text="Hierarchy Report File:").grid(row=0, column=0, sticky="w", padx=10, pady=5)
        self.input_file_var = tk.StringVar()
        tk.Entry(master, textvariable=self.input_file_var, width=50).grid(row=0, column=1, padx=5)
        tk.Button(master, text="Browse", command=self.browse_input_file).grid(row=0, column=2, padx=5)

        # ---------------- Output Folder ----------------
        tk.Label(master, text="Output Folder:").grid(row=1, column=0, sticky="w", padx=10, pady=5)
        self.output_folder_var = tk.StringVar()
        tk.Entry(master, textvariable=self.output_folder_var, width=50).grid(row=1, column=1, padx=5)
        tk.Button(master, text="Browse", command=self.browse_output_folder).grid(row=1, column=2, padx=5)

        # ---------------- Manager Selection ----------------
        tk.Label(master, text="Select Manager Name:").grid(row=2, column=0, sticky="w", padx=10, pady=5)
        self.manager_cb = ttk.Combobox(master, width=47, state="readonly")
        self.manager_cb.grid(row=2, column=1, padx=5)
        tk.Button(master, text="Load Managers", command=self.load_managers).grid(row=2, column=2, padx=5)

        # ---------------- GCB Level Selection ----------------
        tk.Label(master, text="Select GCB Level (for Generate All):").grid(row=3, column=0, sticky="w", padx=10, pady=5)
        self.gcb_level_cb = ttk.Combobox(master, values=[3,4,5,6,7,8], width=10, state="readonly")
        self.gcb_level_cb.grid(row=3, column=1, sticky="w", padx=5)

        # ---------------- Buttons ----------------
        tk.Button(master, text="Generate Slice for Selected Manager", command=self.slice_selected_manager).grid(row=4, column=0, columnspan=2, pady=10)
        tk.Button(master, text="Generate Slices for All at GCB Level", command=self.slice_all_at_level).grid(row=5, column=0, columnspan=2, pady=10)

        # ---------------- Status ----------------
        self.status_var = tk.StringVar()
        tk.Label(master, textvariable=self.status_var, fg="blue").grid(row=6, column=0, columnspan=3, pady=10)

    # ---------------- Internal helper ----------------
    def _require_data(self):
        """Return True when a report is loaded; otherwise show an error and return False."""
        if getattr(self, "df", None) is None:
            messagebox.showerror("Error", "Please load managers first (click 'Load Managers')")
            return False
        return True

    # ---------------- Browse Functions ----------------
    def browse_input_file(self):
        """Ask for the report workbook and store the chosen path."""
        filename = filedialog.askopenfilename(filetypes=[("Excel files","*.xlsx")])
        if filename:
            self.input_file_var.set(filename)

    def browse_output_folder(self):
        """Ask for the output directory and store the chosen path."""
        folder = filedialog.askdirectory()
        if folder:
            self.output_folder_var.set(folder)

    # ---------------- Load Manager Names ----------------
    def load_managers(self):
        """Read the "Hierarchy Report" sheet and fill the manager combobox."""
        file = self.input_file_var.get()
        if not file or not os.path.exists(file):
            messagebox.showerror("Error", "Please select a valid Hierarchy Report file")
            return
        self.df = pd.read_excel(file, sheet_name="Hierarchy Report")
        # Create a map: Name -> list of IDs
        self.name_to_ids = self.df.groupby('Employee Name')['Employee ID'].apply(list).to_dict()
        self.manager_cb['values'] = list(self.name_to_ids.keys())
        self.status_var.set(f"Loaded {len(self.name_to_ids)} manager names.")

    # ---------------- Slice Selected Manager ----------------
    def slice_selected_manager(self):
        """Export the slice for the manager picked in the combobox.

        When several employee IDs share the selected name, a small dialog asks
        which ID to use before the slice is generated.
        """
        name = self.manager_cb.get()
        out_folder = self.output_folder_var.get()
        if not name:
            messagebox.showerror("Error", "Please select a manager name")
            return
        if not out_folder:
            messagebox.showerror("Error", "Please select output folder")
            return

        ids = self.name_to_ids.get(name, [])
        if len(ids) > 1:
            # If multiple IDs for same name, ask user to select one
            id_select_win = tk.Toplevel(self.master)
            id_select_win.title(f"Select Manager ID for {name}")
            tk.Label(id_select_win, text=f"Select Manager ID for {name}:").pack(pady=5)
            selected_id_var = tk.StringVar()
            # The combobox hands back text; keep a map to the original value so
            # that a numeric ID is looked up as a number, not as a string.
            id_by_label = {str(i): i for i in ids}
            cb = ttk.Combobox(id_select_win, values=list(id_by_label), textvariable=selected_id_var, state="readonly")
            cb.pack(pady=5)
            def confirm_id():
                chosen = selected_id_var.get()
                if chosen:
                    self.generate_slice(id_by_label.get(chosen, chosen), name, out_folder)
                    id_select_win.destroy()
            tk.Button(id_select_win, text="Confirm", command=confirm_id).pack(pady=5)
            return
        elif len(ids) == 0:
            messagebox.showerror("Error", f"No manager ID found for {name}")
            return
        else:
            manager_id = ids[0]
            self.generate_slice(manager_id, name, out_folder)

    # ---------------- Slice All at GCB Level ----------------
    def slice_all_at_level(self):
        """Export one workbook per employee whose GCB equals the selected level."""
        level = self.gcb_level_cb.get()
        out_folder = self.output_folder_var.get()
        if not level:
            messagebox.showerror("Error", "Please select a GCB level")
            return
        if not out_folder:
            messagebox.showerror("Error", "Please select output folder")
            return
        # Without a loaded report there is nothing to filter.
        if not self._require_data():
            return
        level = int(level)
        # Compare numerically so that bands stored as text ("3") match as well;
        # non-numeric bands such as "MD" become NaN and never match.
        gcb_numeric = pd.to_numeric(self.df['Global Career Band'], errors='coerce')
        managers = self.df[gcb_numeric == level][['Employee ID','Employee Name']].drop_duplicates()
        for _, row in managers.iterrows():
            self.generate_slice(row['Employee ID'], row['Employee Name'], out_folder)

    # ---------------- Generate Slice ----------------
    def generate_slice(self, manager_id, manager_name, out_folder):
        """Write ``<manager_name>_slice.xlsx`` with the slice and a stacked summary.

        The "Summary" sheet holds, top to bottom: manager name / ID / total
        rows, the GCB3 and GCB4 totals, a per-name count of GCB3 rows and a
        per-name count of GCB4 rows.

        NOTE(review): the file name contains only the manager name, so two
        managers with the same name write to the same file — confirm whether
        the ID should be part of the name.
        """
        if not self._require_data():
            return
        sliced_df = get_hierarchy_slice(self.df, manager_id)
        if sliced_df.empty:
            messagebox.showinfo("Info", f"No records found under {manager_name}")
            return
        # Summary sheet figures (numeric comparison also matches "3" / "4" text)
        gcb_numeric = pd.to_numeric(sliced_df['Global Career Band'], errors='coerce')
        gcb3_rows = sliced_df[gcb_numeric == 3]
        gcb4_rows = sliced_df[gcb_numeric == 4]
        total_emp = len(sliced_df)
        total_gcb3 = len(gcb3_rows)
        total_gcb4 = len(gcb4_rows)
        per_gcb3 = gcb3_rows.groupby('Employee Name')['Employee ID'].count().reset_index(name='Count')
        per_gcb4 = gcb4_rows.groupby('Employee Name')['Employee ID'].count().reset_index(name='Count')

        # Save Excel
        file_path = os.path.join(out_folder, f"{manager_name}_slice.xlsx")
        with pd.ExcelWriter(file_path, engine="openpyxl") as writer:
            sliced_df.to_excel(writer, index=False, sheet_name="Hierarchy Slice")
            # Summary stacked: each table starts below the previous one
            start_row = 0
            summary = pd.DataFrame([["Manager Name", manager_name], ["Manager ID", manager_id], ["Total Employees", total_emp]])
            summary.to_excel(writer, index=False, header=False, sheet_name="Summary", startrow=start_row)
            start_row += len(summary) + 1  # 3 rows + one blank line
            pd.DataFrame([["Total GCB3", total_gcb3], ["Total GCB4", total_gcb4]]).to_excel(writer, index=False, header=False, sheet_name="Summary", startrow=start_row)
            start_row += 3  # 2 rows + one blank line
            per_gcb3.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(per_gcb3) + 2  # header + rows + one blank line
            per_gcb4.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
        self.status_var.set(f"Slice for {manager_name} saved → {file_path}")
        messagebox.showinfo("Success", f"Slice for {manager_name} saved successfully!")

# -----------------------------
# Main
# -----------------------------
# Launch the GUI when the cell/script is executed directly. The guard is also
# true inside a Jupyter kernel, so running this cell opens the window.
if __name__ == "__main__":
    root = tk.Tk()
    app = Phase3GUI(root)
    # Blocks until the window is closed.
    root.mainloop()


In [None]:
# -------------------------
# Phase 2: Build Hierarchy (fixed missing-manager name resolution)
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build flattened hierarchy with unlimited levels + summary + exceptions."""

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    # Normalize ID/name columns as strings for robust matching
    enriched = enriched.copy()
    if EMP_ID_COL in enriched.columns:
        enriched[EMP_ID_COL] = enriched[EMP_ID_COL].astype(str).str.strip()
    if MGR_ID_COL in enriched.columns:
        enriched[MGR_ID_COL] = enriched[MGR_ID_COL].astype(str).str.strip()
    if EMP_NAME_COL in enriched.columns:
        enriched[EMP_NAME_COL] = enriched[EMP_NAME_COL].astype(str).str.strip()
    if MGR_NAME_COL in enriched.columns:
        enriched[MGR_NAME_COL] = enriched[MGR_NAME_COL].astype(str).str.strip()
    if GCB_COL in enriched.columns:
        enriched[GCB_COL] = enriched[GCB_COL].astype(str).str.strip().str.upper()

    # Build a DataFrame lookup and a deduplicated map for safe access
    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    # helper to safely get a single row (Series) for an employee id (handles duplicates)
    def get_emp_row(emp_id):
        """Return a Series for emp_id if present, else None. Handles duplicate index by returning first row."""
        try:
            row = df_lookup.loc[emp_id]
        except KeyError:
            return None
        # if multiple rows returned (DataFrame), take first
        if isinstance(row, pd.DataFrame):
            return row.iloc[0]
        return row

    def recurse(manager_id, path):
        """Recursively walk down the hierarchy from a manager."""
        if pd.isna(manager_id) or manager_id == "":
            return
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row.get(MGR_NAME_COL, ""),
                "Manager GCB": get_emp_row(manager_id)[GCB_COL]
                if get_emp_row(manager_id) is not None else None,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(row[EMP_ID_COL], record)

    # Find all MDs
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_path = {"MD ID": md[EMP_ID_COL], "MD Name": md[EMP_NAME_COL]}

        # add MD itself
        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md[EMP_ID_COL],
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        # recurse into MD’s reports
        recurse(md[EMP_ID_COL], md_path)

    # ------------------------------
    # Exceptions: improved missing-manager tracing + proper missing manager name lookup
    # ------------------------------
    # build set of all known employee ids (strings)
    all_emp_ids = set(enriched[EMP_ID_COL].astype(str).tolist())

    # helper to resolve missing manager name for a missing manager id:
    def resolve_missing_manager_name(missing_mgr_id):
        """
        Try multiple ways to find a name for missing_mgr_id:
         1) If missing_mgr_id appears as an Employee ID (unexpected here) use employee name.
         2) Else, look for rows where MGR_ID_COL == missing_mgr_id and extract the MGR_NAME_COL values
            (these are subordinate rows that often carry the manager name).
         3) Else, return empty string.
        """
        missing_mgr_id = str(missing_mgr_id).strip()
        # 1) check if present as an employee (rare, since missing means not in all_emp_ids)
        emp_row = get_emp_row(missing_mgr_id)
        if emp_row is not None:
            return str(emp_row.get(EMP_NAME_COL, "")).strip()

        # 2) look for subordinates that reference this manager id and read the manager name field
        if MGR_NAME_COL in enriched.columns:
            candidate_names = enriched.loc[enriched[MGR_ID_COL] == missing_mgr_id, MGR_NAME_COL] \
                .dropna().astype(str).str.strip()
            if not candidate_names.empty:
                # return most common (mode) name if available, else first
                try:
                    mode_name = candidate_names.mode()
                    if not mode_name.empty:
                        return mode_name.iloc[0]
                except Exception:
                    pass
                return candidate_names.iloc[0]
        # 3) nothing found
        return ""

    # For each employee row, walk up until MD or until first missing manager ID encountered
    for _, row in enriched.iterrows():
        # skip MDs
        if str(row.get(GCB_COL, "")).strip().upper() == "MD":
            continue

        current_mgr_id = str(row.get(MGR_ID_COL, "")).strip()
        missing_mgr_id = None
        missing_mgr_name = ""
        chain_parts = []  # for optional trace like "A -> B -> C (MISSING)"

        # If no manager id present at all, treat as missing immediately
        if current_mgr_id == "" or pd.isna(current_mgr_id):
            missing_mgr_id = current_mgr_id
            missing_mgr_name = row.get(MGR_NAME_COL, "") or ""
        else:
            # climb upwards until MD or missing manager
            visited = set()
            while current_mgr_id:
                # protect from infinite loops
                if current_mgr_id in visited:
                    # circularity - mark as missing (use current_mgr_id)
                    missing_mgr_id = current_mgr_id
                    missing_mgr_name = resolve_missing_manager_name(current_mgr_id)
                    chain_parts.append(f"{missing_mgr_name or missing_mgr_id} (CIRCULAR)")
                    break
                visited.add(current_mgr_id)

                chain_parts.append(current_mgr_id)

                # if manager id not found in enriched -> this is the first missing manager
                if current_mgr_id not in all_emp_ids:
                    missing_mgr_id = current_mgr_id
                    missing_mgr_name = resolve_missing_manager_name(current_mgr_id)
                    chain_parts[-1] = f"{missing_mgr_name or missing_mgr_id} (MISSING)"
                    break

                # otherwise get manager row and move up
                mgr_row = get_emp_row(current_mgr_id)
                if mgr_row is None:
                    # should not happen (we checked membership), but safe-break
                    missing_mgr_id = current_mgr_id
                    missing_mgr_name = resolve_missing_manager_name(current_mgr_id)
                    break

                mgr_gcb = str(mgr_row.get(GCB_COL, "")).strip().upper()
                # if this manager is MD, chain completes successfully
                if mgr_gcb == "MD":
                    missing_mgr_id = None
                    missing_mgr_name = ""
                    break

                # else continue up
                next_mgr = mgr_row.get(MGR_ID_COL, "")
                current_mgr_id = str(next_mgr).strip() if pd.notna(next_mgr) else ""
            # end while

        if missing_mgr_id:
            rec = row.to_dict()
            rec["Missing Manager ID"] = missing_mgr_id
            rec["Missing Manager Name"] = missing_mgr_name
            rec["Missing Chain"] = " -> ".join(chain_parts)
            exceptions.append(rec)

    # Convert to DataFrames
    hierarchy_df = pd.DataFrame(hierarchy)
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    # overall counts
    summary = pd.DataFrame([{
        "Metric": "Total Employees", "Value": len(enriched)
    }, {
        "Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()
    }, {
        "Metric": "Missing Managers Count", "Value": len(exceptions_df)
    }])
    summary_tables.append(("Overall Summary", summary))

    # per-MD
    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # ---- Prepare a safe manager-name map from enriched (string keys) ----
    manager_name_map = {}
    if EMP_ID_COL in enriched.columns and EMP_NAME_COL in enriched.columns:
        mgr_df = enriched[[EMP_ID_COL, EMP_NAME_COL]].drop_duplicates().copy()
        mgr_df[EMP_ID_COL] = mgr_df[EMP_ID_COL].astype(str).str.strip()
        mgr_df[EMP_NAME_COL] = mgr_df[EMP_NAME_COL].astype(str).str.strip()
        manager_name_map = mgr_df.set_index(EMP_ID_COL)[EMP_NAME_COL].to_dict()

    # Ensure Manager GCB column exists
    if "Manager GCB" not in hierarchy_df.columns:
        hierarchy_df["Manager GCB"] = pd.NA

    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    # -------------------------------
    # Per-GCB3 direct report counts
    # -------------------------------
    per_gcb3_counts = (
        hierarchy_df.loc[mgr_gcb_numeric == 3]
        .groupby("Manager ID")[EMP_ID_COL]
        .nunique()
        .reset_index(name="Direct Reports")
    )
    per_gcb3_counts["Manager ID"] = per_gcb3_counts["Manager ID"].astype(str).str.strip()
    per_gcb3_counts["GCB3 Name"] = per_gcb3_counts["Manager ID"].map(manager_name_map).fillna("")
    per_gcb3 = per_gcb3_counts[["GCB3 Name", "Manager ID", "Direct Reports"]].copy()
    per_gcb3.columns = ["GCB3 Name", "GCB3 ID", "Direct Reports"]
    summary_tables.append(("Direct Reports per GCB3", per_gcb3))

    # -------------------------------
    # Per-GCB4 direct report counts
    # -------------------------------
    per_gcb4_counts = (
        hierarchy_df.loc[mgr_gcb_numeric == 4]
        .groupby("Manager ID")[EMP_ID_COL]
        .nunique()
        .reset_index(name="Direct Reports")
    )
    per_gcb4_counts["Manager ID"] = per_gcb4_counts["Manager ID"].astype(str).str.strip()
    per_gcb4_counts["GCB4 Name"] = per_gcb4_counts["Manager ID"].map(manager_name_map).fillna("")
    per_gcb4 = per_gcb4_counts[["GCB4 Name", "Manager ID", "Direct Reports"]].copy()
    per_gcb4.columns = ["GCB4 Name", "GCB4 ID", "Direct Reports"]
    summary_tables.append(("Direct Reports per GCB4", per_gcb4))

    # -------------------------------
    # WRITE OUTPUT (Hierarchy + Missing Managers + ONE Summary sheet stacked)
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Write all summaries stacked in one sheet with title rows
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [None]:
# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy, trace missing managers and write the report.

    The "Enriched" sheet of ``input_file`` is read.  Every row whose ``GCB_COL``
    is "MD" becomes the root of a depth-first walk over the rows that name it
    (directly or indirectly) in ``MGR_ID_COL``.  Every non-MD row is also traced
    upwards; the trace ends successfully at the first MD and is reported as an
    exception when it reaches a manager ID that is absent from the sheet or
    when it loops back on itself.

    Sheets written to ``output_file``:
      * "Hierarchy Report" - one row per employee per MD tree they appear in.
      * "Missing Managers" - only when at least one exception was found.
      * "Summary"          - overall counts, headcount per MD and direct-report
                             counts for managers whose GCB is 3 or 4, stacked.

    Args:
        input_file: Workbook containing the "Enriched" sheet.
        output_file: Path of the workbook to create.

    Returns:
        tuple: ``(hierarchy_df, exceptions_df)`` as written to the workbook.
    """

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def id_key(value):
        """Render an ID as text so that 123, 123.0 and ' 123 ' all compare equal."""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def gcb_of(emp_id):
        """Return the career band recorded for emp_id, or None for an unknown ID."""
        if emp_id not in df_lookup.index:
            return None
        band = df_lookup.loc[emp_id, GCB_COL]
        # Duplicate employee IDs make .loc return a Series; use the first record.
        return band.iloc[0] if isinstance(band, pd.Series) else band

    def recurse(manager_id, path, ancestors):
        """Append everyone below manager_id to `hierarchy`, depth first.

        `ancestors` holds the IDs on the current descent so that a circular
        reporting line (including a self-managed row) cannot recurse forever.
        """
        manager_gcb = gcb_of(manager_id)
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            emp_id = row[EMP_ID_COL]
            if emp_id in ancestors:
                continue  # already on this branch -> circular reference
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            recurse(emp_id, record, ancestors | {emp_id})

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_id = md[EMP_ID_COL]
        md_path = {"MD ID": md_id, "MD Name": md[EMP_NAME_COL]}

        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md_id,
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        recurse(md_id, md_path, frozenset([md_id]))

    # -------------------------------
    # Exceptions: trace up the chain until an MD or the first missing manager
    # -------------------------------
    all_emp_ids = set(enriched[EMP_ID_COL])

    for _, row in enriched.iterrows():
        if row[GCB_COL] == "MD":
            continue  # MDs are top, never exceptions

        # `referrer` is the row whose manager is being checked, so its
        # MGR_NAME_COL is the name of `current_mgr_id`.
        referrer = row
        current_mgr_id = row[MGR_ID_COL]
        missing_mgr_id = None
        missing_mgr_name = None
        visited = set()

        while pd.notna(current_mgr_id):
            if current_mgr_id in visited or current_mgr_id not in all_emp_ids:
                # Either the chain loops back on itself or the manager is not
                # in the sheet; in both cases the employee cannot reach an MD.
                missing_mgr_id = current_mgr_id
                missing_mgr_name = referrer.get(MGR_NAME_COL, None)
                break
            visited.add(current_mgr_id)

            mgr_rows = enriched.loc[enriched[EMP_ID_COL] == current_mgr_id]
            if mgr_rows.empty:
                break
            referrer = mgr_rows.iloc[0]
            if referrer[GCB_COL] == "MD":
                break  # reached the top of a tree: chain is complete
            current_mgr_id = referrer[MGR_ID_COL]

        if missing_mgr_id is not None:
            rec = row.to_dict()
            rec["Missing Manager ID"] = missing_mgr_id
            rec["Missing Manager Name"] = missing_mgr_name
            exceptions.append(rec)

    if hierarchy:
        hierarchy_df = pd.DataFrame(hierarchy)
    else:
        # No MD rows: keep the expected columns so the summary below still works.
        derived = ["MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB"]
        hierarchy_df = pd.DataFrame(
            columns=list(dict.fromkeys(list(enriched.columns) + derived))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    summary = pd.DataFrame([
        {"Metric": "Total Employees", "Value": len(enriched)},
        {"Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()},
        {"Metric": "Missing Managers Count", "Value": len(exceptions_df)},
    ])
    summary_tables.append(("Overall Summary", summary))

    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # ID -> name map keyed by normalised text (later rows win on duplicate IDs).
    manager_name_map = {}
    if EMP_ID_COL in enriched.columns and EMP_NAME_COL in enriched.columns:
        manager_name_map = {
            id_key(emp_id): str(name).strip()
            for emp_id, name in zip(enriched[EMP_ID_COL], enriched[EMP_NAME_COL])
        }

    # Bands may be stored as text; compare them numerically.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Count distinct direct reports for each manager whose GCB equals `band`."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        # "Manager ID" becomes float when MD rows contribute None; id_key
        # removes the resulting ".0" so the name lookup can match.
        counts["Manager ID"] = counts["Manager ID"].map(id_key)
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    for band in (3, 4):
        summary_tables.append((f"Direct Reports per GCB{band}", direct_reports_table(band)))

    # -------------------------------
    # WRITE OUTPUT (Hierarchy + Missing Managers + ONE Summary sheet stacked)
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Each table gets a one-cell title row, then its data, then a blank row.
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [None]:
# -------------------------
# Phase 2: Build Hierarchy (updated missing-manager tracing)
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy with an explained missing-manager trace.

    The "Enriched" sheet of ``input_file`` is read and its key columns are
    normalised to text (missing cells become "", whole-number floats lose their
    ".0") so that employee IDs and manager IDs compare reliably.  Every row
    whose ``GCB_COL`` is "MD" becomes the root of a depth-first walk over its
    reports.  Every non-MD row is traced upwards; the trace ends at the first
    MD and is reported when it reaches a manager ID absent from the sheet or
    loops back on itself.

    Sheets written to ``output_file``:
      * "Hierarchy Report" - one row per employee per MD tree they appear in.
      * "Missing Managers" - only when at least one exception was found; adds
        "Missing Manager ID", "Missing Manager Name" and "Missing Chain".
      * "Summary"          - overall counts, headcount per MD and direct-report
                             counts for managers whose GCB is 3 or 4, stacked.

    Args:
        input_file: Workbook containing the "Enriched" sheet.
        output_file: Path of the workbook to create.

    Returns:
        tuple: ``(hierarchy_df, exceptions_df)`` as written to the workbook.
    """

    def clean_id(value):
        """Normalise an ID cell: "" when missing, no trailing '.0' on whole floats."""
        if pd.isna(value):
            return ""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def clean_text(value):
        """Normalise a text cell: "" when missing, otherwise the stripped string."""
        return "" if pd.isna(value) else str(value).strip()

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    # NORMALIZE key columns to strings for reliable matching.  A plain
    # astype(str) would turn NaN into "nan" and 123.0 into "123.0", which never
    # matches the employee ID "123".
    enriched = enriched.copy()
    enriched[EMP_ID_COL] = enriched[EMP_ID_COL].map(clean_id)
    if MGR_ID_COL not in enriched.columns:
        enriched[MGR_ID_COL] = ""
    else:
        enriched[MGR_ID_COL] = enriched[MGR_ID_COL].map(clean_id)
    enriched[EMP_NAME_COL] = enriched[EMP_NAME_COL].map(clean_text)
    if MGR_NAME_COL in enriched.columns:
        enriched[MGR_NAME_COL] = enriched[MGR_NAME_COL].map(clean_text)
    if GCB_COL in enriched.columns:
        enriched[GCB_COL] = enriched[GCB_COL].map(lambda v: clean_text(v).upper())

    # One record per employee ID so that .loc always yields a single row.
    df_lookup = (
        enriched.drop_duplicates(subset=EMP_ID_COL, keep="first").set_index(EMP_ID_COL)
    )

    hierarchy = []
    exceptions = []

    def recurse(manager_id, path, ancestors=frozenset()):
        """Append everyone below manager_id to `hierarchy`, depth first.

        `ancestors` holds the IDs on the current descent so that a circular
        reporting line (including a self-managed row) cannot recurse forever.
        """
        if manager_id is None:
            return
        manager_id = str(manager_id).strip()
        if not manager_id:
            return  # a blank ID would otherwise adopt every manager-less row
        manager_gcb = (
            df_lookup.loc[manager_id, GCB_COL] if manager_id in df_lookup.index else None
        )
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            emp_id = row[EMP_ID_COL]
            if emp_id in ancestors:
                continue  # already on this branch -> circular reference
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row.get(MGR_NAME_COL, ""),
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            recurse(emp_id, record, ancestors | {emp_id})

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_id = md[EMP_ID_COL]
        md_path = {"MD ID": md_id, "MD Name": md[EMP_NAME_COL]}

        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md_id,
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": "",
            "Manager Name": "",
            "Manager GCB": "",
        })
        hierarchy.append(row_dict)

        recurse(md_id, md_path, frozenset([md_id]))

    # ------------------------------
    # Exceptions: improved missing-manager tracing
    # ------------------------------
    def find_first_missing_manager(emp_row):
        """Walk upwards from emp_row[MGR_ID_COL].

        Returns:
            (missing_id, missing_name, chain_str) when a manager absent from the
            sheet, or a circular reference, is met;
            (None, None, chain_str) when the chain reaches an MD;
            (None, None, "No manager chain") when there is no manager to follow.
        """
        visited = set()
        chain = []  # (id, name) for every manager visited so far
        current_mgr = str(emp_row.get(MGR_ID_COL, "")).strip()

        while current_mgr:
            if current_mgr in visited:
                chain.append((current_mgr, "(circular)"))
                return current_mgr, "", " -> ".join(f"{i}({n})" for i, n in chain)
            visited.add(current_mgr)

            if current_mgr in df_lookup.index:
                mgr_row = df_lookup.loc[current_mgr]
                mgr_name = str(mgr_row.get(EMP_NAME_COL, "")).strip()
                chain.append((current_mgr, mgr_name))
                if str(mgr_row.get(GCB_COL, "")).strip().upper() == "MD":
                    return None, None, " -> ".join(n or i for i, n in chain)
                current_mgr = str(mgr_row.get(MGR_ID_COL, "")).strip()
            else:
                # First manager that is not in the sheet.  Recover a name from
                # the rows that reference this ID, taking the most common one.
                if MGR_NAME_COL in enriched.columns:
                    names = enriched.loc[enriched[MGR_ID_COL] == current_mgr, MGR_NAME_COL]
                    names = names[names != ""]
                else:
                    names = pd.Series([], dtype=object)
                missing_name = names.mode().iloc[0] if not names.empty else ""
                chain.append((current_mgr, missing_name))
                chain_str = " -> ".join(n or i for i, n in chain) + " (MISSING)"
                return current_mgr, missing_name, chain_str

        return None, None, "No manager chain"

    for _, row in enriched.iterrows():
        if row[GCB_COL] == "MD":
            continue  # MDs are the top of a tree; their own manager is out of scope
        missing_id, missing_name, chain_str = find_first_missing_manager(row)
        if missing_id:
            row_dict = row.to_dict()
            row_dict["Missing Manager ID"] = missing_id
            row_dict["Missing Manager Name"] = missing_name
            row_dict["Missing Chain"] = chain_str
            exceptions.append(row_dict)

    if hierarchy:
        hierarchy_df = pd.DataFrame(hierarchy)
    else:
        # No MD rows: keep the expected columns so the summary below still works.
        derived = ["MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB"]
        hierarchy_df = pd.DataFrame(
            columns=list(dict.fromkeys(list(enriched.columns) + derived))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    summary = pd.DataFrame([
        {"Metric": "Total Employees", "Value": len(enriched)},
        {"Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()},
        {"Metric": "Missing Managers Count", "Value": len(exceptions_df)},
    ])
    summary_tables.append(("Overall Summary", summary))

    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # ID -> name map; both columns are already normalised text.
    manager_name_map = dict(zip(enriched[EMP_ID_COL], enriched[EMP_NAME_COL]))

    # Bands are stored as text; compare them numerically.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Count distinct direct reports for each manager whose GCB equals `band`."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        counts["Manager ID"] = counts["Manager ID"].map(clean_id)
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    for band in (3, 4):
        summary_tables.append((f"Direct Reports per GCB{band}", direct_reports_table(band)))

    # -------------------------------
    # WRITE OUTPUT
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Each table gets a one-cell title row, then its data, then a blank row.
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [None]:
# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build the flattened MD hierarchy, list orphaned employees and write the report.

    The "Enriched" sheet of ``input_file`` is read.  Every row whose ``GCB_COL``
    is "MD" becomes the root of a depth-first walk over the rows that name it
    (directly or indirectly) in ``MGR_ID_COL``.  Non-MD rows whose manager ID
    does not appear in ``EMP_ID_COL`` are collected as exceptions.

    Sheets written to ``output_file``:
      * "Hierarchy Report" - one row per employee per MD tree they appear in.
      * "Missing Managers" - only when at least one exception was found.
      * "Summary"          - overall counts, headcount per MD and direct-report
                             counts for managers whose GCB is 3 or 4, stacked.

    Args:
        input_file: Workbook containing the "Enriched" sheet.
        output_file: Path of the workbook to create.

    Returns:
        tuple: ``(hierarchy_df, exceptions_df)`` as written to the workbook.
    """

    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    df_lookup = enriched.set_index(EMP_ID_COL)

    hierarchy = []
    exceptions = []

    def id_key(value):
        """Render an ID as text so that 123, 123.0 and ' 123 ' all compare equal."""
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def gcb_of(emp_id):
        """Return the career band recorded for emp_id, or None for an unknown ID."""
        if emp_id not in df_lookup.index:
            return None
        band = df_lookup.loc[emp_id, GCB_COL]
        # Duplicate employee IDs make .loc return a Series; use the first record.
        return band.iloc[0] if isinstance(band, pd.Series) else band

    def recurse(manager_id, path, ancestors):
        """Append everyone below manager_id to `hierarchy`, depth first.

        `ancestors` holds the IDs on the current descent so that a circular
        reporting line (including a self-managed row) cannot recurse forever.
        """
        manager_gcb = gcb_of(manager_id)
        reports = enriched[enriched[MGR_ID_COL] == manager_id]
        for _, row in reports.iterrows():
            emp_id = row[EMP_ID_COL]
            if emp_id in ancestors:
                continue  # already on this branch -> circular reference
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            recurse(emp_id, record, ancestors | {emp_id})

    # Every MD is the root of its own tree.
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_id = md[EMP_ID_COL]
        md_path = {"MD ID": md_id, "MD Name": md[EMP_NAME_COL]}

        row_dict = md.to_dict()
        row_dict.update({
            "MD ID": md_id,
            "MD Name": md[EMP_NAME_COL],
            "Manager ID": None,
            "Manager Name": None,
            "Manager GCB": None,
        })
        hierarchy.append(row_dict)

        recurse(md_id, md_path, frozenset([md_id]))

    # Exceptions: employees whose manager ID not in employee list
    all_emp_ids = set(enriched[EMP_ID_COL])
    missing_mgr_ids = set(enriched[MGR_ID_COL]) - all_emp_ids
    for _, row in enriched[enriched[MGR_ID_COL].isin(missing_mgr_ids)].iterrows():
        if row[GCB_COL] != "MD":
            exceptions.append(row.to_dict())

    if hierarchy:
        hierarchy_df = pd.DataFrame(hierarchy)
    else:
        # No MD rows: keep the expected columns so the summary below still works.
        derived = ["MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB"]
        hierarchy_df = pd.DataFrame(
            columns=list(dict.fromkeys(list(enriched.columns) + derived))
        )
    exceptions_df = pd.DataFrame(exceptions)

    # -------- Single consolidated Summary sheet --------
    summary_tables = []

    summary = pd.DataFrame([
        {"Metric": "Total Employees", "Value": len(enriched)},
        {"Metric": "Total in Hierarchy", "Value": hierarchy_df[EMP_ID_COL].nunique()},
        {"Metric": "Missing Managers Count", "Value": len(exceptions_df)},
    ])
    summary_tables.append(("Overall Summary", summary))

    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]
    summary_tables.append(("Headcount per MD", per_md))

    # ID -> name map keyed by normalised text (later rows win on duplicate IDs).
    manager_name_map = {}
    if EMP_ID_COL in enriched.columns and EMP_NAME_COL in enriched.columns:
        manager_name_map = {
            id_key(emp_id): str(name).strip()
            for emp_id, name in zip(enriched[EMP_ID_COL], enriched[EMP_NAME_COL])
        }

    # Bands may be stored as text; compare them numerically.
    mgr_gcb_numeric = pd.to_numeric(hierarchy_df["Manager GCB"], errors="coerce")

    def direct_reports_table(band):
        """Count distinct direct reports for each manager whose GCB equals `band`."""
        counts = (
            hierarchy_df.loc[mgr_gcb_numeric == band]
            .groupby("Manager ID")[EMP_ID_COL]
            .nunique()
            .reset_index(name="Direct Reports")
        )
        # "Manager ID" becomes float when MD rows contribute None; id_key
        # removes the resulting ".0" so the name lookup can match.
        counts["Manager ID"] = counts["Manager ID"].map(id_key)
        name_col = f"GCB{band} Name"
        counts[name_col] = counts["Manager ID"].map(manager_name_map).fillna("")
        table = counts[[name_col, "Manager ID", "Direct Reports"]].copy()
        table.columns = [name_col, f"GCB{band} ID", "Direct Reports"]
        return table

    for band in (3, 4):
        summary_tables.append((f"Direct Reports per GCB{band}", direct_reports_table(band)))

    # -------------------------------
    # WRITE OUTPUT (Hierarchy + Missing Managers + ONE Summary sheet stacked)
    # -------------------------------
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        # Each table gets a one-cell title row, then its data, then a blank row.
        start_row = 0
        for title, df in summary_tables:
            pd.DataFrame([[title]]).to_excel(writer, index=False, header=False,
                                             sheet_name="Summary", startrow=start_row)
            start_row += 1
            df.to_excel(writer, index=False, sheet_name="Summary", startrow=start_row)
            start_row += len(df) + 2  # leave a gap

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


In [2]:
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
from pptx.enum.shapes import MSO_CONNECTOR, MSO_SHAPE
from pptx.enum.text import PP_ALIGN

# Builds a single-slide summary deck: a title, three process-step boxes joined
# by connector lines, and a benefits box, saved as Cost_Automation_OnePager.pptx.

# Create presentation
prs = Presentation()
slide_layout = prs.slide_layouts[6]  # blank slide
slide = prs.slides.add_slide(slide_layout)

# Title
left, top, width, height = Inches(0.5), Inches(0.2), Inches(9), Inches(1)
textbox = slide.shapes.add_textbox(left, top, width, height)
tf = textbox.text_frame
tf.text = "Cost Template Automation – Saving 144 Hours Annually"
p = tf.paragraphs[0]
p.font.size = Pt(28)
p.font.bold = True
p.font.color.rgb = RGBColor(0, 51, 102)

# Process flow boxes: (caption, left edge in inches)
process_steps = [
    ("Step 1: Data Preparation\nTool: CostTemplate_Data_Prep_Tool_KS", 0.5),
    ("Step 2: Report Generation\nTool: Cost_Templates_Gen_Tool_KS", 3.5),
    ("Step 3: Finalization\nTool: PasteAsValuesUtility", 6.5)
]

for text, left_in in process_steps:
    shape = slide.shapes.add_shape(
        MSO_SHAPE.ROUNDED_RECTANGLE,
        Inches(left_in), Inches(2), Inches(2.8), Inches(1.5)
    )
    shape.fill.solid()
    shape.fill.fore_color.rgb = RGBColor(91, 155, 213)
    shape.text = text
    for p in shape.text_frame.paragraphs:
        p.font.size = Pt(14)
        p.font.color.rgb = RGBColor(255, 255, 255)
        # The raw value 1 used previously is PP_ALIGN.LEFT; centre is a
        # different member, so name it explicitly.
        p.alignment = PP_ALIGN.CENTER

# Connector lines bridging the 0.2" gap between neighbouring boxes, at the
# boxes' vertical midpoint (2" top + 1.5" / 2).  A straight connector is drawn
# without an arrowhead.
for i in range(2):
    slide.shapes.add_connector(
        MSO_CONNECTOR.STRAIGHT,
        Inches(3.3 + i*3), Inches(2.75), Inches(3.5 + i*3), Inches(2.75)
    )

# Benefits box
shape = slide.shapes.add_shape(
    MSO_SHAPE.RECTANGLE,
    Inches(0.5), Inches(4), Inches(8.5), Inches(2)
)
shape.fill.solid()
shape.fill.fore_color.rgb = RGBColor(237, 125, 49)
shape.text = (
    "Benefits:\n"
    "• Saves 12 hours per month (~144 hours annually)\n"
    "• Eliminates manual TM1 refreshes & errors\n"
    "• Ensures consistent, validated dashboards\n"
    "• Ready-to-share lightweight reports"
)
for p in shape.text_frame.paragraphs:
    p.font.size = Pt(16)
    p.font.color.rgb = RGBColor(255, 255, 255)

# Save file
prs.save("Cost_Automation_OnePager.pptx")
print("File Saved")

File Saved


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
# Column names shared by phase_one and phase_two.
# Employee identifier; phase_one joins the monthly file to the GHA sheet on it.
EMP_ID_COL = "Employee ID"
EMP_NAME_COL = "Employee Name"
EMP_EMAIL_COL = "Employee Business Email Address"
# Employee ID of each row's manager; the link phase_two follows between rows.
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
MGR_NAME_COL = "Entity Manager Employee Name"
# Career band; rows whose band equals "MD" are the roots of phase_two's trees.
GCB_COL = "Global Career Band"


# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase_one(gha_file, monthly_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with GHA details and save the result to Excel.

    The monthly file is left-joined to selected GHA columns on ``EMP_ID_COL``,
    then each row receives a "Manager GCB" column looked up from GHA through
    ``MGR_ID_COL``.  Rows whose ``GCB_COL`` is empty after the join are also
    written to a "Missing in GHA" sheet.

    Args:
        gha_file: Excel workbook with a "Headcount - Employee Detail" sheet.
        monthly_file: CSV file (read as ISO-8859-1) holding at least
            ``EMP_ID_COL`` and ``MGR_ID_COL``.
        output_file: Path of the workbook to create.

    Returns:
        pandas.DataFrame: the enriched frame written to the "Enriched" sheet.

    Raises:
        KeyError: if a required column is absent from either input.
    """

    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Columns copied from GHA (add more here if needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]

    # Fail early, naming the file and the columns, instead of deep inside a merge.
    absent_gha = [col for col in gha_columns if col not in gha.columns]
    if absent_gha:
        raise KeyError(f"GHA sheet is missing required columns: {absent_gha}")
    absent_monthly = [col for col in (EMP_ID_COL, MGR_ID_COL) if col not in monthly.columns]
    if absent_monthly:
        raise KeyError(f"Monthly file is missing required columns: {absent_monthly}")

    # NOTE(review): an employee with several differing GHA records still
    # produces one output row per record here - confirm that is intended.
    gha_subset = gha[gha_columns].drop_duplicates()

    # Merge monthly + gha
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA")
    )

    # Manager GCB lookup.  gha_subset is only unique across all nine columns,
    # so its two-column projection can repeat a manager ID; without the
    # de-duplication below the merge would multiply every report of such a
    # manager.  The first GHA record for each manager is used.
    mgr_gcb = (
        gha_subset[[EMP_ID_COL, GCB_COL]]
        .drop_duplicates(subset=EMP_ID_COL, keep="first")
        .rename(columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Manager GCB"})
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def phase_two(input_file="phase1_enriched.xlsx", output_file="hierarchy_report.xlsx"):
    """Build flattened hierarchy with unlimited levels + summary + exceptions.

    Parameters
    ----------
    input_file : str or path-like
        Workbook containing an "Enriched" sheet.
    output_file : str or path-like
        Workbook to write: "Hierarchy Report", optional "Missing Managers",
        "Summary", "MD Headcount", "GCB3 Reports" and "GCB4 Reports" sheets.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The flattened hierarchy (one row per employee per MD they sit under)
        and the non-MD employees whose manager ID is not an employee ID.
    """
    enriched = pd.read_excel(input_file, sheet_name="Enriched")
    enriched.columns = enriched.columns.str.strip()

    # One GCB per employee ID so that .loc always yields a scalar even when
    # the enriched sheet repeats an employee.
    gcb_by_id = enriched.drop_duplicates(subset=[EMP_ID_COL]).set_index(EMP_ID_COL)[GCB_COL]

    # Direct reports grouped once by manager ID instead of re-filtering the
    # whole frame for every node; rows with a blank manager ID are left out,
    # exactly as an equality filter would leave them out.
    reports_by_mgr = {
        mgr_id: group for mgr_id, group in enriched.groupby(MGR_ID_COL, sort=False)
    }

    hierarchy = []

    def recurse(manager_id, path, ancestors):
        """Recursively walk down the hierarchy from a manager.

        ``ancestors`` holds the IDs already on the current chain; a report
        whose ID is in it (self-managed employee, circular reporting line) is
        skipped so the walk always terminates.
        """
        reports = reports_by_mgr.get(manager_id)
        if reports is None:
            return
        manager_gcb = gcb_by_id.loc[manager_id] if manager_id in gcb_by_id.index else None
        for _, row in reports.iterrows():
            emp_id = row[EMP_ID_COL]
            if emp_id in ancestors:
                continue
            record = path.copy()
            record.update({
                "Manager ID": manager_id,
                "Manager Name": row[MGR_NAME_COL],
                "Manager GCB": manager_gcb,
            })
            # keep ALL columns from enriched
            record.update(row.to_dict())
            hierarchy.append(record)

            # recurse further down
            recurse(emp_id, record, ancestors | {emp_id})

    # Find all MDs
    mds = enriched[enriched[GCB_COL] == "MD"]
    for _, md in mds.iterrows():
        md_id = md[EMP_ID_COL]
        md_path = {"MD ID": md_id, "MD Name": md[EMP_NAME_COL]}

        # add MD itself
        row_dict = md.to_dict()
        row_dict.update(md_path)
        row_dict.update({"Manager ID": None, "Manager Name": None, "Manager GCB": None})
        hierarchy.append(row_dict)

        # recurse into MD's reports
        recurse(md_id, md_path, {md_id})

    # Exceptions: employees whose manager ID not in employee list (MDs excluded)
    missing_mgr_ids = set(enriched[MGR_ID_COL]) - set(enriched[EMP_ID_COL])
    orphan_mask = enriched[MGR_ID_COL].isin(missing_mgr_ids) & (enriched[GCB_COL] != "MD")
    exceptions_df = enriched[orphan_mask].reset_index(drop=True)

    if hierarchy:
        hierarchy_df = pd.DataFrame(hierarchy)
    else:
        # No MD found: keep the expected columns so the summaries below still work.
        tag_columns = ["MD ID", "MD Name", "Manager ID", "Manager Name", "Manager GCB"]
        hierarchy_df = pd.DataFrame(
            columns=list(dict.fromkeys([*enriched.columns, *tag_columns]))
        )

    # -------- Summary sheet --------
    summary = {
        "Total Employees": len(enriched),
        "Total in Hierarchy": hierarchy_df[EMP_ID_COL].nunique(),
        "Missing Managers Count": len(exceptions_df),
    }

    per_md = hierarchy_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index()
    per_md.columns = ["MD Name", "Headcount"]

    # Bands may arrive as 3, 3.0 or "3"; non-numeric bands such as "MD" become NaN.
    numeric_band = pd.to_numeric(hierarchy_df[GCB_COL], errors="coerce")

    def direct_reports_table(band, name_label):
        """One row per hierarchy member in ``band`` with their number of direct reports."""
        people = (
            hierarchy_df.loc[numeric_band == band, [EMP_ID_COL, EMP_NAME_COL]]
            .drop_duplicates(subset=[EMP_ID_COL])
        )
        rows = [
            {
                name_label: name,
                "Direct Reports": len(reports_by_mgr[emp_id]) if emp_id in reports_by_mgr else 0,
            }
            for emp_id, name in zip(people[EMP_ID_COL], people[EMP_NAME_COL])
        ]
        return pd.DataFrame(rows, columns=[name_label, "Direct Reports"])

    per_gcb3 = direct_reports_table(3, "GCB3 Name")
    per_gcb4 = direct_reports_table(4, "GCB4 Name")

    # Save all outputs
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        hierarchy_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

        pd.DataFrame([summary]).to_excel(writer, index=False, sheet_name="Summary")
        per_md.to_excel(writer, index=False, sheet_name="MD Headcount")
        per_gcb3.to_excel(writer, index=False, sheet_name="GCB3 Reports")
        per_gcb4.to_excel(writer, index=False, sheet_name="GCB4 Reports")

    print(f"✅ Phase 2 completed → {output_file}")
    return hierarchy_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Inputs: the GHA workbook and the monthly CSV extract.
    gha_file, csv_file = "GHA.xlsx", "Monthly.csv"
    # Outputs: the Phase 1 workbook (also Phase 2's input) and the final report.
    output_file_phase1, final_output = "phase1_enriched.xlsx", "hierarchy_report.xlsx"

    print("Running Phase 1...")
    phase_one(gha_file=gha_file, monthly_file=csv_file, output_file=output_file_phase1)

    print("Running Phase 2...")
    phase_two(input_file=output_file_phase1, output_file=final_output)

    print("✅ Processing complete!")


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
EMP_ID_COL = "Employee ID"  # join key between the monthly CSV and the GHA sheet
EMP_NAME_COL = "Employee Name"  # pulled from GHA; also used to label MD / GCB3 rows
EMP_EMAIL_COL = "Employee Business Email Address"  # pulled from GHA during enrichment
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
MGR_NAME_COL = "Entity Manager Employee Name"  # read from enriched rows when tagging the direct manager
GCB_COL = "Global Career Band"  # compared against "MD" and 3 to pick hierarchy levels

# -------------------------
# Phase 1: Enrichment
# -------------------------
def enrich_monthly_with_gha(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with GHA details and save the result to Excel.

    Parameters
    ----------
    monthly_file : str or path-like
        Monthly CSV, decoded as ISO-8859-1.
    gha_file : str or path-like
        GHA workbook; the "Headcount - Employee Detail" sheet is read.
    output_file : str or path-like
        Workbook to write. It gets an "Enriched" sheet and, when any row has
        no Global Career Band after the merge, a "Missing in GHA" sheet.

    Returns
    -------
    pandas.DataFrame
        The monthly rows left-joined with the selected GHA columns plus a
        "Direct Manager GCB" column looked up through the manager's ID.
    """
    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Select needed columns from GHA (extend as needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    gha_subset = gha[gha_columns].drop_duplicates()

    # Merge monthly + gha
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Manager GCB lookup: exactly one row per manager ID.  gha_subset is only
    # de-duplicated across all selected columns, so a manager whose GHA rows
    # differ in another column would otherwise appear several times here and
    # multiply each of their reports in the merge below.  When a manager
    # carries conflicting bands, the first one encountered is kept.
    mgr_gcb = (
        gha_subset[[EMP_ID_COL, GCB_COL]]
        .drop_duplicates(subset=[EMP_ID_COL])
        .rename(columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Direct Manager GCB"})
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def build_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build flattened hierarchy with MD → GCB3 → Managers → Employees.

    For every MD the report contains: each direct report that is not GCB 3
    (tagged with an empty "GCB3 Name"), each GCB 3 direct report, everyone
    reporting to that GCB 3, and everyone reporting to those people.  Deeper
    levels are not walked.

    Parameters
    ----------
    enriched_df : pandas.DataFrame
        Phase 1 output; must contain the employee/manager ID, name and
        Global Career Band columns named by the module constants.
    output_file : str or path-like
        Workbook to write: "Hierarchy Report", optional "Missing Managers",
        and a "Summary" sheet holding three stacked tables.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        Sorted hierarchy rows, non-MD employees whose manager ID is not an
        employee ID, and the headline summary table.
    """
    tag_columns = ["MD Name", "MD ID", "GCB3 Name", "Direct Manager Name"]
    final_rows = []

    def tagged_row(md, gcb3_name, manager_name, person):
        """Return ``person``'s columns preceded by the hierarchy tag columns."""
        row = {
            "MD Name": md[EMP_NAME_COL],
            "MD ID": md[EMP_ID_COL],
            "GCB3 Name": gcb3_name,
            "Direct Manager Name": manager_name,
        }
        row.update(person.to_dict())   # keep all monthly+gha cols
        return row

    # Accept the band as 3, 3.0 or "3"; non-numeric bands such as "MD" become NaN.
    is_gcb3 = pd.to_numeric(enriched_df[GCB_COL], errors="coerce") == 3

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    # Process each MD
    for _, md in mds.iterrows():
        reports_to_md = enriched_df[MGR_ID_COL] == md[EMP_ID_COL]

        # Employees directly under the MD who are not GCB3.  These are emitted
        # whether or not the MD also has GCB3 reports, so nobody is dropped.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(tagged_row(md, "", emp[MGR_NAME_COL], emp))

        # Process each GCB3
        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            gcb3_name = gcb3[EMP_NAME_COL]

            # Add GCB3's own row
            final_rows.append(tagged_row(md, gcb3_name, gcb3[MGR_NAME_COL], gcb3))

            # Managers (GCB3/4) under this GCB3
            rm_level = enriched_df[enriched_df[MGR_ID_COL] == gcb3[EMP_ID_COL]]
            for _, rm in rm_level.iterrows():
                final_rows.append(tagged_row(md, gcb3_name, rm[MGR_NAME_COL], rm))

                # Employees under this RM
                emps = enriched_df[enriched_df[MGR_ID_COL] == rm[EMP_ID_COL]]
                for _, emp in emps.iterrows():
                    final_rows.append(tagged_row(md, gcb3_name, rm[EMP_NAME_COL], emp))

    # Exceptions: employees with missing managers (excluding MDs)
    missing_mgr_ids = set(enriched_df[MGR_ID_COL]) - set(enriched_df[EMP_ID_COL])
    orphan_mask = enriched_df[MGR_ID_COL].isin(missing_mgr_ids) & (enriched_df[GCB_COL] != "MD")
    exceptions_df = enriched_df[orphan_mask].reset_index(drop=True)

    # Convert to a sorted DataFrame; with no rows, keep the expected columns so
    # the group-bys below do not fail.
    if final_rows:
        final_df = pd.DataFrame(final_rows).sort_values(
            by=["MD Name", "GCB3 Name", "Direct Manager Name", EMP_NAME_COL],
            na_position="last",
        )
    else:
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*tag_columns, *enriched_df.columns]))
        )

    # -------------------------
    # Build summary sheet
    # -------------------------
    summary_df = pd.DataFrame({
        "Metric": [
            "Total employees in enriched file",
            "Employees missing in GHA",
            "Employees with missing manager (non-MD)",
        ],
        "Count": [
            len(enriched_df),
            int(enriched_df[GCB_COL].isna().sum()),
            len(exceptions_df),
        ],
    })

    # Per MD
    per_md = final_df.groupby("MD Name")[EMP_ID_COL].nunique().reset_index(name="Employees under MD")
    # Per GCB3
    per_gcb3 = final_df.groupby("GCB3 Name")[EMP_ID_COL].nunique().reset_index(name="Employees under GCB3")

    # Save; the three summary tables are stacked on one sheet with blank rows between them
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")
        summary_df.to_excel(writer, index=False, sheet_name="Summary")
        per_md.to_excel(writer, index=False, sheet_name="Summary", startrow=len(summary_df)+2)
        per_gcb3.to_excel(writer, index=False, sheet_name="Summary", startrow=len(summary_df)+len(per_md)+5)

    print(f"✅ Phase 2 completed → {output_file}")
    return final_df, exceptions_df, summary_df


In [None]:
import pandas as pd

# -------------------------
# Constants
# -------------------------
EMP_ID_COL = "Employee ID"  # join key between the monthly CSV and the GHA sheet
EMP_NAME_COL = "Employee Name"  # pulled from GHA; also used to label MD / GCB3 / manager rows
EMP_EMAIL_COL = "Employee Business Email Address"  # pulled from GHA during enrichment
MGR_ID_COL = "Entity Manager Employee ID"   # must exist in monthly.csv
MGR_NAME_COL = "Entity Manager Employee Name"  # read from enriched rows when tagging the reporting manager
GCB_COL = "Global Career Band"  # compared against "MD" and 3 to pick hierarchy levels

# -------------------------
# Phase 1: Enrichment
# -------------------------
def enrich_monthly_with_gha(monthly_file, gha_file, output_file="phase1_enriched.xlsx"):
    """Enrich the monthly CSV with GHA details and save the result to Excel.

    Parameters
    ----------
    monthly_file : str or path-like
        Monthly CSV, decoded as ISO-8859-1.
    gha_file : str or path-like
        GHA workbook; the "Headcount - Employee Detail" sheet is read.
    output_file : str or path-like
        Workbook to write. It gets an "Enriched" sheet and, when any row has
        no Global Career Band after the merge, a "Missing in GHA" sheet.

    Returns
    -------
    pandas.DataFrame
        The monthly rows left-joined with the selected GHA columns plus a
        "Manager GCB" column looked up through the manager's employee ID.
    """
    # Read monthly file (CSV may have special encoding)
    monthly = pd.read_csv(monthly_file, encoding="ISO-8859-1")
    gha = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Strip column names
    monthly.columns = monthly.columns.str.strip()
    gha.columns = gha.columns.str.strip()

    # Select needed columns from GHA (add more here if needed)
    gha_columns = [
        EMP_ID_COL,
        EMP_NAME_COL,
        EMP_EMAIL_COL,
        GCB_COL,
        "Company",
        "Department",
        "Job Function",
        "Legal Entity Name",
        "Employee Status",
    ]
    gha_subset = gha[gha_columns].drop_duplicates()

    # Merge monthly + gha
    merged = monthly.merge(
        gha_subset,
        on=EMP_ID_COL,
        how="left",
        suffixes=("", "_GHA"),
    )

    # Manager GCB lookup: exactly one row per manager ID.  gha_subset is only
    # de-duplicated across all selected columns, so a manager whose GHA rows
    # differ in another column would otherwise appear several times here and
    # multiply each of their reports in the merge below.  When a manager
    # carries conflicting bands, the first one encountered is kept.
    mgr_gcb = (
        gha_subset[[EMP_ID_COL, GCB_COL]]
        .drop_duplicates(subset=[EMP_ID_COL])
        .rename(columns={EMP_ID_COL: MGR_ID_COL, GCB_COL: "Manager GCB"})
    )
    merged = merged.merge(mgr_gcb, on=MGR_ID_COL, how="left")

    # Capture missing GHA matches
    missing = merged[merged[GCB_COL].isna()]

    # Save Phase 1
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 completed → {output_file}")
    return merged


# -------------------------
# Phase 2: Build Hierarchy
# -------------------------
def build_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build flattened hierarchy with MD → GCB3 → Managers → Employees.

    For every MD the report contains: each direct report that is not GCB 3
    (tagged with an empty "GCB3 Name"), each GCB 3 direct report, everyone
    reporting to that GCB 3, and everyone reporting to those people.  Deeper
    levels are not walked.  Every row carries the name and band of the
    manager that the row's employee reports to.

    Parameters
    ----------
    enriched_df : pandas.DataFrame
        Phase 1 output; must contain the employee/manager ID, name and
        Global Career Band columns named by the module constants.  A
        "Manager GCB" column is used when present.
    output_file : str or path-like
        Workbook to write: "Hierarchy Report" and optional "Missing Managers".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sorted hierarchy rows and the non-MD employees whose manager ID is
        not an employee ID.
    """
    tag_columns = [
        "MD Name",
        "MD ID",
        "GCB3 Name",
        "Reporting Manager Name",
        "Reporting Manager GCB",
    ]
    final_rows = []

    def tagged_row(md, gcb3_name, manager_name, manager_gcb, person):
        """Return ``person``'s columns preceded by the hierarchy tag columns."""
        row = {
            "MD Name": md[EMP_NAME_COL],
            "MD ID": md[EMP_ID_COL],
            "GCB3 Name": gcb3_name,
            "Reporting Manager Name": manager_name,
            "Reporting Manager GCB": manager_gcb,
        }
        row.update(person.to_dict())
        return row

    # Accept the band as 3, 3.0 or "3"; non-numeric bands such as "MD" become NaN.
    is_gcb3 = pd.to_numeric(enriched_df[GCB_COL], errors="coerce") == 3

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    # Process each MD
    for _, md in mds.iterrows():
        reports_to_md = enriched_df[MGR_ID_COL] == md[EMP_ID_COL]

        # Employees directly under the MD who are not GCB3.  These are emitted
        # whether or not the MD also has GCB3 reports, so nobody is dropped.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(
                tagged_row(md, "", emp[MGR_NAME_COL], emp.get("Manager GCB", ""), emp)
            )

        # Process each GCB3
        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            gcb3_name = gcb3[EMP_NAME_COL]

            # Add GCB3's own row
            final_rows.append(
                tagged_row(md, gcb3_name, gcb3[MGR_NAME_COL], gcb3.get("Manager GCB", ""), gcb3)
            )

            # Managers (GCB3/4) under this GCB3
            rm_level = enriched_df[enriched_df[MGR_ID_COL] == gcb3[EMP_ID_COL]]
            for _, rm in rm_level.iterrows():
                # The RM's reporting manager is the person they report to,
                # taken from the RM's own manager columns like the rows above
                # (not the RM themselves).
                final_rows.append(
                    tagged_row(md, gcb3_name, rm[MGR_NAME_COL], rm.get("Manager GCB", ""), rm)
                )

                # Employees under this RM
                emps = enriched_df[enriched_df[MGR_ID_COL] == rm[EMP_ID_COL]]
                for _, emp in emps.iterrows():
                    final_rows.append(
                        tagged_row(md, gcb3_name, rm[EMP_NAME_COL], rm.get(GCB_COL, ""), emp)
                    )

    # Exceptions: employees with missing managers (excluding MDs)
    missing_mgr_ids = set(enriched_df[MGR_ID_COL]) - set(enriched_df[EMP_ID_COL])
    orphan_mask = enriched_df[MGR_ID_COL].isin(missing_mgr_ids) & (enriched_df[GCB_COL] != "MD")
    exceptions_df = enriched_df[orphan_mask].reset_index(drop=True)

    # Convert to a sorted DataFrame; with no rows, return an empty frame that
    # still has the expected columns instead of failing on the sort keys.
    if final_rows:
        final_df = pd.DataFrame(final_rows).sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", EMP_NAME_COL],
            na_position="last",
        )
    else:
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*tag_columns, *enriched_df.columns]))
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 completed → {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input files: monthly CSV extract and GHA workbook.
    monthly_file, gha_file = "monthly.csv", "gha.xlsx"

    # Phase 1 feeds its merged frame straight into Phase 2.
    enriched = enrich_monthly_with_gha(monthly_file=monthly_file, gha_file=gha_file)
    build_hierarchy(enriched_df=enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx", encoding=None):
    """Enrich the monthly CSV with GHA details and save the result to Excel.

    Parameters
    ----------
    monthly_file : str or path-like
        Monthly CSV.
    gha_file : str or path-like
        GHA workbook; the "Headcount - Employee Detail" sheet is read.
    output_file : str or path-like
        Workbook to write. It gets an "Enriched" sheet and, when any row has
        no Global Career Band after the merge, a "Missing in GHA" sheet.
    encoding : str, optional
        Passed to ``pandas.read_csv``; ``None`` keeps the pandas default.
        Use e.g. "ISO-8859-1" for exports that are not UTF-8.

    Returns
    -------
    pandas.DataFrame
        The monthly rows left-joined with the selected GHA columns.
    """
    # Read files
    monthly_df = pd.read_csv(monthly_file, encoding=encoding)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Clean column names
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    # Select needed GHA columns
    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]
    # Drop exact repeats so a GHA row that appears twice does not duplicate
    # the matching monthly rows in the merge below.
    gha_df = gha_df[gha_keep].drop_duplicates()

    # Merge
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
    )

    # Capture missing matches
    missing = merged_df[merged_df["Global Career Band"].isna()]

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build the flattened MD → GCB3 → Reporting Manager → Employee report.

    Each output row is one employee preceded by name / ID / email columns for
    the MD, GCB3 and reporting manager above them; a level that does not
    apply is left as empty strings.  For every MD the report contains: each
    direct report that is not GCB 3 (no GCB3 / reporting manager), each GCB 3
    direct report, everyone reporting to that GCB 3 (tagged as their own
    reporting-manager level), and everyone reporting to those people.

    Parameters
    ----------
    enriched_df : pandas.DataFrame
        Phase 1 output containing the columns named in the constants below.
    output_file : str or path-like
        Workbook to write: "Hierarchy Report" and optional "Missing Managers".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sorted hierarchy rows and the non-MD employees whose manager ID is
        not an employee ID.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    # NOTE(review): other cells in this notebook use "Entity Manager Employee ID"
    # for the manager key; confirm which column the monthly file really has.
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    levels = ("MD", "GCB3", "Reporting Manager")
    tag_columns = [f"{level} {field}" for level in levels for field in ("Name", "ID", "Email")]

    def tagged_row(md, gcb3, rm, person):
        """Return ``person``'s columns preceded by the nine level columns.

        ``gcb3`` / ``rm`` may be None, in which case that level is blank.
        """
        row = {}
        for level, holder in zip(levels, (md, gcb3, rm)):
            row[f"{level} Name"] = "" if holder is None else holder[NAME_COL]
            row[f"{level} ID"] = "" if holder is None else holder[ID_COL]
            row[f"{level} Email"] = "" if holder is None else holder[EMAIL_COL]
        row.update(person.to_dict())
        return row

    final_rows = []

    # Accept the band as 3, 3.0 or "3"; non-numeric bands such as "MD" become NaN.
    is_gcb3 = pd.to_numeric(enriched_df[GCB_COL], errors="coerce") == 3

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    # Process each MD
    for _, md in mds.iterrows():
        reports_to_md = enriched_df[MGR_ID_COL] == md[ID_COL]

        # Employees directly under the MD who are not GCB3.  These are emitted
        # whether or not the MD also has GCB3 reports, so nobody is dropped.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(tagged_row(md, None, None, emp))

        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            # GCB3's own row
            final_rows.append(tagged_row(md, gcb3, None, gcb3))

            # Managers (GCB3/4) under this GCB3
            rm_level = enriched_df[enriched_df[MGR_ID_COL] == gcb3[ID_COL]]
            for _, rm in rm_level.iterrows():
                final_rows.append(tagged_row(md, gcb3, rm, rm))

                # Employees under this RM
                emps = enriched_df[enriched_df[MGR_ID_COL] == rm[ID_COL]]
                for _, emp in emps.iterrows():
                    final_rows.append(tagged_row(md, gcb3, rm, emp))

    # Exceptions: employees with missing managers (but not MDs)
    missing_mgr_ids = set(enriched_df[MGR_ID_COL]) - set(enriched_df[ID_COL])
    orphan_mask = enriched_df[MGR_ID_COL].isin(missing_mgr_ids) & (enriched_df[GCB_COL] != "MD")
    exceptions_df = enriched_df[orphan_mask].reset_index(drop=True)

    # Convert to a sorted DataFrame; with no rows, return an empty frame that
    # still has the expected columns instead of failing on the sort keys.
    if final_rows:
        final_df = pd.DataFrame(final_rows).sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            na_position="last",
        )
    else:
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*tag_columns, *enriched_df.columns]))
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input files: monthly CSV extract and GHA workbook.
    monthly_file, gha_file = "monthly.csv", "gha.xlsx"

    # Phase 1 feeds its merged frame straight into Phase 2.
    enriched = phase1_enrich(monthly_file=monthly_file, gha_file=gha_file)
    phase2_hierarchy(enriched_df=enriched)


In [None]:
import pandas as pd

# -------------------------
# Phase 1: Enrichment
# -------------------------
def phase1_enrich(monthly_file, gha_file, output_file="phase1_enriched.xlsx", encoding=None):
    """Enrich the monthly CSV with GHA details and save the result to Excel.

    Parameters
    ----------
    monthly_file : str or path-like
        Monthly CSV.
    gha_file : str or path-like
        GHA workbook; the "Headcount - Employee Detail" sheet is read.
    output_file : str or path-like
        Workbook to write. It gets an "Enriched" sheet and, when any row has
        no Global Career Band after the merge, a "Missing in GHA" sheet.
    encoding : str, optional
        Passed to ``pandas.read_csv``; ``None`` keeps the pandas default.
        Use e.g. "ISO-8859-1" for exports that are not UTF-8.

    Returns
    -------
    pandas.DataFrame
        The monthly rows left-joined with the selected GHA columns.
    """
    # Read files
    monthly_df = pd.read_csv(monthly_file, encoding=encoding)
    gha_df = pd.read_excel(gha_file, sheet_name="Headcount - Employee Detail")

    # Clean column names
    monthly_df.columns = monthly_df.columns.str.strip()
    gha_df.columns = gha_df.columns.str.strip()

    # Select needed GHA columns
    gha_keep = [
        "Employee ID",
        "Employee Name",
        "Employee Business Email Address",
        "Global Career Band",
        "Legal Entity",
    ]
    # Drop exact repeats so a GHA row that appears twice does not duplicate
    # the matching monthly rows in the merge below.
    gha_df = gha_df[gha_keep].drop_duplicates()

    # Merge
    merged_df = monthly_df.merge(
        gha_df,
        on="Employee ID",
        how="left",
        suffixes=("", "_GHA"),
    )

    # Capture missing matches
    missing = merged_df[merged_df["Global Career Band"].isna()]

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        merged_df.to_excel(writer, index=False, sheet_name="Enriched")
        if not missing.empty:
            missing.to_excel(writer, index=False, sheet_name="Missing in GHA")

    print(f"✅ Phase 1 done. Saved to {output_file}")
    return merged_df


# -------------------------
# Phase 2: Flattened Hierarchy
# -------------------------
def phase2_hierarchy(enriched_df, output_file="phase2_hierarchy.xlsx"):
    """Build the flattened MD → GCB3 → Reporting Manager → Employee report.

    Each output row is one employee preceded by name / ID / email columns for
    the MD, GCB3 and reporting manager above them; a level that does not
    apply is left as empty strings.  For every MD the report contains: each
    direct report that is not GCB 3 (no GCB3 / reporting manager), each GCB 3
    direct report, everyone reporting to that GCB 3 (tagged as their own
    reporting-manager level), and everyone reporting to those people.

    Parameters
    ----------
    enriched_df : pandas.DataFrame
        Phase 1 output containing the columns named in the constants below.
    output_file : str or path-like
        Workbook to write: "Hierarchy Report" and optional "Missing Managers".

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Sorted hierarchy rows and the non-MD employees whose manager ID is
        not an employee ID.
    """
    ID_COL = "Employee ID"
    NAME_COL = "Employee Name"
    EMAIL_COL = "Employee Business Email Address"
    # NOTE(review): other cells in this notebook use "Entity Manager Employee ID"
    # for the manager key; confirm which column the monthly file really has.
    MGR_ID_COL = "Manager Employee ID"
    GCB_COL = "Global Career Band"

    levels = ("MD", "GCB3", "Reporting Manager")
    tag_columns = [f"{level} {field}" for level in levels for field in ("Name", "ID", "Email")]

    def tagged_row(md, gcb3, rm, person):
        """Return ``person``'s columns preceded by the nine level columns.

        ``gcb3`` / ``rm`` may be None, in which case that level is blank.
        """
        row = {}
        for level, holder in zip(levels, (md, gcb3, rm)):
            row[f"{level} Name"] = "" if holder is None else holder[NAME_COL]
            row[f"{level} ID"] = "" if holder is None else holder[ID_COL]
            row[f"{level} Email"] = "" if holder is None else holder[EMAIL_COL]
        row.update(person.to_dict())
        return row

    final_rows = []

    # Accept the band as 3, 3.0 or "3"; non-numeric bands such as "MD" become NaN.
    is_gcb3 = pd.to_numeric(enriched_df[GCB_COL], errors="coerce") == 3

    # Find all MDs
    mds = enriched_df[enriched_df[GCB_COL] == "MD"]

    # Process each MD
    for _, md in mds.iterrows():
        reports_to_md = enriched_df[MGR_ID_COL] == md[ID_COL]

        # Employees directly under the MD who are not GCB3.  These are emitted
        # whether or not the MD also has GCB3 reports, so nobody is dropped.
        for _, emp in enriched_df[reports_to_md & ~is_gcb3].iterrows():
            final_rows.append(tagged_row(md, None, None, emp))

        for _, gcb3 in enriched_df[reports_to_md & is_gcb3].iterrows():
            # GCB3's own row
            final_rows.append(tagged_row(md, gcb3, None, gcb3))

            # Managers (GCB3/4) under this GCB3
            rm_level = enriched_df[enriched_df[MGR_ID_COL] == gcb3[ID_COL]]
            for _, rm in rm_level.iterrows():
                final_rows.append(tagged_row(md, gcb3, rm, rm))

                # Employees under this RM
                emps = enriched_df[enriched_df[MGR_ID_COL] == rm[ID_COL]]
                for _, emp in emps.iterrows():
                    final_rows.append(tagged_row(md, gcb3, rm, emp))

    # Exceptions: employees with missing managers (but not MDs)
    missing_mgr_ids = set(enriched_df[MGR_ID_COL]) - set(enriched_df[ID_COL])
    orphan_mask = enriched_df[MGR_ID_COL].isin(missing_mgr_ids) & (enriched_df[GCB_COL] != "MD")
    exceptions_df = enriched_df[orphan_mask].reset_index(drop=True)

    # Convert to a sorted DataFrame; with no rows, return an empty frame that
    # still has the expected columns instead of failing on the sort keys.
    if final_rows:
        final_df = pd.DataFrame(final_rows).sort_values(
            by=["MD Name", "GCB3 Name", "Reporting Manager Name", NAME_COL],
            na_position="last",
        )
    else:
        final_df = pd.DataFrame(
            columns=list(dict.fromkeys([*tag_columns, *enriched_df.columns]))
        )

    # Save
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        final_df.to_excel(writer, index=False, sheet_name="Hierarchy Report")
        if not exceptions_df.empty:
            exceptions_df.to_excel(writer, index=False, sheet_name="Missing Managers")

    print(f"✅ Phase 2 done. Flattened hierarchy saved to {output_file}")
    return final_df, exceptions_df


# -------------------------
# Run pipeline
# -------------------------
if __name__ == "__main__":
    # Input files: monthly CSV extract and GHA workbook.
    monthly_file, gha_file = "monthly.csv", "gha.xlsx"

    # Phase 1 feeds its merged frame straight into Phase 2.
    enriched = phase1_enrich(monthly_file=monthly_file, gha_file=gha_file)
    phase2_hierarchy(enriched_df=enriched)


In [17]:
import pandas as pd

# Column layout of the sample org chart: employee, their band, and their manager.
columns = [
    "Employee ID", "Employee Name", "Global Career Band",
    "Entity Manager Employee ID", "Entity Manager Employee Name"
]

# Sample data: one record per employee, in the column order above.
data = [
    ["101", "Alice",    "5", "201", "Bob"],
    ["201", "Bob",      "4", "301", "Charlie"],
    ["301", "Charlie",  "3", "401", "Dana"],
    ["401", "Dana",     "2", None,  None],
    ["302", "Eva",      "5", "201", "Bob"],
    ["304", "Kirti",    "5", "305", "Inish"],
    ["307", "Anshul",   "4", "305", "Inish"],
    ["308", "Sowmya",   "5", "307", "Anshul"],
    ["305", "Inish",    "3", "306", "Vilma"],
    ["306", "Vilma",    "2", "504", "Abhishek"],
]

df = pd.DataFrame(data, columns=columns)

# Employee ID -> {column: value} snapshot, taken before the output columns are added.
employee_lookup = df.set_index("Employee ID").to_dict(orient="index")

# Blank output columns for the nearest GCB 4 and GCB 3 managers (ID then name per band).
for band in ("4", "3"):
    for field in ("ID", "Name"):
        df[f"Entity Manager Employee {field}_GCB {band}"] = ""

# 🔁 Updated logic: Climb upward until GCB 4 & 3 found
def trace_managers_gcb_4_and_3(start_id, lookup=None):
    """Climb the manager chain from ``start_id`` and report the first GCB 4 and GCB 3 met.

    The employee identified by ``start_id`` is checked first, then that
    employee's "Entity Manager Employee ID", and so on upward.

    Parameters
    ----------
    start_id
        Employee ID at which the climb starts.
    lookup : dict, optional
        Mapping of employee ID to a record dict providing the keys
        "Employee Name", "Global Career Band" and
        "Entity Manager Employee ID". When omitted, the module-level
        ``employee_lookup`` is used.

    Returns
    -------
    tuple
        ``(gcb4, gcb3)``. Each element is an ``(employee_id, employee_name)``
        tuple for the first employee on the chain whose band is "4" / "3",
        or ``None`` if no such employee was reached.
    """
    if lookup is None:
        lookup = employee_lookup

    gcb4 = None
    gcb3 = None
    # IDs already examined; guards against circular reporting lines
    # (A -> B -> A), which would otherwise make the climb loop forever.
    visited = set()

    current_id = start_id
    # Stop on a missing/falsy ID, an ID absent from the lookup, or a revisit.
    while current_id and current_id in lookup and current_id not in visited:
        visited.add(current_id)
        manager = lookup[current_id]
        gcb = str(manager.get("Global Career Band", "")).strip()

        # Only the first match for each band is kept.
        if not gcb4 and gcb == "4":
            gcb4 = (current_id, manager.get("Employee Name"))
        elif not gcb3 and gcb == "3":
            gcb3 = (current_id, manager.get("Employee Name"))

        # Stop if both found
        if gcb4 and gcb3:
            break

        current_id = manager.get("Entity Manager Employee ID")

    return gcb4, gcb3

# Apply to each row
for row_label, direct_manager_id in df["Entity Manager Employee ID"].items():
    # Rows with no direct manager are left with their blank placeholder values
    if pd.isna(direct_manager_id):
        continue

    nearest_gcb4, nearest_gcb3 = trace_managers_gcb_4_and_3(direct_manager_id)

    if nearest_gcb4:
        manager_id, manager_name = nearest_gcb4
        df.loc[row_label, "Entity Manager Employee ID_GCB 4"] = manager_id
        df.loc[row_label, "Entity Manager Employee Name_GCB 4"] = manager_name

    if nearest_gcb3:
        manager_id, manager_name = nearest_gcb3
        df.loc[row_label, "Entity Manager Employee ID_GCB 3"] = manager_id
        df.loc[row_label, "Entity Manager Employee Name_GCB 3"] = manager_name

# Print the whole frame as plain text, without the index
print(df.to_string(index=False))


Employee ID Employee Name Global Career Band Entity Manager Employee ID Entity Manager Employee Name Entity Manager Employee ID_GCB 4 Entity Manager Employee Name_GCB 4 Entity Manager Employee ID_GCB 3 Entity Manager Employee Name_GCB 3
        101         Alice                  5                        201                          Bob                              201                                Bob                              301                            Charlie
        201           Bob                  4                        301                      Charlie                                                                                                  301                            Charlie
        301       Charlie                  3                        401                         Dana                                                                                                                                        
        401          Dana                  2        

In [18]:
# Write the current `df` to the workbook "g1.xlsx", leaving out the index column.
df.to_excel("g1.xlsx", index=False)

In [None]:
wor