In [None]:
import tkinter as tk
from tkinter import filedialog, ttk

def master_input_window():
    result = {}

    def browse_file(entry_widget):
        file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xlsb *.xls")])
        if file_path:
            entry_widget.delete(0, tk.END)
            entry_widget.insert(0, file_path)

    def submit():
        result["file1"] = entry_file1.get()
        result["file2"] = entry_file2.get()
        result["file3"] = entry_file3.get()
        result["year"] = year_var.get()
        result["month"] = month_var.get()
        root.destroy()

    root = tk.Tk()
    root.title("Master Input Window")

    # File 1
    tk.Label(root, text="Select First Excel File:").grid(row=0, column=0, padx=5, pady=5, sticky="e")
    entry_file1 = tk.Entry(root, width=50)
    entry_file1.grid(row=0, column=1, padx=5, pady=5)
    tk.Button(root, text="Browse", command=lambda: browse_file(entry_file1)).grid(row=0, column=2, padx=5, pady=5)

    # File 2
    tk.Label(root, text="Select Second Excel File:").grid(row=1, column=0, padx=5, pady=5, sticky="e")
    entry_file2 = tk.Entry(root, width=50)
    entry_file2.grid(row=1, column=1, padx=5, pady=5)
    tk.Button(root, text="Browse", command=lambda: browse_file(entry_file2)).grid(row=1, column=2, padx=5, pady=5)

    # File 3
    tk.Label(root, text="Select Third Excel File:").grid(row=2, column=0, padx=5, pady=5, sticky="e")
    entry_file3 = tk.Entry(root, width=50)
    entry_file3.grid(row=2, column=1, padx=5, pady=5)
    tk.Button(root, text="Browse", command=lambda: browse_file(entry_file3)).grid(row=2, column=2, padx=5, pady=5)

    # Year dropdown
    tk.Label(root, text="Select Year:").grid(row=3, column=0, padx=5, pady=5, sticky="e")
    year_var = tk.StringVar()
    year_dropdown = ttk.Combobox(root, textvariable=year_var, values=[str(y) for y in range(2020, 2031)])
    year_dropdown.grid(row=3, column=1, padx=5, pady=5)
    year_dropdown.current(0)

    # Month dropdown
    tk.Label(root, text="Select Month:").grid(row=4, column=0, padx=5, pady=5, sticky="e")
    month_var = tk.StringVar()
    month_dropdown = ttk.Combobox(root, textvariable=month_var, values=[str(m) for m in range(1, 13)])
    month_dropdown.grid(row=4, column=1, padx=5, pady=5)
    month_dropdown.current(0)

    # Submit button
    tk.Button(root, text="Submit", command=submit).grid(row=5, column=1, pady=10)

    root.mainloop()
    return result


# Example usage:
user_inputs = master_input_window()
print("Returned values:", user_inputs)

file1 = user_inputs["file1"]
file2 = user_inputs["file2"]
file3 = user_inputs["file3"]
year = user_inputs["year"]
month = user_inputs["month"]

print(f"\nFile1: {file1}\nFile2: {file2}\nFile3: {file3}\nYear: {year}\nMonth: {month}")


In [None]:
# === Step 5: Apply formatting to ALL sheets ===
book = load_workbook(temp_xlsx)
header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
header_font = Font(bold=True)

for sheet_name in book.sheetnames:   # 🔹 loop over ALL sheets
    ws = book[sheet_name]

    # Apply header formatting
    for cell in ws[1]:
        cell.font = header_font
        cell.fill = header_fill

    # Set column width = 25
    for col in ws.columns:
        col_letter = col[0].column_letter
        ws.column_dimensions[col_letter].width = 25

    # Add autofilter
    ws.auto_filter.ref = ws.dimensions

book.save(temp_xlsx)


In [None]:
import os
import shutil
import pandas as pd
from tkinter import Tk, filedialog
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
import win32com.client as win32

def update_excel_with_xlsb(dfs_to_update, output_folder="Output"):
    """
    1. Ask user to select an input Excel file (.xlsb).
    2. Copy the file to Output folder with '_output' added.
    3. Replace/add sheets with given DataFrames.
    4. Save final file as .xlsb via Excel COM (to keep size small).
    """

    # === Step 1: Ask user for input file ===
    Tk().withdraw()  # hide main Tk window
    input_file = filedialog.askopenfilename(
        title="Select Input XLSB File",
        filetypes=[("Excel Binary Workbook", "*.xlsb")]
    )
    if not input_file:
        print("❌ No file selected.")
        return

    # === Step 2: Prepare output path ===
    os.makedirs(output_folder, exist_ok=True)
    base_name = os.path.basename(input_file)
    name, ext = os.path.splitext(base_name)
    temp_xlsx = os.path.join(output_folder, f"{name}_temp.xlsx")   # work in .xlsx
    output_xlsb = os.path.join(output_folder, f"{name}_output.xlsb")

    # === Step 3: Copy input file to temp .xlsx ===
    # We must convert to .xlsx because pandas/openpyxl cannot write .xlsb
    print("📂 Reading input .xlsb ...")
    with pd.ExcelFile(input_file, engine="pyxlsb") as xls:
        sheet_names = xls.sheet_names
        with pd.ExcelWriter(temp_xlsx, engine="openpyxl") as writer:
            for sheet in sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet, engine="pyxlsb")
                df.to_excel(writer, sheet_name=sheet, index=False)

    # === Step 4: Update/replace sheets ===
    with pd.ExcelWriter(temp_xlsx, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        for sheet_name, df in dfs_to_update.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    # === Step 5: Apply formatting ===
    book = load_workbook(temp_xlsx)
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
    header_font = Font(bold=True)
    for sheet_name in dfs_to_update.keys():
        if sheet_name in book.sheetnames:
            ws = book[sheet_name]
            for cell in ws[1]:
                cell.font = header_font
                cell.fill = header_fill
            for col in ws.columns:
                col_letter = col[0].column_letter
                ws.column_dimensions[col_letter].width = 25
            ws.auto_filter.ref = ws.dimensions
    book.save(temp_xlsx)

    # === Step 6: Use Excel COM to save as .xlsb ===
    print("💾 Converting to .xlsb ...")
    excel = win32.gencache.EnsureDispatch("Excel.Application")
    excel.Visible = False
    wb = excel.Workbooks.Open(os.path.abspath(temp_xlsx))
    wb.SaveAs(os.path.abspath(output_xlsb), FileFormat=50)  # 50 = xlsb
    wb.Close(SaveChanges=True)
    excel.Quit()

    # Cleanup temp file
    os.remove(temp_xlsx)

    print(f"✅ Done! Saved to: {output_xlsb}")


# === Example usage ===
if __name__ == "__main__":
    dfs_to_update = {
        "UpdatedSheet1": pd.DataFrame({"A": [1, 2], "B": [3, 4]}),
        "NewReport": pd.DataFrame({"X": ["hello", "world"]})
    }
    update_excel_with_xlsb(dfs_to_update)


In [None]:
import win32com.client as win32

def save_dfs_to_xlsb(input_file, dfs_to_update, output_file=None):
    """
    Replace/add multiple sheets in an .xlsb workbook.
    dfs_to_update = { "SheetName1": df1, "SheetName2": df2 }
    """
    excel = win32.gencache.EnsureDispatch("Excel.Application")
    excel.Visible = False

    # Open input file
    wb = excel.Workbooks.Open(input_file)

    for sheet_name, df in dfs_to_update.items():
        # Delete old sheet if exists
        try:
            wb.Sheets(sheet_name).Delete()
        except Exception:
            pass

        # Add new sheet
        ws = wb.Sheets.Add()
        ws.Name = sheet_name

        # Write header
        for i, col in enumerate(df.columns, start=1):
            ws.Cells(1, i).Value = col

        # Write data
        for r in range(len(df)):
            for c in range(len(df.columns)):
                ws.Cells(r+2, c+1).Value = df.iat[r, c]

    # Save as (new copy or overwrite)
    if not output_file:
        output_file = input_file
    wb.SaveAs(output_file, FileFormat=50)  # 50 = xlsb
    wb.Close(SaveChanges=True)
    excel.Quit()


In [None]:
import pandas as pd
import os
import tkinter as tk
from tkinter import filedialog
import shutil

# Get the selected Excel file path
root = tk.Tk()
root.withdraw()

file_path = filedialog.askopenfilename(title="Select Excel File", filetypes=[("Excel Files", "*.xlsx *.xls")])

# Create a copy of the Excel file
file_name = os.path.basename(file_path)
file_name_without_ext = os.path.splitext(file_name)[0]
output_file_name = f"{file_name_without_ext}_output.xlsx"
output_file_path = os.path.join("Output", output_file_name)

output_folder = "Output"
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

shutil.copyfile(file_path, output_file_path)

# Process sheets and write to Excel file
dfs = {
    "Sheet1": pd.DataFrame(...),
    "Sheet2": pd.DataFrame(...),
}

with pd.ExcelWriter(output_file_path, mode="a", if_sheet_exists="replace") as writer:
    for sheet_name, df in dfs.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)



In [None]:
import os
import shutil
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font, PatternFill

def update_excel_with_copy(input_file, dfs_to_update, output_folder="Output"):
    """
    Copy input_file -> output_folder/<name>_output.xlsx,
    then replace/add sheets from dfs_to_update (dict: sheet_name -> DataFrame),
    apply header formatting, set column widths, apply autofilter, freeze header.
    """
    # 1) prepare output path
    os.makedirs(output_folder, exist_ok=True)
    base_name = os.path.basename(input_file)
    name, ext = os.path.splitext(base_name)
    output_file = os.path.join(output_folder, f"{name}_output{ext}")

    # 2) copy input -> output (overwrites existing copy)
    shutil.copy2(input_file, output_file)

    # 3) load workbook with openpyxl
    wb = load_workbook(output_file)

    # 4) for each sheet to update: remove if exists, then create and write DF
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # light blue
    header_font = Font(bold=True)

    for sheet_name, df in dfs_to_update.items():
        # remove existing sheet if present
        if sheet_name in wb.sheetnames:
            ws_old = wb[sheet_name]
            wb.remove(ws_old)

        # create sheet at the end
        ws = wb.create_sheet(title=sheet_name)

        # write dataframe rows (includes header)
        for r in dataframe_to_rows(df, index=False, header=True):
            ws.append(r)

        # formatting header row (row 1)
        # (we assume header exists since header=True above)
        for cell in ws[1]:
            cell.font = header_font
            cell.fill = header_fill

        # set all column widths to 25
        for col_cells in ws.columns:
            if not col_cells:
                continue
            col_letter = col_cells[0].column_letter
            ws.column_dimensions[col_letter].width = 25

        # add autofilter for used range and freeze header row
        max_col = ws.max_column
        max_row = ws.max_row
        if max_col >= 1 and max_row >= 1:
            last_col_letter = ws.cell(row=1, column=max_col).column_letter
            ws.auto_filter.ref = f"A1:{last_col_letter}{max_row}"
            ws.freeze_panes = "A2"

    # 5) save
    wb.save(output_file)
    wb.close()
    return output_file


In [None]:
import os
import shutil
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill

def update_excel_with_copy(input_file, dfs_to_update, output_folder="Output"):
    """
    Create a copy of the input Excel file in the Output folder with '_output' added to its name.
    Replace existing sheets with DataFrames or add new ones if they don't exist.
    Apply formatting (header bold, blue fill, column width, filters).
    """
    # 1. Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # 2. Extract filename and create output path
    base_name = os.path.basename(input_file)
    name, ext = os.path.splitext(base_name)
    output_file = os.path.join(output_folder, f"{name}_output{ext}")

    # 3. Copy the file
    shutil.copy2(input_file, output_file)

    # 4. Load workbook
    book = load_workbook(output_file)

    # 5. Remove sheets that need replacing
    for sheet_name in dfs_to_update.keys():
        if sheet_name in book.sheetnames:
            del book[sheet_name]

    # 6. Write updated/new DataFrames
    with pd.ExcelWriter(output_file, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        writer._book = book   # internal hook, safe in practice
        for sheet_name, df in dfs_to_update.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    # 7. Re-open to apply formatting
    book = load_workbook(output_file)
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # Light Blue
    header_font = Font(bold=True)

    for sheet_name in dfs_to_update.keys():
        ws = book[sheet_name]

        # Apply header formatting
        for cell in ws[1]:
            cell.font = header_font
            cell.fill = header_fill

        # Set column width = 25
        for col in ws.columns:
            col_letter = col[0].column_letter
            ws.column_dimensions[col_letter].width = 25

        # Add autofilter
        ws.auto_filter.ref = ws.dimensions

    # Save final
    book.save(output_file)
    print(f"✅ File saved with formatting to: {output_file}")


In [None]:
import os
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill

# === 1. Input/Output Paths ===
input_file = r"Input\myfile.xlsx"   # user’s selected file
output_folder = "Output"
os.makedirs(output_folder, exist_ok=True)

output_file = os.path.join(output_folder, os.path.basename(input_file))

# === 2. Read / Process Data ===
# Example: read two sheets, modify them, create new df
df1 = pd.read_excel(input_file, sheet_name="Sheet1")
df2 = pd.read_excel(input_file, sheet_name="Sheet2")

# Process / create new dfs (example only)
df1["NewCol"] = "Processed"
df2["NewCol"] = "Updated"
df_new = pd.DataFrame({"A": [1, 2], "B": [3, 4]})

# === 3. Write Back to Output File ===
if not os.path.exists(output_file):
    mode = "w"
    if_sheet_exists = None
else:
    mode = "a"
    if_sheet_exists = "replace"

with pd.ExcelWriter(output_file, engine="openpyxl", mode=mode, if_sheet_exists=if_sheet_exists) as writer:
    df1.to_excel(writer, sheet_name="Sheet1", index=False)   # replace old
    df2.to_excel(writer, sheet_name="Sheet2", index=False)   # replace old
    df_new.to_excel(writer, sheet_name="New_Sheet", index=False)  # new sheet

# === 4. Formatting Headers, Width, Filters ===
wb = load_workbook(output_file)

for sheet_name in wb.sheetnames:
    ws = wb[sheet_name]

    # Header formatting
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
    for cell in ws[1]:
        cell.font = Font(bold=True)
        cell.fill = header_fill

    # Column width = 25
    for col in ws.columns:
        col_letter = col[0].column_letter
        ws.column_dimensions[col_letter].width = 25

    # Autofilter
    max_col = ws.max_column
    max_row = ws.max_row
    ws.auto_filter.ref = f"A1:{ws.cell(row=1, column=max_col).column_letter}{max_row}"

    # Freeze header row
    ws.freeze_panes = "A2"

# Save back
wb.save(output_file)


In [None]:
# --- Prepare billed CCs from base_df_updated ---
billed_cc_map = (
    base_df_updated.loc[
        base_df_updated['Category for Static'].astype(str).str.lower() == 'billed',
        ['Cost center']
    ]
    .assign(Cost_center_str=lambda d: d['Cost center'].astype(str))  # helper string col
)

# --- Prepare gsc_df with helper string col for matching ---
gsc_df['_CostCentre_str'] = gsc_df['Cost Centre'].astype(str)

# --- Prepare cognos_df with helper string col ---
cognos_df['_CostCode_str'] = cognos_df['Cost Code'].astype(str)

# --- Lookup step 1: mark whether Cost Centre is in billed set ---
billed_set = set(billed_cc_map['Cost_center_str'])
gsc_df['Lookup for Billed CC'] = gsc_df['_CostCentre_str'].apply(
    lambda cc: gsc_df.loc[gsc_df['_CostCentre_str'] == cc, 'Cost Centre'].iloc[0]
    if cc in billed_set else "N/A"
)

# --- Lookup step 2: fetch Manager Ops from Cognos using string keys ---
manager_ops_map = dict(zip(cognos_df['_CostCode_str'], cognos_df['Manager Operations']))
gsc_df['Manager Ops'] = gsc_df['_CostCentre_str'].map(manager_ops_map).fillna("N/A")

# --- Cleanup helper cols ---
gsc_df.drop(columns=['_CostCentre_str'], inplace=True)
cognos_df.drop(columns=['_CostCode_str'], inplace=True)


In [None]:
# === STEP 6: Always add a separator column before new block ===
sep_base = "-"
sep_name = sep_base
suffix = 1
while sep_name in base_df_updated.columns:
    sep_name = f"{sep_base}_set{suffix}"
    suffix += 1

base_df_updated.insert(len(base_df_updated.columns), sep_name, pd.NA)


In [9]:
import pandas as pd

def add_lookup_block(df):
    # The new block of columns (with same headers every time)
    new_block = pd.DataFrame({
        "-": [pd.NA] * len(df),
        "DEP_CODE": [pd.NA] * len(df),
        "AID (Billed)": [pd.NA] * len(df),
        "EXCLUDED": [pd.NA] * len(df),
        "EXCLUDED_REASON": [pd.NA] * len(df),
        "Check_amd": [pd.NA] * len(df),
        "Con_amd": [pd.NA] * len(df),
        "Observations_amd": [pd.NA] * len(df),
    })
    a
    
    # Concat ensures block is appended at the end
    return pd.concat([df, new_block], axis=1)

# # Example base df
# base_df = pd.DataFrame({
#     "PSID": [101, 102],
#     "Name": ["A", "B"]
# })

# First run
# df1 = add_lookup_block(base_df)
# Second run
df1 = pd.read_excel("output.xlsx")
df2 = add_lookup_block(df1)

df2.to_excel("output.xlsx", index=False)


In [None]:
import pandas as pd

# === STEP 1: Load the Employee Allocation file ===
alloc_file = "Employee Allocation file.xlsx"
df = pd.read_excel(alloc_file, sheet_name="Employee Allocation", dtype={"EMPLOYEE_ID": str})

# === STEP 2: Select ENTITY values [EDPM, HDPG, HDPI, HSEP] ===
gfc_df = df[df["ENTITY"].isin(["EDPM", "HDPG", "HDPI", "HSEP"])].copy()

# === STEP 3: Save this subset into sheet "GFC" later ===
# (We will export at the end)

# === STEP 4: Separate "Data Correlation – Request Amendment" records ===
amend_df = gfc_df[gfc_df["EXCLUDED_REASON"] == "Data Correlation – Request Amendment"].copy()

# === STEP 5: Delete these rows from gfc_df ===
gfc_df = gfc_df[gfc_df["EXCLUDED_REASON"] != "Data Correlation – Request Amendment"].copy()

# === STEP 6: Lookup from gfc_df into base_df_updated ===
# (Assuming base_df_updated already exists from Part 1)
lookup_cols = ["DEP_CODE", "AID (Billed)", "EXCLUDED", "EXCLUDED_REASON"]

# VLOOKUP style → keep only first match per EMPLOYEE_ID
gfc_unique = gfc_df.drop_duplicates(subset=["EMPLOYEE_ID"], keep="first")

# ---- NEW LOGIC: create a fresh block of columns each run ----
# Count how many lookup blocks already exist (by counting "DEP_CODE" columns)
block_count = sum(col.startswith("DEP_CODE") for col in base_df_updated.columns) + 1

# Add suffix to distinguish this block
suffix = f"_set{block_count}"

# Add lookup columns for this block
for col in lookup_cols:
    new_col = col + suffix
    base_df_updated[new_col] = base_df_updated["PSID"].map(
        gfc_unique.set_index("EMPLOYEE_ID")[col]
    )

# === STEP 7: Add derived columns ===
base_df_updated["Con_amd" + suffix] = (
    base_df_updated["DEP_CODE" + suffix].fillna("") + base_df_updated["AID (Billed)" + suffix].fillna("")
)
base_df_updated["Con_Billing" + suffix] = (
    base_df_updated["Department ID"].fillna("") + base_df_updated["AID"].fillna("")
)

def check_amd(row):
    if not row["Con_amd" + suffix] or not row["Con_Billing" + suffix]:
        return pd.NA
    return row["Con_amd" + suffix] == row["Con_Billing" + suffix]

base_df_updated["Check_amd" + suffix] = base_df_updated.apply(check_amd, axis=1)

# Blank column for observations
base_df_updated["Observations_amd" + suffix] = ""

# === STEP 8: Apply logic for Observations_amd ===
mask1 = base_df_updated["Check_amd" + suffix].isna() & base_df_updated["Con_amd" + suffix].eq("")
base_df_updated.loc[mask1, "Observations_amd" + suffix] = "Resource left HSBC"

mask2 = (base_df_updated["Category for Static"] == "Billed") & (base_df_updated["EXCLUDED" + suffix] == 0)
base_df_updated.loc[mask2, "Observations_amd" + suffix] = "Ok"

mask3 = base_df_updated["Category for Static"].isin(["Not Billed", "Others"]) & (base_df_updated["EXCLUDED" + suffix] == 1)
base_df_updated.loc[mask3, "Observations_amd" + suffix] = "Ok"

# === STEP 9: Export result ===
with pd.ExcelWriter("Phase3_Part2_Output.xlsx", engine="openpyxl") as writer:
    base_df_updated.to_excel(writer, sheet_name="Base Updated", index=False)
    gfc_df.to_excel(writer, sheet_name="GFC", index=False)
    amend_df.to_excel(writer, sheet_name="Data Correlation – Amendments", index=False)


In [1]:
import pandas as pd

# === STEP 1: Load the Employee Allocation file ===
alloc_file = "Employee Allocation file.xlsx"
df = pd.read_excel(alloc_file, sheet_name="Employee Allocation", dtype={"EMPLOYEE_ID": str})

# === STEP 2: Select ENTITY values [EDPM, HDPG, HDPI, HSEP] ===
gfc_df = df[df["ENTITY"].isin(["EDPM", "HDPG", "HDPI", "HSEP"])].copy()

# === STEP 3: Save this subset into sheet "GFC" later ===
# (We will export at the end)

# === STEP 4: Separate "Data Correlation – Request Amendment" records ===
amend_df = gfc_df[gfc_df["EXCLUDED_REASON"] == "Data Correlation – Request Amendment"].copy()

# === STEP 5: Delete these rows from gfc_df ===
gfc_df = gfc_df[gfc_df["EXCLUDED_REASON"] != "Data Correlation – Request Amendment"].copy()

# === STEP 6: Lookup from gfc_df into base_df_updated ===
# (Assuming base_df_updated already exists from Part 1)
lookup_cols = ["DEP_CODE", "AID (Billed)", "EXCLUDED", "EXCLUDED_REASON"]

# VLOOKUP style → keep only first match per EMPLOYEE_ID
gfc_unique = gfc_df.drop_duplicates(subset=["EMPLOYEE_ID"], keep="first")

for col in lookup_cols:
    base_df_updated[col] = base_df_updated["PSID"].map(
        gfc_unique.set_index("EMPLOYEE_ID")[col]
    )

# === STEP 7: Add derived columns ===
base_df_updated["Con_amd"] = (
    base_df_updated["DEP_CODE"].fillna("") + base_df_updated["AID (Billed)"].fillna("")
)
base_df_updated["Con_Billing"] = (
    base_df_updated["Department ID"].fillna("") + base_df_updated["AID"].fillna("")
)

def check_amd(row):
    if not row["Con_amd"] or not row["Con_Billing"]:
        return pd.NA
    return row["Con_amd"] == row["Con_Billing"]

base_df_updated["Check_amd"] = base_df_updated.apply(check_amd, axis=1)

# Blank column for observations
base_df_updated["Observations_amd"] = ""

# === STEP 8: Apply logic for Observations_amd ===
# Case 1: Check_amd = NA and Con_amd = NA or blank → Resource left HSBC
mask1 = base_df_updated["Check_amd"].isna() & base_df_updated["Con_amd"].eq("")
base_df_updated.loc[mask1, "Observations_amd"] = "Resource left HSBC"

# Case 2: Category for Static = Billed and EXCLUDED = 0 → Ok
mask2 = (base_df_updated["Category for Static"] == "Billed") & (base_df_updated["EXCLUDED"] == 0)
base_df_updated.loc[mask2, "Observations_amd"] = "Ok"

# Case 3: Category for Static in [Not Billed, Others] and EXCLUDED = 1 → Ok
mask3 = base_df_updated["Category for Static"].isin(["Not Billed", "Others"]) & (base_df_updated["EXCLUDED"] == 1)
base_df_updated.loc[mask3, "Observations_amd"] = "Ok"

# === STEP 9: Export result ===
with pd.ExcelWriter("Phase3_Part2_Output.xlsx", engine="openpyxl") as writer:
    base_df_updated.to_excel(writer, sheet_name="Base Updated", index=False)
    gfc_df.to_excel(writer, sheet_name="GFC", index=False)
    amend_df.to_excel(writer, sheet_name="Data Correlation – Amendments", index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'Employee Allocation file.xlsx'

In [2]:
import os
import tkinter as tk
from tkinter import filedialog
import win32com.client as win32

# Prompt for folder selection
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory(title="Select Folder with Excel Files")

# Initialize Excel
excel = win32.gencache.EnsureDispatch('Excel.Application')
excel.Visible = False  # Set to True if you want to see Excel working

# Loop through Excel files
for filename in os.listdir(folder_path):
    if filename.endswith(".xlsx") or filename.endswith(".xlsm"):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        try:
            wb = excel.Workbooks.Open(file_path)

            sheet_count = wb.Sheets.Count
            if sheet_count >= 3:
                wb.Sheets(3).Activate()  # Activate 3rd sheet
                wb.Sheets(2).Activate()  # Then activate 2nd sheet

            wb.Save()
            wb.Close(SaveChanges=True)

        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Quit Excel
excel.Quit()
print("Done.")


Processing: output1.xlsx
Processing: output2.xlsx
Processing: output3.xlsx
Done.


In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, ROUND_HALF_UP

# 1. Excel-style rounding function
def round_half_up(series: pd.Series, decimals: int = 2) -> pd.Series:
    return series.apply(lambda x: float(Decimal(str(x)).quantize(Decimal('1.' + '0'*decimals), rounding=ROUND_HALF_UP)))

# 2. Step 1: Filter "Billed" and calculate % without rounding
def calculate_fte_summary(df: pd.DataFrame) -> pd.DataFrame:
    billed_df = df[df['Category for Static'].str.lower() == 'billed'].copy()
    total_fte_by_aid = billed_df.groupby('AID')['No. of FTE'].sum()
    billed_df['Total FTE'] = billed_df['AID'].map(total_fte_by_aid)
    billed_df['Sum of %'] = (billed_df['No. of FTE'] / billed_df['Total FTE']) * 100
    return billed_df

# 3. Step 2: Create pivot table
def create_fte_pivot(billed_df: pd.DataFrame) -> pd.DataFrame:
    billed_df.fillna("Missing", inplace=True)

    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Billing Contact PS ID', 'BP CC1', 'BP CC2'
    ]

    summary_df = billed_df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Sum of %': 'sum'
    })

    summary_df.rename(columns={'No. of FTE': 'Sum of No. of FTE'}, inplace=True)
    summary_df['Sum of % (unrounded)'] = summary_df['Sum of %']
    summary_df['Sum of % (rounded)'] = round_half_up(summary_df['Sum of %'])

    # Replace 'Missing' back to blanks for output
    summary_df['BP CC1'] = summary_df['BP CC1'].replace("Missing", "")

    return summary_df

# 4. Step 3: Adjustment function to ensure total = 100
def adjust_sum_of_percent(summary_df: pd.DataFrame) -> pd.DataFrame:
    summary_df = summary_df.copy()
    summary_df['Final Sum of %'] = summary_df['Sum of % (rounded)']  # start with rounded

    adjusted_rows = []
    for aid, group in summary_df.groupby('AID'):
        rounded_sum = group['Sum of % (rounded)'].sum()
        delta = round(100.00 - rounded_sum, 2)

        # Apply delta to first row only
        idx_to_adjust = group.index[0]
        summary_df.at[idx_to_adjust, 'Final Sum of %'] += delta
        summary_df.at[idx_to_adjust, 'Adjustment'] = delta

        # For validation
        summary_df.loc[group.index, 'Sum of % Total after Adjustment'] = \
            summary_df.loc[group.index, 'Final Sum of %'].sum()

    return summary_df

# 5. Step 4: Main runner function
def process_fte_data(df: pd.DataFrame) -> pd.DataFrame:
    billed_df = calculate_fte_summary(df)
    summary_df = create_fte_pivot(billed_df)
    final_df = adjust_sum_of_percent(summary_df)
    return final_df


In [None]:
def adjust_sum_of_percent(summary_df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensures that Sum of % per AID totals exactly 100 by adjusting the last row per AID.
    """
    # Sort to ensure consistent order
    summary_df = summary_df.sort_values(by=['AID']).reset_index(drop=True)

    # Container for adjusted rows
    adjusted_rows = []

    for aid, group in summary_df.groupby('AID'):
        group = group.copy()
        rounded_total = group['Sum of %'].sum()
        diff = round(100.0 - rounded_total, 2)  # small diff due to rounding

        group['Adjustment'] = 0.0

        if abs(diff) > 0.001:
            # Apply adjustment to last row in this group
            last_idx = group.index[-1]
            group.at[last_idx, 'Adjustment'] = diff
            group.at[last_idx, 'Final Sum of %'] = group.at[last_idx, 'Sum of %'] + diff
        else:
            group['Final Sum of %'] = group['Sum of %']

        adjusted_rows.append(group)

    return pd.concat(adjusted_rows).reset_index(drop=True)
summary_df = create_fte_pivot(your_df)
final_df = adjust_sum_of_percent(summary_df)

In [None]:
import pandas as pd
import numpy as np
from decimal import Decimal, ROUND_HALF_UP

def calculate_fte_summary(df: pd.DataFrame) -> pd.DataFrame:
    # Filter only Billed rows (case-insensitive)
    billed_df = df[df['Category for Static'].str.lower() == 'billed'].copy()

    # Calculate Total FTE per AID
    total_fte_by_aid = billed_df.groupby('AID')['No. of FTE'].sum()
    billed_df['Total FTE'] = billed_df['AID'].map(total_fte_by_aid)

    # Calculate Sum of % = (No. of FTE / Total FTE) * 100
    billed_df['Sum of %'] = (billed_df['No. of FTE'] / billed_df['Total FTE']) * 100

    return billed_df

def round_half_up(series: pd.Series, decimals: int = 2) -> pd.Series:
    # Apply Excel-like rounding to a pandas Series
    return series.apply(lambda x: float(Decimal(str(x)).quantize(Decimal('1.' + '0' * decimals), rounding=ROUND_HALF_UP)))

def create_fte_pivot(df: pd.DataFrame) -> pd.DataFrame:
    billed_df = calculate_fte_summary(df)

    # Fill missing values temporarily to avoid dropping during groupby
    billed_df.fillna("Missing", inplace=True)

    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Billing Contact PS ID', 'BP CC1', 'BP CC2'
    ]

    # Aggregate values
    summary_df = billed_df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Sum of %': 'sum'
    })

    # Rename and apply custom rounding
    summary_df.rename(columns={'No. of FTE': 'Sum of No. of FTE'}, inplace=True)
    summary_df['Sum of % (unrounded)'] = summary_df['Sum of %']
    summary_df['Sum of %'] = round_half_up(summary_df['Sum of %'])

    # Replace 'Missing' in BP CC1 with blank
    summary_df['BP CC1'] = summary_df['BP CC1'].replace("Missing", "")

    return summary_df


In [None]:
import pandas as pd

def calculate_billed_fte_percentage(df: pd.DataFrame) -> pd.DataFrame:
    # Filter only billed rows
    billed_df = df[df['Category for Static'].astype(str).str.strip().str.lower() == 'billed'].copy()

    # Calculate Total FTE per AID
    total_fte_per_aid = billed_df.groupby('AID')['No. of FTE'].sum().rename('Total FTE').reset_index()
    billed_df = billed_df.merge(total_fte_per_aid, on='AID', how='left')

    # Calculate unrounded percentage
    billed_df['Sum of %'] = (billed_df['No. of FTE'] / billed_df['Total FTE']) * 100

    return billed_df

def round_half_up(number, decimals=2):
    factor = 10 ** decimals
    return float(int(number * factor + 0.5)) / factor

def aggregate_fte_summary(billed_df: pd.DataFrame) -> pd.DataFrame:
    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Business Contact PS ID', 'BP CC1', 'BP CC2'
    ]

    billed_df['Rounded Sum of %'] = billed_df['Sum of %'].apply(lambda x: round_half_up(x, 2))

    summary_df = billed_df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Rounded Sum of %': 'sum'
    })

    summary_df.rename(columns={
        'No. of FTE': 'Sum of No. of FTE',
        'Rounded Sum of %': 'Sum of %'
    }, inplace=True)

    return summary_df

def adjust_percentage_to_100(summary_df: pd.DataFrame) -> pd.DataFrame:
    final_rows = []
    for aid, group in summary_df.groupby('AID'):
        temp_df = group.copy()
        sum_percent = temp_df['Sum of %'].sum()
        diff = round_half_up(100.0 - sum_percent, 2)

        temp_df['Adjusted Sum of %'] = temp_df['Sum of %']
        if not temp_df.empty:
            temp_df.iloc[0, temp_df.columns.get_loc('Adjusted Sum of %')] += diff

        temp_df['% Adjustment Done'] = temp_df['Adjusted Sum of %'] - temp_df['Sum of %']
        final_rows.append(temp_df)

    final_df = pd.concat(final_rows, ignore_index=True)
    return final_df

# Usage Example:
# df = pd.read_excel("your_file.xlsx")
# billed_df = calculate_billed_fte_percentage(df)
# summary_df = aggregate_fte_summary(billed_df)
# final_df = adjust_percentage_to_100(summary_df)
# print(final_df.head())
#option 2################

import pandas as pd

def filter_billed_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Filters the DataFrame to include only rows where 'Category for Static' is 'Billed' (case-insensitive).
    """
    return df[df['Category for Static'].astype(str).str.strip().str.lower() == 'billed'].copy()

def calculate_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds Total FTE per AID and calculates the unrounded percentage (Sum of %) for each row.
    """
    total_fte = df.groupby('AID')['No. of FTE'].sum().rename('Total FTE').reset_index()
    df = df.merge(total_fte, on='AID', how='left')
    df['Sum of % (Unrounded)'] = (df['No. of FTE'] / df['Total FTE']) * 100
    return df

def aggregate_fte_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Groups data by the required columns and calculates sum of No. of FTE and Sum of % (Unrounded).
    """
    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Business Contact PS ID', 'BP CC1', 'BP CC2'
    ]
    agg_df = df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Sum of % (Unrounded)': 'sum'
    })
    return agg_df

def round_half_up(number: float, decimals: int = 2) -> float:
    """
    Custom rounding function: rounds to the nearest value using round-half-up rule (like Excel).
    """
    factor = 10 ** decimals
    return float(int(number * factor + 0.5)) / factor

def apply_rounding_and_adjustment(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rounds Sum of % and adjusts first row per AID so total becomes exactly 100.
    """
    df['Sum of % (Rounded)'] = df['Sum of % (Unrounded)'].apply(lambda x: round_half_up(x, 2))
    df['Adjusted Sum of %'] = df['Sum of % (Rounded)']  # Copy for final column

    final_rows = []
    for aid, group in df.groupby('AID'):
        group = group.copy()
        total_rounded = group['Sum of % (Rounded)'].sum()
        diff = round_half_up(100.0 - total_rounded, 2)

        if not group.empty:
            idx = group.index[0]
            group.at[idx, 'Adjusted Sum of %'] += diff
            group['% Adjustment Done'] = group['Adjusted Sum of %'] - group['Sum of % (Rounded)']

        final_rows.append(group)

    return pd.concat(final_rows, ignore_index=True)

def process_fte_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Complete pipeline to filter, calculate percentage, aggregate, and adjust percentages.
    """
    billed_df = filter_billed_data(df)
    billed_df = calculate_percentage(billed_df)
    aggregated_df = aggregate_fte_data(billed_df)
    final_df = apply_rounding_and_adjustment(aggregated_df)
    return final_df



In [None]:
import pandas as pd

def enrich_billing_entity_info(main_df: pd.DataFrame, last_month_df: pd.DataFrame, ospd_df: pd.DataFrame) -> pd.DataFrame:
    # Normalize BPCC and LE_Description
    def normalize_bpcc(series):
        return series.astype(str).str.strip().str.lstrip('0').str.upper()

    main_df['__BPCC_tmp'] = normalize_bpcc(main_df['Business Partner Cost Center'])
    last_month_df['__BPCC_tmp'] = normalize_bpcc(last_month_df['Business Partner Cost Center'])
    ospd_df['__BPCC_tmp'] = normalize_bpcc(ospd_df['CC_ID'])

    # Normalize LE_Description
    ospd_df['__LE_tmp'] = ospd_df['LE_Description'].astype(str).str.strip().str.upper()

    # Initialize columns
    if 'Comment' not in main_df.columns:
        main_df['Comment'] = ""
    main_df['Billing Entity name as per OSPD'] = ""

    # Sets for lookup
    bpcc_last_month_set = set(last_month_df['__BPCC_tmp'])
    bpcc_ospd_set = set(ospd_df['__BPCC_tmp'])

    for idx, row in main_df.iterrows():
        bpcc = row['__BPCC_tmp']

        # Skip IPAC, NaN, or empty BPCC
        if pd.isna(bpcc) or bpcc == "" or bpcc == "IPAC":
            continue

        billing_entity = str(row['Billing Entity']).replace("_", " ").strip().upper()

        # Case 1: BPCC not found in OSPD
        if bpcc not in bpcc_ospd_set:
            main_df.at[idx, 'Comment'] = "No BPCC found in OSPD."
            continue

        # Case 2: New BPCC (not in last month file)
        if bpcc not in bpcc_last_month_set:
            matching_ospd_entries = ospd_df[ospd_df['__BPCC_tmp'] == bpcc]['__LE_tmp'].dropna().unique()
            combined_le = " | ".join(sorted(set(matching_ospd_entries)))

            main_df.at[idx, 'Billing Entity name as per OSPD'] = combined_le

            # Compare with cleaned Billing Entity from main_df
            if billing_entity not in matching_ospd_entries:
                comment = f"New BPCC found. Billing Entity mismatch. Expected from OSPD: {combined_le}"
                main_df.at[idx, 'Comment'] = comment

    # Cleanup temporary columns
    main_df.drop(columns=['__BPCC_tmp'], inplace=True)
    return main_df


In [None]:
import pandas as pd

def flag_duplicate_billing_names(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure required columns exist
    name_col = 'Billing Contact Name'
    psid_col = 'Billing Contact PS ID'
    comment_col = 'Comment'

    if name_col not in df.columns or psid_col not in df.columns:
        raise KeyError("Missing required columns in the dataframe.")

    # Ensure 'Comment' column exists
    if comment_col not in df.columns:
        df[comment_col] = ""

    # --------- Case 1: Multiple Names for Same PS ID ---------
    psid_grouped = df.groupby(psid_col)[name_col].unique().reset_index()

    for _, row in psid_grouped.iterrows():
        psid = row[psid_col]
        names = row[name_col]

        if len(names) <= 1:
            continue

        normalized_names = [str(n).strip().lower() for n in names]

        if all(n == normalized_names[0] for n in normalized_names):
            continue  # All names are same after normalization

        name_names = [n for n in names if '@' not in str(n)]
        email_names = [n for n in names if '@' in str(n)]

        is_likely_same_person = False

        for name in name_names:
            name_words = str(name).lower().split()
            for email in email_names:
                email_lower = str(email).lower()
                if any(word in email_lower for word in name_words):
                    is_likely_same_person = True
                    break
            if is_likely_same_person:
                break

        # Decide the final comment
        if is_likely_same_person:
            final_comment = 'Duplicate name against same PS ID, might be same person.'
        else:
            final_comment = 'Same PS ID has unrelated names (likely different people).'

        # Apply comment (replace or append)
        idxs = df[df[psid_col] == psid].index
        for idx in idxs:
            existing = str(df.at[idx, comment_col])
            if final_comment not in existing:
                df.at[idx, comment_col] = final_comment

    # --------- Case 2: Multiple PS IDs for Same Name ---------
    name_grouped = df.groupby(name_col)[psid_col].nunique().reset_index()
    multi_psid_names = name_grouped[name_grouped[psid_col] > 1][name_col]

    for name in multi_psid_names:
        idxs = df[df[name_col] == name].index
        for idx in idxs:
            existing = str(df.at[idx, comment_col])
            if 'Same contact name appears with multiple PS IDs.' not in existing:
                df.at[idx, comment_col] = 'Same contact name appears with multiple PS IDs.'

    return df


In [None]:
import pandas as pd

def flag_duplicate_billing_names(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure required columns exist
    name_col = 'Billing Contact Name'
    psid_col = 'Billing Contact PS ID'
    comment_col = 'Comment'

    if name_col not in df.columns or psid_col not in df.columns:
        raise KeyError("Missing required columns in the dataframe.")

    # Ensure 'Comment' column exists
    if comment_col not in df.columns:
        df[comment_col] = ""

    # --------- Case 1: Multiple Names for Same PS ID ---------
    psid_grouped = df.groupby(psid_col)[name_col].unique().reset_index()

    for _, row in psid_grouped.iterrows():
        psid = row[psid_col]
        names = row[name_col]

        if len(names) <= 1:
            continue  # Only one name, no duplicates

        name_names = [n for n in names if '@' not in str(n)]
        email_names = [n for n in names if '@' in str(n)]

        match_found = False

        for name in name_names:
            name_words = str(name).lower().split()
            for email in email_names:
                email_lower = str(email).lower()
                if any(word in email_lower for word in name_words):
                    match_found = True
                    idxs = df[(df[psid_col] == psid) & (df[name_col].isin([name, email]))].index
                    for idx in idxs:
                        old_comment = str(df.at[idx, comment_col])
                        new_comment = 'Duplicate name against same PS ID, might be same person.'
                        if new_comment not in old_comment:
                            updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                            df.at[idx, comment_col] = updated_comment

        if not match_found:
            # Add comment for all rows with this PS ID
            idxs = df[df[psid_col] == psid].index
            for idx in idxs:
                old_comment = str(df.at[idx, comment_col])
                new_comment = 'Same BC PS ID found against multiple BC names – unlikely to be same person.'
                if new_comment not in old_comment:
                    updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                    df.at[idx, comment_col] = updated_comment

    # --------- Case 2: Multiple PS IDs for Same Name ---------
    name_grouped = df.groupby(name_col)[psid_col].nunique().reset_index()
    multi_psid_names = name_grouped[name_grouped[psid_col] > 1][name_col]

    for name in multi_psid_names:
        idxs = df[df[name_col] == name].index
        for idx in idxs:
            old_comment = str(df.at[idx, comment_col])
            new_comment = 'Same contact name appears with multiple PS IDs.'
            if new_comment not in old_comment:
                updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                df.at[idx, comment_col] = updated_comment

    return df


In [None]:
import pandas as pd

def calculate_fte_summary(df: pd.DataFrame) -> pd.DataFrame:
    # Filter only Billed rows (case-insensitive)
    billed_df = df[df['Category for Static'].str.lower() == 'billed'].copy()

    # Calculate Total FTE per AID
    total_fte_by_aid = billed_df.groupby('AID')['No. of FTE'].sum()
    billed_df['Total FTE'] = billed_df['AID'].map(total_fte_by_aid)

    # Calculate Sum of % = (No. of FTE / Total FTE) * 100
    billed_df['Sum of %'] = (billed_df['No. of FTE'] / billed_df['Total FTE']) * 100

    # Round and adjust % so sum is exactly 100 per AID
    def adjust_percentage(group):
        group['Sum of %'] = group['Sum of %'].round(2)
        total = group['Sum of %'].sum()
        diff = round(100.00 - total, 2)

        if abs(diff) > 0:  # Only adjust if needed
            # Adjust the last row in the group
            group.iloc[-1, group.columns.get_loc('Sum of %')] += diff

        return group

    billed_df = billed_df.groupby('AID', group_keys=False).apply(adjust_percentage)

    return billed_df

def create_fte_pivot(df: pd.DataFrame) -> pd.DataFrame:
    billed_df = calculate_fte_summary(df)

    # Only fill missing in other fields (not BP CC1 or BP CC2)
    columns_to_fill = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Billing Contact PS ID'
    ]
    billed_df[columns_to_fill] = billed_df[columns_to_fill].fillna("Missing")

    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Billing Contact PS ID', 'BP CC1', 'BP CC2'
    ]

    summary_df = billed_df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Sum of %': 'sum'
    })

    summary_df['Sum of %'] = summary_df['Sum of %'].round(2)
    summary_df.rename(columns={'No. of FTE': 'Sum of No. of FTE'}, inplace=True)

    return summary_df


In [None]:
import pandas as pd

def calculate_fte_summary(df: pd.DataFrame) -> pd.DataFrame:
    # Filter only Billed rows (case-insensitive)
    billed_df = df[df['Category for Static'].str.lower() == 'billed'].copy()

    # Calculate Total FTE per AID
    total_fte_by_aid = billed_df.groupby('AID')['No. of FTE'].sum()
    billed_df['Total FTE'] = billed_df['AID'].map(total_fte_by_aid)

    # Calculate Sum of % = (No. of FTE / Total FTE) * 100
    billed_df['Sum of %'] = (billed_df['No. of FTE'] / billed_df['Total FTE']) * 100

    return billed_df

def create_fte_pivot(df: pd.DataFrame) -> pd.DataFrame:
    # Get summary from your function
    billed_df = calculate_fte_summary(df)

    # Just round 'Sum of %' to 2 decimals and rename column
    billed_df['Sum of %'] = billed_df['Sum of %'].round(2)
    billed_df.rename(columns={'No. of FTE': 'Sum of No. of FTE'}, inplace=True)

    # Only keep the required columns (no grouping or aggregation)
    summary_df = billed_df[
        ['AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
         'Billing Contact PS ID', 'BP CC1', 'BP CC2', 'Sum of No. of FTE', 'Sum of %']
    ].copy()

    return summary_df

## or

def create_fte_pivot(df: pd.DataFrame) -> pd.DataFrame:
    billed_df = calculate_fte_summary(df)

    # Fill missing values temporarily to avoid dropping during groupby
    billed_df.fillna("Missing", inplace=True)

    group_cols = [
        'AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
        'Billing Contact PS ID', 'BP CC1', 'BP CC2'
    ]

    summary_df = billed_df.groupby(group_cols, as_index=False).agg({
        'No. of FTE': 'sum',
        'Sum of %': 'sum'
    })

    summary_df['Sum of %'] = summary_df['Sum of %'].round(2)
    summary_df.rename(columns={'No. of FTE': 'Sum of No. of FTE'}, inplace=True)

    return summary_df


In [None]:
import pandas as pd
import numpy as np

def prepare_fte_summary_table(billed_df: pd.DataFrame) -> pd.DataFrame:
    # Rename Total FTE for clarity
    billed_df['Sum of No. of FTE'] = billed_df['Total FTE']
    
    # Round Sum of % to two decimals
    billed_df['Sum of %'] = billed_df['Sum of %'].round(2)

    # Columns required in final output
    cols = ['AID', 'Billing Entity', 'Business Partner Cost Center', 'Currency',
            'Billing Contact PS ID', 'BP CC1', 'BP CC2', 'Sum of No. of FTE', 'Sum of %']
    output_df = billed_df[cols].copy()

    # Add adjustment column initialized as 0.0
    output_df['Adjustment Applied'] = 0.0

    # Group by AID and check % sum
    for aid, group in output_df.groupby('AID'):
        percent_sum = group['Sum of %'].sum().round(2)
        adjustment_needed = round(100.0 - percent_sum, 2)

        if adjustment_needed != 0.0:
            # Apply the adjustment to the first row of this AID
            idx = group.index[0]
            output_df.at[idx, 'Sum of %'] += adjustment_needed
            output_df.at[idx, 'Sum of %'] = round(output_df.at[idx, 'Sum of %'], 2)
            output_df.at[idx, 'Adjustment Applied'] = adjustment_needed

    return output_df


In [None]:

import pandas as pd

def flag_duplicate_billing_names(df: pd.DataFrame) -> pd.DataFrame:
    # Ensure required columns exist
    name_col = 'Billing Contact Name'
    psid_col = 'Billing Contact PS ID'
    comment_col = 'Comment'

    if name_col not in df.columns or psid_col not in df.columns:
        raise KeyError("Missing required columns in the dataframe.")

    # Ensure 'Comment' column exists
    if comment_col not in df.columns:
        df[comment_col] = ""

    # --------- Case 1: Multiple Names for Same PS ID ---------
    psid_grouped = df.groupby(psid_col)[name_col].unique().reset_index()

    for _, row in psid_grouped.iterrows():
        psid = row[psid_col]
        names = row[name_col]

        name_names = [n for n in names if '@' not in str(n)]
        email_names = [n for n in names if '@' in str(n)]

        for name in name_names:
            name_words = str(name).lower().split()
            for email in email_names:
                email_lower = str(email).lower()
                if any(word in email_lower for word in name_words):
                    idxs = df[(df[psid_col] == psid) & (df[name_col].isin([name, email]))].index
                    for idx in idxs:
                        old_comment = str(df.at[idx, comment_col])
                        new_comment = 'Duplicate name against same PS ID, might be same person.'
                        if new_comment not in old_comment:
                            updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                            df.at[idx, comment_col] = updated_comment

    # --------- Case 2: Multiple PS IDs for Same Name ---------
    name_grouped = df.groupby(name_col)[psid_col].nunique().reset_index()
    multi_psid_names = name_grouped[name_grouped[psid_col] > 1][name_col]

    for name in multi_psid_names:
        idxs = df[df[name_col] == name].index
        for idx in idxs:
            old_comment = str(df.at[idx, comment_col])
            new_comment = 'Same contact name appears with multiple PS IDs.'
            if new_comment not in old_comment:
                updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                df.at[idx, comment_col] = updated_comment

    return df


In [None]:
import pandas as pd

def flag_duplicate_billing_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flags records where a Billing Contact PSID is linked to multiple Billing Contact Names,
    and suggests if the email and name might belong to the same person.
    Also flags records where the same Billing Contact Name is used with multiple PSIDs.
    """

    # Ensure required columns exist
    if 'Billing Contact PSID' not in df.columns or 'Billing Contact name' not in df.columns:
        raise KeyError("Missing required columns in the dataframe.")

    # Ensure 'Comment' column exists
    if 'Comment' not in df.columns:
        df['Comment'] = ""

    # --- Pass 1: Multiple names for same PSID ---
    grouped_by_psid = df.groupby('Billing Contact PSID')['Billing Contact name'].unique().reset_index()

    for _, row in grouped_by_psid.iterrows():
        psid = row['Billing Contact PSID']
        names = row['Billing Contact name']

        name_names = [n for n in names if '@' not in str(n)]
        email_names = [n for n in names if '@' in str(n)]

        for name in name_names:
            name_words = str(name).lower().split()
            for email in email_names:
                email_lower = str(email).lower()
                if any(word in email_lower for word in name_words):
                    idxs = df[(df['Billing Contact PSID'] == psid) & 
                              (df['Billing Contact name'].isin([name, email]))].index
                    for idx in idxs:
                        old_comment = str(df.at[idx, 'Comment'])
                        new_comment = 'Duplicate name against same PSID, might be same person.'
                        if new_comment not in old_comment:
                            updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                            df.at[idx, 'Comment'] = updated_comment

    # --- Pass 2: Same name for multiple PSIDs ---
    grouped_by_name = df.groupby('Billing Contact name')['Billing Contact PSID'].nunique().reset_index()

    for _, row in grouped_by_name.iterrows():
        name = row['Billing Contact name']
        psid_count = row['Billing Contact PSID']
        if psid_count > 1:
            idxs = df[df['Billing Contact name'] == name].index
            for idx in idxs:
                old_comment = str(df.at[idx, 'Comment'])
                new_comment = 'Same contact name appears against multiple PSIDs.'
                if new_comment not in old_comment:
                    updated_comment = f"{old_comment} | {new_comment}".strip(" |")
                    df.at[idx, 'Comment'] = updated_comment

    return df


In [None]:
import pandas as pd

def normalize_billing_entities(main_df, last_month_df):
    # Create working copies
    df = main_df.copy()
    last_df = last_month_df.copy()

    # Standardize column names
    bpcc_col = 'Business Partner Cost Center'
    billing_col = 'Billing Entity'

    # Add helper lowercase columns for internal logic
    df['bpcc_lower'] = df[bpcc_col].astype(str).str.strip().str.lower()
    df['billing_lower'] = df[billing_col].astype(str).str.strip().str.lower()
    last_df['bpcc_lower'] = last_df[bpcc_col].astype(str).str.strip().str.lower()
    last_df['billing_lower'] = last_df[billing_col].astype(str).str.strip().str.lower()

    # Create mapping of last month's BPCC -> billing entity
    last_month_map = last_df.drop_duplicates('bpcc_lower').set_index('bpcc_lower')['billing_lower'].to_dict()
    last_month_orig_case_map = last_df.drop_duplicates('bpcc_lower').set_index('bpcc_lower')[billing_col].to_dict()

    # Create result columns
    df['Original Billing Entity'] = df[billing_col]
    df['Comments'] = ""

    # Step 1: Normalize billing entity values based on last month's file
    grouped = df.groupby('bpcc_lower')['billing_lower'].unique()

    for bpcc, billing_list in grouped.items():
        if len(billing_list) > 1 and bpcc in last_month_map:
            correct = last_month_map[bpcc]
            correct_case = last_month_orig_case_map[bpcc]

            for billing in billing_list:
                if billing != correct:
                    # Check if correct is a substring
                    if correct in billing or billing in correct:
                        # Replace in df
                        mask = (df['bpcc_lower'] == bpcc) & (df['billing_lower'] == billing)
                        df.loc[mask, billing_col] = correct_case
                        df.loc[mask, 'Comments'] = (
                            "Billing Entity replaced based on last month file. Original: " +
                            df.loc[mask, 'Original Billing Entity']
                        )
                    else:
                        # Mark as multiple conflicting entities
                        mask = (df['bpcc_lower'] == bpcc) & (df['billing_lower'] == billing)
                        df.loc[mask, 'Comments'] = "Multiple billing entities found (not matched to last month value)"

    # Step 2: Check for new combinations (bpcc + billing entity)
    df['bpcc_billing_combo'] = df['bpcc_lower'] + '|' + df[billing_col].astype(str).str.strip().str.lower()
    last_df['bpcc_billing_combo'] = last_df['bpcc_lower'] + '|' + last_df[billing_col].astype(str).str.strip().str.lower()

    last_combos = set(last_df['bpcc_billing_combo'])

    # Only check BPCCs not starting with '9'
    mask_non9 = ~df[bpcc_col].astype(str).str.strip().str.startswith('9')
    mask_new_combo = ~df['bpcc_billing_combo'].isin(last_combos)

    df.loc[mask_non9 & mask_new_combo, 'Comments'] = df.loc[mask_non9 & mask_new_combo, 'Comments'].apply(
        lambda x: (x + " | " if x else "") + "New billing cc found"
    )

    # Clean up helper columns
    df.drop(columns=['bpcc_lower', 'billing_lower', 'bpcc_billing_combo'], inplace=True)

    return df


In [None]:
def flag_new_bpce_combinations(main_df, last_month_df):
    # Clean for matching
    def clean_for_key(series):
        return series.astyp`e(str).str.strip().str.lstrip('0')

    # Create temp cleaned BPCC keys
    main_df['Temp_BPCC'] = clean_for_key(main_df['Business Partner Cost Center'])
    last_month_df['Temp_BPCC'] = clean_for_key(last_month_df['Business Partner Cost Center'])

    # Create lookup dictionary from last_month_df
    bpcc_billing_map = last_month_df.dropna(subset=['Billing Entity']).drop_duplicates('Temp_BPCC').set_index('Temp_BPCC')['Billing Entity'].to_dict()

    # Track old values and apply update
    if 'Comment' not in main_df.columns:
        main_df['Comment'] = ''

    updated_rows = main_df['Temp_BPCC'].isin(bpcc_billing_map.keys())
    main_df.loc[updated_rows, 'Old Billing Entity'] = main_df.loc[updated_rows, 'Billing Entity']
    main_df.loc[updated_rows, 'Billing Entity'] = main_df.loc[updated_rows, 'Temp_BPCC'].map(bpcc_billing_map)
    main_df.loc[updated_rows, 'Comment'] += main_df.loc[updated_rows].apply(
        lambda row: f"Billing Entity updated from '{row['Old Billing Entity']}' to '{row['Billing Entity']}'", axis=1
    )

    # Create key after update
    main_df['BPCC_Billing_Key'] = main_df['Temp_BPCC'] + main_df['Billing Entity'].astype(str).str.strip()
    last_month_df['BPCC_Billing_Key'] = last_month_df['Temp_BPCC'] + last_month_df['Billing Entity'].astype(str).str.strip()

    unmatched_mask = ~main_df['BPCC_Billing_Key'].isin(last_month_df['BPCC_Billing_Key'])

    # Add comment for unmatched
    comment_text = 'Business Partner Cost Center & Billing Entity not found in last month file'
    main_df.loc[unmatched_mask & main_df['Comment'].eq(''), 'Comment'] = comment_text
    main_df.loc[unmatched_mask & main_df['Comment'].ne('') & ~main_df['Comment'].str.contains(comment_text, case=False), 'Comment'] += ' | ' + comment_text

    # Clean up
    main_df.drop(columns=['Temp_BPCC', 'BPCC_Billing_Key', 'Old Billing Entity'], errors='ignore', inplace=True)
    last_month_df.drop(columns=['Temp_BPCC', 'BPCC_Billing_Key'], inplace=True)

    return main_df


In [1]:
import pandas as pd
import numpy as np

# Example DataFrame
data = {
    'BP ccenter': [' 00123 ', ' 0456', np.nan, '000789 ', ' abc123 ', None, ' 0000 ']
}

df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Trim spaces but keep leading zeros and NaNs as NaN
df['BP ccenter'] = df['BP ccenter'].apply(lambda x: str(x).strip() if pd.notnull(x) else x)

print("\nAfter trimming spaces (leading zeros retained, NaNs intact):")
print(df)


Original Data:
  BP ccenter
0     00123 
1       0456
2        NaN
3    000789 
4    abc123 
5       None
6      0000 

After trimming spaces (leading zeros retained, NaNs intact):
  BP ccenter
0      00123
1       0456
2        NaN
3     000789
4     abc123
5       None
6       0000


In [None]:
def detect_billing_entity_mismatches(main_df):
    # Step 1: Create cleaned versions from both columns
    def clean_entity(entity):
        entity = str(entity).strip()
        return entity.rsplit('_', 1)[-1].strip().lower()
    
    main_df['Billing Entity (cleaned)'] = main_df['Billing Entity'].apply(clean_entity)
    main_df['Billing Entity from OSPD (cleaned)'] = main_df['Billing Entity name as per OSPD'].apply(clean_entity)

    # Step 2: Compare cleaned columns (case-insensitive)
    mismatch_mask = (
        (main_df['Billing Entity (cleaned)'].notna()) &
        (main_df['Billing Entity from OSPD (cleaned)'].notna()) &
        (main_df['Billing Entity (cleaned)'] != main_df['Billing Entity from OSPD (cleaned)'])
    )

    # Step 3: Add a comment where mismatches occur
    main_df['Billing Entity Mismatch Comment'] = ''
    main_df.loc[mismatch_mask, 'Billing Entity Mismatch Comment'] = (
        'Mismatch after cleaning: '
        'Billing Entity = "' + main_df['Billing Entity'] + '", '
        'OSPD = "' + main_df['Billing Entity name as per OSPD'] + '"'
    )

    return main_df


In [None]:
def add_currency_column(df):
    """
    Adds a 'Currency' column based on GFOC Country and Country columns.
    If both countries match and the country is in predefined list, assign local currency.
    Otherwise, assign 'USD'.
    """
    # Mapping dictionary
    country_currency_map = {
        "Mexico": "MXN",
        "India": "INR",
        "Poland": "PLN",
        "China": "CNY"
    }

    def determine_currency(row):
        if pd.isna(row['GFOC Country']) or pd.isna(row['Country']):
            return "USD"
        if row['GFOC Country'].strip().lower() == row['Country'].strip().lower():
            return country_currency_map.get(row['Country'].strip(), "USD")
        else:
            return "USD"

    df['Currency'] = df.apply(determine_currency, axis=1)
    return df


In [None]:
def fix_multiple_billing_entities(main_df):
    """
    Detects and resolves mismatches in Billing Entity for the same Business Partner Cost Center.
    - Replaces mismatching entries if substring logic applies.
    - Flags all rows when mismatch cannot be resolved.
    
    Returns:
    - updated_df: DataFrame with corrected Billing Entity and comments
    - unresolved_rows_df: DataFrame with unresolved mismatches for logging or review
    """
    df = main_df.copy()

    # Ensure consistent data types
    df['Business Partner Cost Center'] = df['Business Partner Cost Center'].astype(str)
    df['Billing Entity'] = df['Billing Entity'].astype(str)

    # Create comment column if not present
    comment_col = 'Billing Entity Validation Comment'
    if comment_col not in df.columns:
        df[comment_col] = ''

    # Track unresolved rows
    unresolved_rows = []

    # Identify BPCCs linked to multiple billing entities
    multi_bpccs = df.groupby('Business Partner Cost Center')['Billing Entity'].nunique()
    multi_bpccs = multi_bpccs[multi_bpccs > 1].index

    for bpcc in multi_bpccs:
        subset = df[df['Business Partner Cost Center'] == bpcc]
        entities = subset['Billing Entity'].str.lower().unique()
        entity_counts = subset['Billing Entity'].str.lower().value_counts()

        # Try to find a good candidate (underscore & appears multiple times)
        correct_candidates = [e for e in entity_counts.index if '_' in e]
        if correct_candidates:
            correct_entity = correct_candidates[0]
            correct_original = subset[
                subset['Billing Entity'].str.lower() == correct_entity
            ]['Billing Entity'].iloc[0]

            mismatch_unresolved = False

            for ent in entities:
                if ent == correct_entity:
                    continue
                if ent in correct_entity:
                    mask = (
                        (df['Business Partner Cost Center'] == bpcc) &
                        (df['Billing Entity'].str.lower() == ent)
                    )
                    old_vals = df.loc[mask, 'Billing Entity']
                    df.loc[mask, 'Billing Entity'] = correct_original
                    df.loc[mask, comment_col] += (
                        "Billing Entity replaced from '" +
                        old_vals + "' to '" + correct_original +
                        "' based on substring match logic."
                    )
                else:
                    mismatch_unresolved = True

            if mismatch_unresolved:
                mask_all = (df['Business Partner Cost Center'] == bpcc)
                df.loc[mask_all, comment_col] += (
                    "Multiple distinct Billing Entities found against same BPCC."
                )
                unresolved_rows.append(df[mask_all])
        else:
            # No good candidate at all
            mask_all = (df['Business Partner Cost Center'] == bpcc)
            df.loc[mask_all, comment_col] += (
                "Multiple distinct Billing Entities found against same BPCC."
            )
            unresolved_rows.append(df[mask_all])

    # Combine all unresolved into one DataFrame
    if unresolved_rows:
        unresolved_rows_df = pd.concat(unresolved_rows, ignore_index=True)
    else:
        unresolved_rows_df = pd.DataFrame(columns=df.columns)

    return df, unresolved_rows_df


final_df, unresolved_df = fix_multiple_billing_entities(main_df)

# Save if needed
with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode='a') as writer:
    unresolved_df.to_excel(writer, sheet_name='Unresolved Billing Entities', index=False)


In [None]:
def fix_multiple_billing_entities(main_df):
    """
    Detects and resolves cases where the same Business Partner Cost Center (BPCC)
    is associated with multiple Billing Entities. If a mismatched entity is found
    to be a substring of the more common one (typically containing an underscore),
    it is replaced and commented.
    """
    # Make a copy for safe manipulation
    df = main_df.copy()
    
    # Ensure string format
    df['Business Partner Cost Center'] = df['Business Partner Cost Center'].astype(str)
    df['Billing Entity'] = df['Billing Entity'].astype(str)

    # Group by BPCC and find those with multiple billing entities
    multi_entity_bpcc = df.groupby('Business Partner Cost Center')['Billing Entity'].nunique()
    multi_entity_bpcc = multi_entity_bpcc[multi_entity_bpcc > 1].index

    # Create comment column if not already present
    comment_col = 'Billing Entity Validation Comment'
    if comment_col not in df.columns:
        df[comment_col] = ''

    for bpcc in multi_entity_bpcc:
        subset = df[df['Business Partner Cost Center'] == bpcc]
        entities = subset['Billing Entity'].str.lower().unique()

        # Count frequencies to find the most frequent entity with underscores
        entity_counts = subset['Billing Entity'].str.lower().value_counts()
        correct_entities = [e for e in entity_counts.index if '_' in e]
        if not correct_entities:
            continue  # No candidate "correct" entries to fix against

        correct_entity = correct_entities[0]  # Most frequent with "_"
        correct_entity_original_case = subset[
            subset['Billing Entity'].str.lower() == correct_entity
        ]['Billing Entity'].iloc[0]

        # Find mismatched entities that are substrings of correct one
        for ent in entities:
            if ent == correct_entity:
                continue
            if ent in correct_entity:
                # Find rows with this entity and fix
                match_mask = (
                    (df['Business Partner Cost Center'] == bpcc) &
                    (df['Billing Entity'].str.lower() == ent)
                )
                old_value = df.loc[match_mask, 'Billing Entity']
                df.loc[match_mask, 'Billing Entity'] = correct_entity_original_case
                df.loc[match_mask, comment_col] += (
                    "Billing Entity replaced from '" +
                    old_value + "' to '" + correct_entity_original_case +
                    "' based on substring match logic."
                )

            else:
                # Otherwise comment as unresolved mismatch
                mismatch_mask = (
                    (df['Business Partner Cost Center'] == bpcc) &
                    (df['Billing Entity'].str.lower() == ent)
                )
                df.loc[mismatch_mask, comment_col] += (
                    "Multiple distinct Billing Entities found against same BPCC."
                )

    return df


In [None]:
def flag_new_bpce_combinations(main_df, last_month_df):
    # Clean for concatenation purpose only (strip, lstrip zeroes)
    def clean_for_key(series):
        return series.astype(str).str.strip().str.lstrip('0')

    # Create temporary cleaned keys for matching
    main_df['Temp_BPCC'] = clean_for_key(main_df['Business Partner Cost Center'])
    last_month_df['Temp_BPCC'] = clean_for_key(last_month_df['Business Partner Cost Center'])

    main_df['BPCC_Billing_Key'] = main_df['Temp_BPCC'] + main_df['Billing Entity'].astype(str).str.strip()
    last_month_df['BPCC_Billing_Key'] = last_month_df['Temp_BPCC'] + last_month_df['Billing Entity'].astype(str).str.strip()

    # Identify unmatched combinations
    unmatched_mask = ~main_df['BPCC_Billing_Key'].isin(last_month_df['BPCC_Billing_Key'])

    # Comment logic
    comment_text = 'Business Partner Cost Center & Billing Entity not found in last month file'
    if 'Comment' not in main_df.columns:
        main_df['Comment'] = ''

    main_df.loc[unmatched_mask & main_df['Comment'].eq(''), 'Comment'] = comment_text
    main_df.loc[unmatched_mask & main_df['Comment'].ne('') & ~main_df['Comment'].str.contains(comment_text, case=False), 'Comment'] += ' | ' + comment_text

    # Drop temporary keys
    main_df.drop(columns=['Temp_BPCC', 'BPCC_Billing_Key'], inplace=True)
    last_month_df.drop(columns=['Temp_BPCC', 'BPCC_Billing_Key'], inplace=True)

    return main_df


In [None]:
def add_currency_column(main_df):
    # Define country-to-currency mapping
    country_currency_map = {
        'MEXICO': 'MXN',
        'INDIA': 'INR',
        'POLAND': 'PLN',
        'CHINA': 'CNY'
    }

    # Standardize and compare country columns
    def determine_currency(row):
        gfoc_country = str(row.get('GFOC Country', '')).strip().upper()
        country = str(row.get('Country', '')).strip().upper()

        if gfoc_country == country:
            return country_currency_map.get(gfoc_country, 'USD')
        else:
            return 'USD'

    # Apply logic to each row
    main_df['Currency'] = main_df.apply(determine_currency, axis=1)
    return main_df
main_df = add_currency_column(main_df)


In [None]:
import re

def detect_duplicate_billing_names(df):
    if 'Comment2' not in df.columns:
        df['Comment2'] = ''
    
    def extract_name_tokens(text):
        if pd.isna(text):
            return set()
        text = str(text).strip()
        if '@' in text:
            text = text.split('@')[0]  # only local part of email
        text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove digits/symbols
        tokens = text.lower().split()
        return set(tokens)

    # Group by PSID
    psid_groups = df.groupby('Billing Contact PSID')

    for psid, group in psid_groups:
        names = group['Billing Contact Name'].dropna().unique()
        if len(names) <= 1:
            continue  # no conflict

        # Try to see if all names seem to refer to same person
        base_tokens = [extract_name_tokens(name) for name in names]

        conflict = False
        for i in range(len(base_tokens)):
            for j in range(i + 1, len(base_tokens)):
                # If intersection is empty => likely different persons
                if base_tokens[i].isdisjoint(base_tokens[j]):
                    conflict = True
                    break
            if conflict:
                break

        if conflict:
            df.loc[group.index, 'Comment2'] = df.loc[group.index, 'Comment2'].replace('', 'Billing contact has duplicate names', regex=False)
            df.loc[group.index, 'Comment2'] = df.loc[group.index, 'Comment2'].apply(lambda x: x if 'Billing contact has duplicate names' in x else f"{x} | Billing contact has duplicate names")

    return df
main_df = detect_duplicate_billing_names(main_df)


In [None]:
def format_billing_contact_psid(df):
    def format_psid(val):
        val_str = str(val).strip()
        if val_str.upper() == 'IPAC':
            return val_str  # Leave as is
        return val_str.zfill(8) if val_str.isdigit() else val_str

    df['Billing Contact PSID'] = df['Billing Contact PSID'].apply(format_psid)
    return df
# Format the column in your main input file (main_df)
main_df = format_billing_contact_psid(main_df)


In [None]:
import pandas as pd

def check_multiple_billing_entities(df):
    # Ensure columns are strings
    df['Business Partner Cost Center'] = df['Business Partner Cost Center'].astype(str)
    df['Billing Entity'] = df['Billing Entity'].astype(str)
    
    # Create a mapping of cost center to unique billing entities
    cc_entity_group = df.groupby('Business Partner Cost Center')['Billing Entity'].nunique().reset_index()
    cc_entity_group = cc_entity_group[cc_entity_group['Billing Entity'] > 1]

    # List of cost centers with multiple billing entities
    conflict_ccs = cc_entity_group['Business Partner Cost Center'].tolist()

    # Initialize 'Comment2' column if 'Comment' already exists
    if 'Comment' in df.columns:
        if 'Comment2' not in df.columns:
            df['Comment2'] = ''

    # Apply comments row-wise
    def comment_logic(row):
        if row['Business Partner Cost Center'] in conflict_ccs:
            if row['Business Partner Cost Center'].startswith('9'):
                return "Billing cost center begins with digit 9."
            else:
                return "Multiple billing entities found."
        return ""

    # Add comments accordingly
    if 'Comment2' in df.columns:
        df['Comment2'] = df.apply(lambda row: comment_logic(row), axis=1)
    else:
        df['Comment'] = df.apply(
            lambda row: row['Comment'] + " | " + comment_logic(row) if row['Business Partner Cost Center'] in conflict_ccs and row['Comment'] else comment_logic(row),
            axis=1
        )
    
    return df
main_df = check_multiple_billing_entities(main_df)

In [3]:
def add_billing_entity_name_from_ospd(main_df, ospd_df):
    # Ensure both keys are strings for consistent merge
    main_df['Business Partner Cost Center'] = main_df['Business Partner Cost Center'].astype(str).str.lstrip('0').str.upper()
    ospd_df['CC_ID'] = ospd_df['CC_ID'].astype(str).str.lstrip('0').str.upper()

    # Merge to fetch LE_Description as new column
    main_df = main_df.merge(
        ospd_df[['CC_ID', 'LE_Description']].rename(columns={
            'CC_ID': 'Business Partner Cost Center',
            'LE_Description': 'Billing Entity name as per OSPD'
        }),
        on='Business Partner Cost Center',
        how='left'
    )

    return main_df
main_df = add_billing_entity_name_from_ospd(main_df, ospd_df)


Before refresh:
    PSID     Name      Entity Location
0   101    Alice  OldEntity1  OldLoc1
1   102      Bob  NewEntity2  NewLoc2
2   103  Charlie  OldEntity3  OldLoc3
3   104    David  OldEntity4  OldLoc4

After refresh:
    PSID     Name      Entity Location
0   101    Alice  OldEntity1  OldLoc1
1   102      Bob  NewEntity2  NewLoc2
2   103  Charlie  NewEntity3  NewLoc3
3   104    David  NewEntity4  NewLoc4


In [None]:
def reposition_psid_and_position_id(df: pd.DataFrame) -> pd.DataFrame:
    """
    Repositions 'PSID' column to column D (index 3) and 'Position ID' to column E (index 4).
    
    Parameters:
    - df: DataFrame where 'PSID' and 'Position ID' columns exist.

    Returns:
    - Modified DataFrame with swapped column positions.
    """
    try:
        cols = df.columns.tolist()

        if 'PSID' not in cols or 'Position ID' not in cols:
            raise ValueError("Both 'PSID' and 'Position ID' must be present in the dataframe.")

        # Remove them from the list
        cols.remove('PSID')
        cols.remove('Position ID')

        # Insert at desired positions
        cols.insert(3, 'PSID')         # Column D = index 3
        cols.insert(4, 'Position ID')  # Column E = index 4

        df = df[cols]
        print("✅ Successfully repositioned 'PSID' to column D and 'Position ID' to column E.")
        return df

    except Exception as e:
        print("❌ Error while repositioning columns:", e)
        raise
        
        
def refresh_columns_e_to_z(main_df: pd.DataFrame, gha_df: pd.DataFrame) -> pd.DataFrame:
    """
    Refreshes columns E to Z in main_df using the latest data from gha_df based on PSID.

    Parameters:
    - main_df: DataFrame containing the main input file (already filtered on PSID).
    - gha_df: DataFrame of the combined GHA file.

    Returns:
    - Updated main_df with refreshed columns E to Z.
    """

    import string

    try:
        # Get actual column names for Excel columns E to Z
        all_columns = main_df.columns.tolist()
        start_col = 4  # Column E is index 4
        end_col = 25   # Column Z is index 25 (inclusive)

        if len(all_columns) < end_col + 1:
            raise ValueError("Main input file does not have enough columns to cover E to Z (25 columns).")

        cols_to_refresh = all_columns[start_col:end_col + 1]

        # Ensure PSID is in both dataframes
        if 'PSID' not in main_df.columns or 'PSID' not in gha_df.columns:
            raise KeyError("'PSID' column must exist in both main_df and gha_df.")

        # Prepare gha_df with only PSID + the refresh columns (ensure those columns exist in GHA)
        missing_cols = [col for col in cols_to_refresh if col not in gha_df.columns]
        if missing_cols:
            raise ValueError(f"The following columns to refresh are missing in GHA file: {', '.join(missing_cols)}")

        refresh_data = gha_df[['PSID'] + cols_to_refresh].drop_duplicates(subset='PSID')

        # Merge on PSID to bring in new values
        main_df = main_df.drop(columns=cols_to_refresh)
        main_df = main_df.merge(refresh_data, on='PSID', how='left')

        print(f"✅ Columns E to Z refreshed successfully.")
        return main_df

    except Exception as e:
        print("❌ Error while refreshing columns E to Z:", e)
        raise
        
        
        
        
        
main_df = reposition_psid_and_position_id(main_df)
main_df = refresh_columns_e_to_z(main_df, combined_gha_df)


In [None]:
import pandas as pd
from tkinter import Tk, filedialog
import sys

# Initialize Tkinter root
Tk().withdraw()

try:
    # --- Step 1: Load Two GHA Files ---
    print("Select the FIRST GHA file")
    gha_file_1 = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
    
    print("Select the SECOND GHA file")
    gha_file_2 = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])

    # Read both GHA files
    gha_df1 = pd.read_excel(gha_file_1)
    gha_df2 = pd.read_excel(gha_file_2)

    # Combine them
    combined_gha_df = pd.concat([gha_df1, gha_df2], ignore_index=True)

    # Make sure PSID column exists
    if 'PSID' not in combined_gha_df.columns:
        print("Error: 'PSID' column not found in GHA files.")
        input("Press Enter to exit...")
        sys.exit()

    combined_psids = combined_gha_df['PSID'].astype(str).unique()

    # --- Step 2: Load Main Input File ---
    print("Select the MAIN INPUT file (from last phase)")
    main_file = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
    
    main_df = pd.read_excel(main_file)

    if 'PSID' not in main_df.columns:
        print(f"'PSID' column not found in main input file: {main_file}")
        input("Press Enter to exit...")
        sys.exit()

    # Filter only rows where PSID is present in final GHA
    main_df['PSID'] = main_df['PSID'].astype(str)
    filtered_main_df = main_df[main_df['PSID'].isin(combined_psids)].copy()

    print(f"Filtered rows: {len(main_df) - len(filtered_main_df)} rows removed.")
    
    # OPTIONAL: Save filtered result to a temp file for next steps
    output_path = filedialog.asksaveasfilename(defaultextension=".xlsx", title="Save Filtered Main Input File")
    filtered_main_df.to_excel(output_path, index=False)
    print(f"Filtered file saved at: {output_path}")

except Exception as e:
    print("An error occurred:", str(e))
    input("Press Enter to exit...")
    sys.exit()


In [None]:
main input file for this phase is somewhat simplar to what we produced in last phase... as few new columns or some information given in that file will be changes my multiple ppl... after that we need to process that file. ok so thats the input file. 
delete all the psid lines which are not in the latest gha file for the gha file we have to combine the two ghas that will be selected by the user in the beginning of the file .


There are 5 input files one is the main input file which we created in the last phase and the second file is the combination of 2 GHA files that the user will select in the starting of the program and then we have to combine both of them one after another and then it will become a final GHA file that we will refer to. Third is the cognos report, that again user will select in the beginning of the code.  Fourth is the last month billing file. Fifth is the ospd file , which will be in .xlsb format(load only CC-List sheet and columns from that:  CC_ID, LE_Description.
1. delete all those rows from main input file where psid is not present in final gha file.
2. Refresh all columns data from column e to column Z in main input file basis the PSID from the final gha file.
3. in the main input file , update AID column values, fetch AVP, Manager Operations from cognos report based on Cost center of main input file with Cost Code column in cognos report.
4. Create a copy of the  column ‘Billed/Not Billed’ And  name this new column In main input file as ‘Category for Static’. We have to update the values of this column as follows:
	a). if column Employee Class is either ‘Intern’ or ‘External’ and Billed/Not Billed is ‘Not billed’ then ‘Category for Static’  should be set to ‘Others’
	b).if column ‘GCB Grade’ is ‘3’ or ‘MD’ and column ‘Billed/Not Billed’ is ‘Not billed’ then ‘Category for Static’  should be set to ‘Others’.
c). if column ‘GFOC Country’= ‘Electronic Data Process Mexico’ and column ‘Billing Entity’ =’GR_MX_Electronic Data Process Mexico, S.A de C.V.’  and columm ‘Billed/Not Billed’=’Billed’ then ‘Category for Static’  should be set to ‘Not Billed’.  Simialr to this we have 4 more pairs to be checked for true and in all these cases we have to update ‘Category for Static’  should be set to ‘Not Billed’.  
5. from main input file column ‘Business Partner Cost Center’ column remove ‘-‘ and any preceding zeroes. And then concatenate the resultant col values with ‘Billing Entity’ col values. Similar operation should be performed in last month billing file also (which will be uploaded by the user in the beginning of the program.  After concatenation do a lookup based onthese columns and find any new concatenated values which is not present in last month billing file. If any records are found then : update the Comment column (create Comment column if does not exist) with the comment : business partner Cost center and billing entity not found in last month file (or any better comment to state this situation). 
6. add a new column to main input file AS ‘Billing Entity name as per OSPD. For this based on ‘Business Partner Cost Center’ in main input file, fetch LE_Description col values from ospd basis CC_ID column of ospd file. 
7. now fetch all unique pairs of ‘Business Partner Cost Center’ and Billing Entity. Make sure that each ‘Business Partner Cost Center’ corresponds to only one Billing Entity value. If and ‘Business Partner Cost Center’ has more that one billing entity against it, then add comment ‘multiple billing entities found.’  If already there is a comment against this row, we can add a new column comment2 and then add comment2 to it. (as I need to keep all comments). Only multiple billing enities are legit where cost center starts with’9’.  Anyways we still need to comment, but this time add… biling cost center begins with digit 9.  Make sure ‘Business Partner Cost Center’  is of simple text type at the end. 
8. ‘Billing Contact PSID’ column in main input file should be 0f 8 digts, prefix 0 if needed (excluding entries where its IPAC. 
9. Each’ Billing Contact name should have exactly one billing contact PSID against it.  Add comments to those where more than one psid is found. Comment as “billing contact has duplicate names” . Here point to be noted is that there are chances that the values in the column Billing Contact name has email id mentioned in some of the rows and in some rows it’s the actual name, we shall write a code to identify  if a billing contact psid has more than one billin contact name values then try to identify if that email is of same person or not, as there are 90% chances that email will contain alteast one word from the name for example John Scena is the name, and inother row with same billing contact psid the name is Scena.J@abcs.com... Then we should be able to figure out that these 


 




In [None]:
import pandas as pd

# List of expected columns in the DataFrame (this should be the exact column names you expect)
expected_columns = ['psid', 'category', ' spoc', 'entity manager ps id', 'col2', 'col3 ']

def check_column_names(df):
    # Capture columns from the input DataFrame
    input_columns = df.columns.tolist()

    # Compare expected columns with input columns
    missing_columns = [col for col in expected_columns if col not in input_columns]
    extra_columns = [col for col in input_columns if col not in expected_columns]

    # Raise an error if there are missing or extra columns
    if missing_columns or extra_columns:
        error_message = ""
        
        if missing_columns:
            missing_columns_str = ", ".join(missing_columns)
            error_message += f"The following expected columns are missing or mismatched: {missing_columns_str}\n"
        
        if extra_columns:
            extra_columns_str = ", ".join(extra_columns)
            error_message += f"The following columns are extra and not expected: {extra_columns_str}\n"
        
        raise ValueError(error_message)

    print("All expected columns are present and matched.")

# Simulate loading an input file (replace with actual file reading logic)
input_data = {
    'PSID': [1, 2, 3],
    'Category': ['New', 'Old', 'New'],
    ' spoc': [None, 'SPOC1', 'SPOC2'],
    'Entity Manager PS ID': ['EM1', 'EM2', 'EM3'],
    'col2': ['Value1', 'Value2', 'Value3'],
    'col3 ': ['Data1', 'Data2', 'Data3']
}
df = pd.DataFrame(input_data)

# Check column names in the DataFrame
try:
    check_column_names(df)
except ValueError as e:
    print(e)


In [None]:
def write_logs_to_excel(workbook_path):
    wb = openpyxl.load_workbook(workbook_path)
    if 'Logs' in wb.sheetnames:
        del wb['Logs']
    ws = wb.create_sheet("Logs")

    # Sort logs
    logs.sort(key=lambda x: x[0])

    # Header
    ws.append(["Log Description", "Value"])
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")
    bold_font = Font(bold=True)

    for cell in ws[1]:
        cell.fill = header_fill
        cell.font = bold_font

    # Data rows
    for _, desc, val in logs:
        ws.append([desc, val])

    # Borders
    thin = Side(border_style="thin", color="000000")
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    for row in ws.iter_rows():
        for cell in row:
            cell.border = border

    # Set column width
    for col in range(1, 3):
        ws.column_dimensions[get_column_letter(col)].width = 25

    wb.save(workbook_path)


In [None]:
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Border, Side
from openpyxl.utils import get_column_letter

# Load the workbook and select the worksheet
wb = load_workbook("your_output_file.xlsx")
ws = wb["Sheet1"]  # change sheet name if needed

# Define styles
header_fill = PatternFill(start_color='ADD8E6', end_color='ADD8E6', fill_type='solid')  # Light Blue
bold_font = Font(bold=True)
thin_border = Border(
    left=Side(style='thin'), 
    right=Side(style='thin'), 
    top=Side(style='thin'), 
    bottom=Side(style='thin')
)

# Get max rows and columns
max_row = ws.max_row
max_col = ws.max_column

# Apply styles
for row in ws.iter_rows(min_row=1, max_row=max_row, max_col=max_col):
    for cell in row:
        # Borders for all cells
        cell.border = thin_border
        # Header styling
        if cell.row == 1:
            cell.fill = header_fill
            cell.font = bold_font

# Set column widths and apply autofilter
for col in range(1, max_col + 1):
    col_letter = get_column_letter(col)
    ws.column_dimensions[col_letter].width = 25

# Apply autofilter
ws.auto_filter.ref = ws.dimensions

# Save the updated workbook
wb.save("your_output_file.xlsx")


In [2]:
from openpyxl import load_workbook
from openpyxl.worksheet.datavalidation import DataValidation

# Load the workbook and select the sheet
file_path = "data.xlsx"a

wb = load_workbook(file_path)
ws = wb["Sheet3"]

# Define the dropdown options
dropdown_options = ['Option1', 'Option2', 'Option3', 'Option4', 'Option5', 'Option6']
dropdown_formula = '"' + ','.join(dropdown_options) + '"'

# Create data validation object
dv = DataValidation(type="list", formula1=dropdown_formula, showDropDown=False, showErrorMessage=True)
dv.error = "Please select a valid option from the list."
dv.errorTitle = "Invalid Selection"

# Column letter for "Reason for non billable" (D)
col_letter = "D"

# Apply dropdown while keeping existing values
for row in range(2, ws.max_row + 1):  # Assuming row 1 is the header
    cell = ws[f"{col_letter}{row}"]
    
    # If the cell is empty, set "Select"
    if cell.value is None or str(cell.value).strip() == "":
        cell.value = "Select"
    
    # Apply data validation to enforce selection from the list
    dv.add(cell)

# Add validation to the sheet
ws.add_data_validation(dv)

# Save the file
wb.save(file_path)
wb.close()


In [None]:
import pandas as pd
from itertools import product
from openpyxl import load_workbook

# Load the Excel file
file_path = "your_file.xlsx"  # Update with your actual file path

# Read the required columns from respective sheets
xls = pd.ExcelFile(file_path)
rtn_values = xls.parse("RTN")["RTN"].dropna().unique()
country_values = xls.parse("Location")["Country"].dropna().unique()
gcb_values = xls.parse("GCB")["GCB"].dropna().unique()

# Generate all possible combinations
combinations = list(product(rtn_values, country_values, gcb_values))

# Create a DataFrame
output_df = pd.DataFrame(combinations, columns=["RTN", "Country", "GCB"])

# Load the existing workbook
wb = load_workbook(file_path)

# Remove the existing "Output" sheet if it exists
if "Output" in wb.sheetnames:
    wb.remove(wb["Output"])

# Create a new sheet
ws = wb.create_sheet("Output")

# Write column headers
ws.append(["RTN", "Country", "GCB"])

# Write data rows
for row in output_df.itertuples(index=False, name=None):
    ws.append(row)

# Save the workbook
wb.save(file_path)

print("Output sheet created successfully.")


In [1]:
import pandas as pd
import math
import datetime
import tkinter as tk
from tkinter import messagebox

# Example source data with optional 'PID' column
data = {
    "FTE": [1, -1, 3, 0, None, 10],  # Note: Third row has invalid FTE for PID logic
    "Type": ["BC-TT", "Investments", "Saves", "Other", "Saves", "BC-TT"],
    "GM": [0, 0, 0, 0, 0, 1],
    "MD": [0, 0, 0, 0, 0, 0],
    "3": [0, 0, 0, 0, 0, 2],
    "4": [2.5, -4, 0, 0, 0, 0],
    "5": [0, 0, 0, 0, 0, 0],
    "6": [3, -6, 0, 0, 0, 3],
    "7": [1, -1, 1, 0, 0, 4],
    "8": [0, -1.3, 0, 0, 0, 0],
    "Entity": ["Entity1", "Entity2", "Entity3", "Entity4", "Entity5", "Entity6"],
    "Description": ["Example 1", "Example 2", "Example 3", "Example 4", "Example 5", "Example 6"],
    "Country": ["Country1", "Country2", "Country3", "Country4", "Country5", "Country6"],
    "RTN": ["RTN 123 - Example", "RTN 456 - Sample", "RTN 789 - Test", "RTN 101 - Case", "RTN 112 - Trial", "RTN 113 - Check"],
    "PID": [1001, 1002, 1003, None, None, 1004]  # PID is optional
}

source_df = pd.DataFrame(data)

# Columns to check for GCB values
gcb_columns = ["GM", "MD", "3", "4", "5", "6", "7", "8"]

# Clean and convert FTE column
source_df["FTE"] = pd.to_numeric(source_df["FTE"], errors="coerce")

# Check if 'PID' exists in source_df
if "PID" in source_df.columns:
    # Identify rows where PID exists but FTE is not 1 or -1
    invalid_pid_fte = source_df[(source_df["PID"].notnull()) & (~source_df["FTE"].isin([1, -1]))]

    if not invalid_pid_fte.empty:
        # Show a message box to notify the user about the issue
        root = tk.Tk()
        root.withdraw()  # Hide the main window
        messagebox.showerror(
            "FTE Mismatch",
            "There is a mismatch between PID and FTE values. \nFTE should be exactly 1 or -1 when PID is present."
        )
        root.destroy()
    else:
        # Only keep valid rows
        source_df = source_df[(source_df["PID"].isnull()) | (source_df["FTE"].isin([1, -1]))]

# Filter rows where FTE is valid and does not equal 0
filtered_df = source_df[
    source_df["FTE"].notnull() & (source_df["FTE"] != 0) & (source_df["Type"].isin(["BC-TT", "Investments", "Saves"]))
]

# Create the template dataframe with required columns
template_columns = [
    "Big Grid Stack", "AOP YEAR", "Business Framework RTN Code",
    "Business Framework", "Business Framework Group", "Country",
    "GCB", "Emp. Type", "Hiring Source", "FTE", "Start Month",
    "WPB", "GBM", "CMB", "Diver", "Entity", "Description", "Position ID"
] if "PID" in source_df.columns else [
    "Big Grid Stack", "AOP YEAR", "Business Framework RTN Code",
    "Business Framework", "Business Framework Group", "Country",
    "GCB", "Emp. Type", "Hiring Source", "FTE", "Start Month",
    "WPB", "GBM", "CMB", "Diver", "Entity", "Description"
]

template_df = pd.DataFrame(columns=template_columns)

# Process each row in the filtered dataframe
for _, row in filtered_df.iterrows():
    fte_value = row["FTE"]
    duplicate_fte = -1 if fte_value < 0 else 1  # Determine positive or negative FTE
    abs_fte = abs(fte_value)
    int_part = math.floor(abs_fte)  # Integer part of FTE
    fractional_part = abs_fte - int_part  # Fractional part of FTE

    # Calculate GCB distribution
    gcb_distribution = []
    for col in gcb_columns:
        value = pd.to_numeric(row[col], errors="coerce")  # Convert to numeric, handle non-numeric gracefully
        if not pd.isna(value) and value != 0:
            gcb_distribution.extend([col] * abs(math.ceil(value)))

    # Ensure GCB distribution matches the FTE rows
    total_required_rows = int_part + (1 if fractional_part > 0 else 0)
    if len(gcb_distribution) < total_required_rows:
        gcb_distribution.extend(["na"] * (total_required_rows - len(gcb_distribution)))

    # Dynamically calculate Start Month
    current_year = 2025
    if fte_value > 0:
        start_month = datetime.date(current_year, 1, 31).strftime("%m/%d/%Y")  # Example logic for positive FTE
    elif fte_value < 0:
        start_month = datetime.date(current_year, 2, 28).strftime("%m/%d/%Y")  # Example logic for negative FTE
    else:
        start_month = datetime.date(current_year, 12, 31).strftime("%m/%d/%Y")  # Default fallback

    # Create rows in the template
    for i in range(total_required_rows):
        current_fte = (
            fractional_part if i == total_required_rows - 1 and fractional_part > 0 else 1
        ) * duplicate_fte

        big_grid_stack = (
            "New Perm Position (within FRP) - Staff Drawdown" if current_fte > 0
            else "Forecast program/Other Saves-Saves Forecast Tracker"
        )
        if row["Type"] == "BC-TT":
            big_grid_stack = "Inter Boundary Changes"

        new_row = {
            "Big Grid Stack": big_grid_stack,
            "AOP YEAR": "FY25",
            "Business Framework RTN Code": row["RTN"].split()[0],
            "Business Framework": "",
            "Business Framework Group": "",
            "Country": row["Country"],
            "GCB": gcb_distribution[i],
            "Emp. Type": "FTE",
            "Hiring Source": "External",
            "FTE": current_fte,
            "Start Month": start_month,
            "WPB": "0%",
            "GBM": "100.00%",
            "CMB": "0%",
            "Diver": "",
            "Entity": row["Entity"],
            "Description": row["Description"],
        }

        # Add 'Position ID' if 'PID' exists
        if "PID" in source_df.columns:
            new_row["Position ID"] = row["PID"] if pd.notna(row["PID"]) else ""

        template_df = pd.concat([template_df, pd.DataFrame([new_row])], ignore_index=True)

# Output the result
print(template_df)


                                       Big Grid Stack AOP YEAR  \
0                              Inter Boundary Changes     FY25   
1   Forecast program/Other Saves-Saves Forecast Tr...     FY25   
2     New Perm Position (within FRP) - Staff Drawdown     FY25   
3     New Perm Position (within FRP) - Staff Drawdown     FY25   
4     New Perm Position (within FRP) - Staff Drawdown     FY25   
5                              Inter Boundary Changes     FY25   
6                              Inter Boundary Changes     FY25   
7                              Inter Boundary Changes     FY25   
8                              Inter Boundary Changes     FY25   
9                              Inter Boundary Changes     FY25   
10                             Inter Boundary Changes     FY25   
11                             Inter Boundary Changes     FY25   
12                             Inter Boundary Changes     FY25   
13                             Inter Boundary Changes     FY25   
14        

In [None]:
import pandas as pd

# Sample DataFrames
base_df = pd.DataFrame({
    'Position ID': [101, 102, 103, 104, 101],
    'Category': ['Existing', 'New', 'Existing', 'Existing', 'Existing'],
    'Col_A': [10, 20, 30, 40, 50], 
    'Col_B': ['A', 'B', 'C', 'D', 'E']
})

gha_df = pd.DataFrame({
    'Position ID': [101, 103, 104],
    'New_Col_A': [100, 300, 400], 
    'New_Col_B': ['X', 'Y', 'Z']
})

# Define columns to update (Map GHA column names to Base column names)
update_cols = {'New_Col_A': 'Col_A', 'New_Col_B': 'Col_B'}
gha_df = gha_df.rename(columns=update_cols)

# Filter only 'Existing' records
existing_mask = base_df['Category'] == 'Existing'
existing_df = base_df[existing_mask]

# Merge with GHA data on 'Position ID'
updated_data = existing_df.merge(gha_df, on='Position ID', how='left', suffixes=('', '_new'))

# Ensure duplicates have the same updated value
for old_col in update_cols.values():
    new_col = old_col + '_new'
    updated_data[old_col] = updated_data.groupby('Position ID')[new_col].transform('first')

# Drop extra columns
updated_data = updated_data.drop(columns=[col + '_new' for col in update_cols.values()])

# Replace updated rows in base_df
base_df.loc[existing_mask, update_cols.values()] = updated_data[update_cols.values()]

print(base_df)


In [None]:
import os
import time
import pandas as pd
import win32com.client as win32

# File Paths
input_file = r"C:\path\to\Reportnames.xlsx"  # Change this path
template_file = r"C:\path\to\Template.xlsx"  # Change this path
output_folder = r"C:\path\to\Output"  # Change this path

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Read Reportnames.xlsx using pandas
df = pd.read_excel(input_file, usecols=["RTN CODE CHILD", "Report Name"])

# Launch Excel
excel = win32.Dispatch("Excel.Application")
excel.Visible = False  # Set True for debugging
excel.DisplayAlerts = False  # Prevent popups

try:
    for index, row in df.iterrows():
        rtn_code = row["RTN CODE CHILD"]
        report_name = row["Report Name"]
        output_file = os.path.join(output_folder, f"{report_name}.xlsx")

        if os.path.exists(output_file):  # Skip if file already exists
            print(f"Skipping {report_name}.xlsx (already exists)")
            continue

        print(f"Processing: {report_name}")

        # Open the template file
        wb = excel.Workbooks.Open(template_file)
        ws = wb.Sheets("Setup")

        # Insert values in Setup sheet
        ws.Range("D3").Value = rtn_code
        ws.Range("D4").Value = report_name

        # Refresh workbook (with timeout handling)
        start_time = time.time()
        excel.CalculateFull()  # Refresh all data
        while time.time() - start_time < 30:  # Timeout of 30 sec
            time.sleep(1)  

        # Remove formulas by copying and pasting as values
        for sheet in wb.Sheets:
            sheet.Cells.Copy()
            sheet.Cells.PasteSpecial(Paste=win32.constants.xlPasteValues)

        excel.CutCopyMode = False  # Clear clipboard to prevent errors

        # Save final file
        wb.SaveAs(output_file)
        wb.Close(SaveChanges=False)

        print(f"✅ Saved: {output_file}")

except Exception as e:
    print(f"Error: {e}")

finally:
    excel.Quit()
    print("Processing complete!")


In [None]:
import pandas as pd
import os

# Define input/output paths
input_folder = "Input"
output_file = "JML_Report.xlsx"
input_file = os.path.join(input_folder, "your_excel_file.xlsx")  # Update your filename

# Load Excel file
df = pd.read_excel(input_file, sheet_name="Headcount Employee Detail", dtype=str)

# Convert required columns to string for lookups
df["Employee ID"] = df["Employee ID"].astype(str)
df["Functional Manager Employee ID"] = df["Functional Manager Employee ID"].astype(str)
df["BF Level 2 Name"] = df["BF Level 2 Name"].astype(str)

# Create a lookup dictionary for quick access
lookup_dict = df.set_index("Employee ID")[["Functional Manager Employee ID", "BF Level 2 Name"]].to_dict("index")

# Step 1: Filter for 'Central Managed Services' employees
df_cms = df[df["BF Level 2 Name"] == "Central Managed Services"].copy()

# Initialize columns for tracking hierarchy
for i in range(1, 6):  # Up to 5 levels
    df_cms[f"Manager{i}"] = ""
    df_cms[f"Check{i}"] = ""

# Function to recursively fetch Functional Manager and BF Level
def track_manager_hierarchy(emp_id, max_level=5):
    managers = []
    bf_levels = []
    
    for i in range(1, max_level + 1):
        manager_id = lookup_dict.get(emp_id, {}).get("Functional Manager Employee ID")
        bf_level = lookup_dict.get(emp_id, {}).get("BF Level 2 Name")

        if not manager_id or pd.isna(manager_id):
            break  # Stop if no manager exists
        
        managers.append(manager_id)
        bf_levels.append(bf_level)

        # Move to the next level (next manager)
        emp_id = manager_id  

    return managers, bf_levels

# Step 2: Iterate over each employee and fetch hierarchy details
for index, row in df_cms.iterrows():
    emp_id = row["Employee ID"]
    managers, bf_levels = track_manager_hierarchy(emp_id, max_level=5)

    for i, (manager, bf_level) in enumerate(zip(managers, bf_levels), start=1):
        df_cms.at[index, f"Manager{i}"] = manager
        df_cms.at[index, f"Check{i}"] = bf_level

# Step 3: Identify rows where any "Check" column has 'Finance'
check_cols = [f"Check{i}" for i in range(1, 6)]
df_cms["Updated BF Level2"] = df_cms[check_cols].apply(lambda x: "CMS Finance" if "Finance" in x.values else "", axis=1)

# Step 4: Save full df_cms (with all original + Check/Manager columns) in "CMS_Details"
with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
    df_cms.to_excel(writer, sheet_name="CMS_Details", index=False)  # Save full CMS details for reference

    # Step 5: Filter rows where 'Updated BF Level2' is 'CMS Finance'
    cms_df = df_cms[df_cms["Updated BF Level2"] == "CMS Finance"].copy()

    # Step 6: Read original sheet again for Finance rows
    df_finance = df[df["BF Level 2 Name"] == "Finance"].copy()
    df_finance["Updated BF Level2"] = "Finance"

    # Step 7: Remove extra columns before appending to `df_finance`
    cms_df_cleaned = cms_df[df_finance.columns]

    # Step 8: Reset index types for proper concatenation
    cms_df_cleaned = cms_df_cleaned.reset_index(drop=True)
    df_finance = df_finance.reset_index(drop=True)

    # Step 9: Concatenate cleaned `cms_df` with `df_finance`
    final_df = pd.concat([df_finance, cms_df_cleaned], ignore_index=True)

    # Step 10: Save the final `JML Report`
    final_df.to_excel(writer, sheet_name="JML Report", index=False)

print("✅ Process completed successfully! The final file is saved as:", output_file)


In [None]:
import pandas as pd

# Load Files
output_file = pd.read_excel('Output.xlsx', sheet_name='Sheet1')
input_file = pd.read_excel('Input.xlsx', sheet_name='Sheet1')
mapping_file = pd.ExcelFile('Mapping.xlsx')
ospd_file = pd.read_excel('OSPD.xlsb', sheet_name='CC-List')

# Load Mapping Sheets
gr_mapping = mapping_file.parse('GR')
anaplan_mapping = mapping_file.parse('Anaplan')
country_mapping = mapping_file.parse('Country Mapping')

# Step 2: Add 'Classification' to Output File
output_file = output_file.merge(gr_mapping[['Role Type', 'Classification']], on='Role Type', how='left')

# Step 3: Add 'Classification' to Input File
input_file = input_file.merge(anaplan_mapping[['Position class', 'Classification']], left_on='Role Type', right_on='Position class', how='left')

# Step 4: Add 'Country' to Input File based on 'Country R3'
input_file = input_file.merge(country_mapping[['Country R3', 'Country']], on='Country R3', how='left')

# Step 5: Add 'RTN Code' from OSPD file
input_file = input_file.merge(ospd_file[['L4_BF_Description', 'L4_BF_ID']], left_on='Business Framework', right_on='L4_BF_Description', how='left')
input_file.rename(columns={'L4_BF_ID': 'RTN Code'}, inplace=True)

# Step 6: Create Unique Key for Input File
input_file['Unique Key'] = input_file['Classification'] + ' ' + input_file['RTN Code'] + ' ' + input_file['Country']

# Step 7: Create Unique Key for Output File
output_file['Unique Key'] = output_file['Classification'] + ' ' + output_file['RTN Code L4'] + ' ' + output_file['Country']

# Step 8: Create Secondary Keys
input_file['Secondary Key'] = input_file['RTN Code'] + ' ' + input_file['Country']
output_file['Secondary Key'] = output_file['RTN Code L4'] + ' ' + output_file['Country']

# Step 9: Lookup Data from Input to Output File
lookup_cols = ['Business Framework', 'Country', 'Classification']
out_cols = ['A_L4', 'A_Country', 'A_Role Type']
output_file[out_cols] = output_file[['Unique Key']].merge(input_file[['Unique Key'] + lookup_cols], on='Unique Key', how='left')[lookup_cols]

# Step 10: Aggregate Cost & FTE based on Unique and Secondary Keys
agg_data = input_file.groupby('Unique Key').agg({'Cost': 'sum', 'FTE': 'sum'}).reset_index()
output_file = output_file.merge(agg_data, on='Unique Key', how='left').rename(columns={'Cost': 'A_DC', 'FTE': 'A_FTE'})

# Aggregate for Secondary Key
agg_data_sec = input_file.groupby('Secondary Key').agg({'Cost': 'sum', 'FTE': 'sum'}).reset_index()
output_file = output_file.merge(agg_data_sec, on='Secondary Key', how='left').rename(columns={'Cost': 'Total_DC', 'FTE': 'Total_FTE'})

# Save the final processed Output File
output_file.to_excel('Processed_Output.xlsx', index=False)

print("Processing completed. Output saved as 'Processed_Output.xlsx'")


In [None]:
import pandas as pd
import os

# Define input and output file paths
input_folder = "Input"
input_file = os.path.join(input_folder, "your_excel_file.xlsx")  # Replace with actual filename
output_file = "JML_Report.xlsx"

# Read the Excel file
df = pd.read_excel(input_file, sheet_name="Headcount Employee Detail", dtype=str)  # Read everything as text

# Filter only "Central Managed Services"
df_cms = df[df["BF Level 2 Name"] == "Central Managed Services"].copy()

# Initialize columns for tracking manager levels
manager_cols = ["Check", "Manager", "Check2", "Manager2", "Check3", "Manager3", "Check4", "Manager4", "Check5", "Manager5", "Check6"]

# Add empty columns
for col in manager_cols:
    df_cms[col] = ""

# Create a dictionary for fast lookups (Employee ID → [Manager ID, BF Level])
lookup_dict = df.set_index("Employee ID")[["Functional Manager Employee ID", "BF Level 2 Name"]].to_dict(orient="index")

# Function to track hierarchy up to 5 levels
def track_manager_hierarchy(emp_id):
    manager_data = {"Check": "", "Manager": "", "Check2": "", "Manager2": "", "Check3": "", "Manager3": "", "Check4": "", "Manager4": "", "Check5": "", "Manager5": "", "Check6": ""}
    current_emp = emp_id

    for i in range(1, 7):  # Loop for levels 1 to 6
        if current_emp in lookup_dict:
            manager_id = lookup_dict[current_emp]["Functional Manager Employee ID"]
            bf_level = lookup_dict[current_emp]["BF Level 2 Name"]

            # Save in corresponding columns
            if i == 1:
                manager_data["Check"] = bf_level
                manager_data["Manager"] = manager_id
            else:
                manager_data[f"Check{i}"] = bf_level
                manager_data[f"Manager{i}"] = manager_id
            
            # Move up the hierarchy
            current_emp = manager_id
        else:
            break  # Stop if no manager found

    return pd.Series(manager_data)

# Apply the function to populate manager levels
df_cms[manager_cols] = df_cms["Employee ID"].apply(track_manager_hierarchy)

# Identify rows where any "Check" column has 'Finance'
df_cms["Updated BF Level2"] = df_cms[["Check", "Check2", "Check3", "Check4", "Check5", "Check6"]].apply(lambda x: "CMS Finance" if "Finance" in x.values else "", axis=1)

# Keep only relevant rows
cms_df = df_cms[df_cms["Updated BF Level2"] == "CMS Finance"]

# Read Finance rows from original file
df_finance = df[df["BF Level 2 Name"] == "Finance"].copy()
df_finance["Updated BF Level2"] = "Finance"

# Append cms_df rows to finance rows
final_df = pd.concat([df_finance, cms_df], ignore_index=True)

# Save to a new Excel file with sheet name "JML Report"
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    final_df.to_excel(writer, sheet_name="JML Report", index=False)

print("✅ JML Report generated successfully!")


In [None]:
import pandas as pd
from openpyxl.styles import PatternFill
from openpyxl import load_workbook
from datetime import datetime

# Define the yellow fill for highlighting
yellow_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")

# Load your Excel file
file_path = "your_file.xlsx"
df = pd.read_excel(file_path)
wb = load_workbook(file_path)
ws = wb.active

# Ensure all expected columns are present
required_columns = ['Original Date', 'Date', 'Type', 'Description']  # Add more columns as needed
for col in required_columns:
    if col not in df.columns:
        df[col] = ""  # Add missing columns with blank values

# Function to safely convert date values
def safe_convert_date(date_value):
    if pd.isnull(date_value):  # Handle missing values
        return None
    if isinstance(date_value, datetime):  # Already a datetime object
        return date_value
    if isinstance(date_value, str):  # Try parsing strings
        for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%m/%d/%Y"):  # Common date formats
            try:
                return datetime.strptime(date_value, fmt)
            except ValueError:
                continue
    return None  # Return None if conversion fails

# Ensure Date and Original Date are in proper datetime format
if 'Date' in df.columns:
    df['Date'] = df['Date'].apply(safe_convert_date)
if 'Original Date' in df.columns:
    df['Original Date'] = df['Original Date'].apply(safe_convert_date)

# Function to process Date changes
def process_dates(row, current_date):
    original_date = row.get('Original Date')
    date = row.get('Date')
    
    # Check if dates are valid
    if pd.isna(date) or (original_date and (original_date.year < current_date.year or original_date.month < current_date.month)):
        return current_date.replace(day=1) - pd.Timedelta(days=1)  # Last day of current month
    return date

# Get today's date
today = pd.Timestamp.now()
current_month_last_date = today.replace(day=1) + pd.offsets.MonthEnd(0)

# Apply logic to update Date column and highlight changes
for index, row in df.iterrows():
    updated_date = process_dates(row, current_month_last_date)
    if updated_date != row['Date']:  # Track changes
        df.at[index, 'Date'] = updated_date
        ws.cell(row=index + 2, column=df.columns.get_loc('Date') + 1).value = updated_date  # Update cell value
        ws.cell(row=index + 2, column=df.columns.get_loc('Date') + 1).fill = yellow_fill  # Highlight updated cell

# Handle 'Type' and 'Description' specific logic only if the columns exist
if 'Type' in df.columns:
    # Example logic for 'Type'
    df['Type'] = df['Type'].apply(lambda x: x.upper() if pd.notna(x) else "")
if 'Description' in df.columns:
    # Example logic for 'Description'
    df['Description'] = df['Description'].fillna("No description available")

# Save the updated Excel file
wb.save("updated_file.xlsx")


In [None]:
import pandas as pd
import math
import datetime

# Example source data
data = {
    "FTE": [6.5, -12.3, 3, 0, None, 10],
    "Type": ["BC-TT", "Investments", "Saves", "Other", "Saves", "BC-TT"],
    "GM": [0, 0, 0, 0, 0, 1],
    "MD": [0, 0, 0, 0, 0, 0],
    "3": [0, 0, 0, 0, 0, 2],
    "4": [2.5, -4, 0, 0, 0, 0],
    "5": [0, 0, 0, 0, 0, 0],
    "6": [3, -6, 0, 0, 0, 3],
    "7": [1, -1, 1, 0, 0, 4],
    "8": [0, -1.3, 0, 0, 0, 0],
    "Entity": ["Entity1", "Entity2", "Entity3", "Entity4", "Entity5", "Entity6"],
    "Description": ["Example 1", "Example 2", "Example 3", "Example 4", "Example 5", "Example 6"],
    "Country": ["Country1", "Country2", "Country3", "Country4", "Country5", "Country6"],
    "RTN": ["RTN 123 - Example", "RTN 456 - Sample", "RTN 789 - Test", "RTN 101 - Case", "RTN 112 - Trial", "RTN 113 - Check"]
}

source_df = pd.DataFrame(data)

# Columns to check for GCB values
gcb_columns = ["GM", "MD", "3", "4", "5", "6", "7", "8"]

# Clean and convert FTE column
source_df["FTE"] = pd.to_numeric(source_df["FTE"], errors="coerce")

# Filter rows where FTE is valid and does not equal 0
filtered_df = source_df[
    source_df["FTE"].notnull() & (source_df["FTE"] != 0) & (source_df.get("Type", "").isin(["BC-TT", "Investments", "Saves"]))
]

# Create the template dataframe with required columns
template_columns = [
    "Big Grid Stack", "AOP YEAR", "Business Framework RTN Code",
    "Business Framework", "Business Framework Group", "Country",
    "GCB", "Emp. Type", "Hiring Source", "FTE", "Start Month",
    "WPB", "GBM", "CMB", "Diver", "Entity", "Description"
]
template_df = pd.DataFrame(columns=template_columns)

# Process each row in the filtered dataframe
for _, row in filtered_df.iterrows():
    fte_value = row["FTE"]
    duplicate_fte = -1 if fte_value < 0 else 1  # Determine positive or negative FTE
    abs_fte = abs(fte_value)
    int_part = math.floor(abs_fte)  # Integer part of FTE
    fractional_part = abs_fte - int_part  # Fractional part of FTE

    # Calculate GCB distribution
    gcb_distribution = []
    for col in gcb_columns:
        value = pd.to_numeric(row.get(col), errors="coerce")  # Convert to numeric, handle non-numeric gracefully
        if not pd.isna(value) and value != 0:
            gcb_distribution.extend([col] * abs(math.ceil(value)))

    # Ensure GCB distribution matches the FTE rows
    total_required_rows = int_part + (1 if fractional_part > 0 else 0)
    if len(gcb_distribution) < total_required_rows:
        gcb_distribution.extend(["na"] * (total_required_rows - len(gcb_distribution)))

    # Dynamically calculate Start Month
    current_year = 2025
    if fte_value > 0:
        start_month = datetime.date(current_year, 1, 31)  # Example logic for positive FTE
    elif fte_value < 0:
        start_month = datetime.date(current_year, 2, 28)  # Example logic for negative FTE
    else:
        start_month = datetime.date(current_year, 12, 31)  # Default fallback

    # Create rows in the template
    for i in range(total_required_rows):
        current_fte = (
            fractional_part if i == total_required_rows - 1 and fractional_part > 0 else 1
        ) * duplicate_fte

        big_grid_stack = (
            "New Perm Position (within FRP) - Staff Drawdown" if current_fte > 0
            else "Forecast program/Other Saves-Saves Forecast Tracker"
        )
        if row.get("Type") == "BC-TT":
            big_grid_stack = "Inter Boundary Changes"

        new_row = {
            "Big Grid Stack": big_grid_stack,
            "AOP YEAR": "FY25",
            "Business Framework RTN Code": row["RTN"].split()[0],
            "Business Framework": "",
            "Business Framework Group": "",
            "Country": row.get("Country", ""),
            "GCB": gcb_distribution[i],
            "Emp. Type": "FTE",
            "Hiring Source": "External",
            "FTE": current_fte,
            "Start Month": start_month.strftime("%m/%d/%Y"),
            "WPB": "0%",
            "GBM": "100.00%",
            "CMB": "0%",
            "Diver": "",
            "Entity": row.get("Entity", ""),
            "Description": row.get("Description", "No description available"),
        }
        template_df = pd.concat([template_df, pd.DataFrame([new_row])], ignore_index=True)

# Output the result
print(template_df)


In [None]:
import pandas as pd
import datetime
import calendar
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

def adjust_and_highlight_dates(input_file, output_file):
    # Read the Excel file into a dataframe
    df = pd.read_excel(input_file)

    # Get today's date, current year, and current month
    today = datetime.datetime.today()
    current_year = today.year
    current_month = today.month

    # Convert the Date column to datetime format
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')

    # Preserve the original dates for comparison
    df['Original Date'] = df['Date']

    # Iterate through the dataframe and update dates if necessary
    for index, row in df.iterrows():
        original_date = row['Original Date']
        if pd.notna(original_date):  # Ensure the date is valid
            # Check if the year is not current or the month is before the current month
            if original_date.year != current_year or original_date.month < current_month:
                # Get the last day of the current month
                last_day_of_month = datetime.date(current_year, current_month, calendar.monthrange(current_year, current_month)[1])
                df.at[index, 'Date'] = last_day_of_month  # Update the date

    # Save the updated dataframe to the Excel file
    df.to_excel(output_file, index=False)

    # Open the workbook to apply formatting
    wb = load_workbook(output_file)
    ws = wb.active

    # Apply yellow highlight to cells in the "Date" column where changes were made
    yellow_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
    for index, row in df.iterrows():
        if row['Original Date'] != row['Date']:  # Compare original and updated date
            excel_row = index + 2  # Offset for Excel's 1-based index and header row
            ws.cell(row=excel_row, column=2).fill = yellow_fill  # Column 2 corresponds to the 'Date' column

    # Save the workbook with highlighted changes
    wb.save(output_file)
    print("Dates adjusted and changes highlighted where necessary.")

# Example usage
input_file = "input_dates.xlsx"  # Replace with your input file
output_file = "output_dates.xlsx"  # Replace with your output file
adjust_and_highlight_dates(input_file, output_file)


In [None]:
import openpyxl
from openpyxl.styles import Alignment, Font, PatternFill
from openpyxl.utils import get_column_letter

def format_excel_with_headers(file_path):
    # Load the workbook
    workbook = openpyxl.load_workbook(file_path)
    
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        
        # Insert the sheet name as a heading in A1
        sheet.insert_rows(1)
        sheet['A1'] = sheet_name
        sheet.merge_cells(start_row=1, start_column=1, end_row=1, end_column=sheet.max_column)
        sheet['A1'].font = Font(size=14, bold=True)
        sheet['A1'].alignment = Alignment(horizontal="center", vertical="center")
        sheet['A1'].fill = PatternFill(start_color="B0C4DE", end_color="B0C4DE", fill_type="solid")  # Light Blue
        
        # Apply formatting to the header row (now at row 2)
        header_row = 2
        for col in range(1, sheet.max_column + 1):
            cell = sheet.cell(row=header_row, column=col)
            cell.font = Font(bold=True)
            cell.fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # Light Blue
            cell.alignment = Alignment(horizontal="center", vertical="center")
        
        # Set column widths
        for col in range(1, sheet.max_column + 1):
            max_length = 0
            column = get_column_letter(col)
            for row in range(1, sheet.max_row + 1):
                cell = sheet[f"{column}{row}"]
                try:
                    if cell.value:
                        max_length = max(max_length, len(str(cell.value)))
                except Exception:
                    pass
            adjusted_width = max_length + 2
            sheet.column_dimensions[column].width = adjusted_width

    # Save the updated workbook
    workbook.save(file_path)
    print(f"Workbook formatted and saved at: {file_path}")

# Example Usage
file_path = "organization_structure.xlsx"
format_excel_with_headers(file_path)


In [None]:
from pptx import Presentation
from pptx.util import Inches, Pt
import openpyxl

def add_table_slide(presentation, sheet_name, sheet):
    slide = presentation.slides.add_slide(presentation.slide_layouts[5])  # Blank slide
    title = slide.shapes.title
    title.text = f"Data for {sheet_name}"

    # Get the data
    data = [[cell.value for cell in row] for row in sheet.iter_rows(values_only=True)]
    rows, cols = len(data), len(data[0])

    # Add table
    left = Inches(1.0)
    top = Inches(1.5)
    width = Inches(8.0)
    height = Inches(5.0)
    table = slide.shapes.add_table(rows, cols, left, top, width, height).table

    # Fill the table with data
    for i, row in enumerate(data):
        for j, value in enumerate(row):
            cell = table.cell(i, j)
            cell.text = str(value) if value else ""
            cell.text_frame.paragraphs[0].font.size = Pt(10)

    # Format header row
    for cell in table.rows[0].cells:
        cell.text_frame.paragraphs[0].font.bold = True
        cell.fill.solid()
        cell.fill.fore_color.rgb = RGBColor(173, 216, 230)  # Light Blue

def generate_table_ppt(excel_file, ppt_file):
    # Load the Excel workbook
    workbook = openpyxl.load_workbook(excel_file)
    presentation = Presentation()

    # Iterate through sheets and process data
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        add_table_slide(presentation, sheet_name, sheet)

    # Save the PowerPoint
    presentation.save(ppt_file)
    print(f"PPT saved as {ppt_file}")

# Example Usage
input_excel = "formatted_organization_structure.xlsx"
output_ppt = "organization_structure_table.pptx"
generate_table_ppt(input_excel, output_ppt)


In [None]:
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE
from pptx.util import Inches, Pt
import openpyxl

def add_org_chart_slide(presentation, sheet_name, hierarchy):
    slide = presentation.slides.add_slide(presentation.slide_layouts[5])  # Blank slide
    title = slide.shapes.title
    title.text = f"Organizational Chart: {sheet_name}"

    # Add shapes for hierarchy
    top_margin = 1.5  # Inches
    left_margin = 1.0  # Inches
    shape_width = 2.0  # Inches
    shape_height = 1.0  # Inches
    level_gap = 1.5  # Vertical gap between levels
    horizontal_gap = 2.5  # Horizontal gap between shapes

    # Recursive function to draw hierarchy
    def draw_hierarchy(manager, x, y, slide):
        shape = slide.shapes.add_shape(
            MSO_SHAPE.RECTANGLE,
            Inches(x), Inches(y),
            Inches(shape_width), Inches(shape_height)
        )
        shape.text = manager
        shape.text_frame.paragraphs[0].font.size = Pt(12)
        return shape

    y = top_margin
    for manager1, sub_hierarchy in hierarchy.items():
        shape1 = draw_hierarchy(manager1, left_margin, y, slide)
        x = left_margin
        for manager2, employees in sub_hierarchy.items():
            y += level_gap
            shape2 = draw_hierarchy(manager2, x + horizontal_gap, y, slide)
            for i, employee in enumerate(employees):
                draw_hierarchy(employee, x + horizontal_gap * (i + 2), y + level_gap, slide)

def generate_org_chart_ppt(excel_file, ppt_file):
    # Load the Excel workbook
    workbook = openpyxl.load_workbook(excel_file)
    presentation = Presentation()

    # Iterate through sheets and process data
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        hierarchy = extract_hierarchy(sheet)
        add_org_chart_slide(presentation, sheet_name, hierarchy)

    # Save the PowerPoint
    presentation.save(ppt_file)
    print(f"PPT saved as {ppt_file}")

# Example Usage
input_excel = "formatted_organization_structure.xlsx"
output_ppt = "organization_structure_chart.pptx"
generate_org_chart_ppt(input_excel, output_ppt)


In [None]:
import openpyxl
from pptx import Presentation
from pptx.util import Pt
from pptx.dml.color import RGBColor

def extract_hierarchy(sheet):
    hierarchy = {}
    for row in sheet.iter_rows(min_row=3, values_only=True):  # Start reading data from row 3
        level1, level2, level3 = row[0:3], row[3:6], row[6:9]
        manager1 = " | ".join([str(x) for x in level1 if x])
        manager2 = " | ".join([str(x) for x in level2 if x])
        employee = " | ".join([str(x) for x in level3 if x])

        if manager1 not in hierarchy:
            hierarchy[manager1] = {}
        if manager2 not in hierarchy[manager1]:
            hierarchy[manager1][manager2] = []
        hierarchy[manager1][manager2].append(employee)

    return hierarchy

def add_slide_with_chart(presentation, sheet_name, hierarchy):
    slide = presentation.slides.add_slide(presentation.slide_layouts[5])  # Blank slide
    # Add title
    title = slide.shapes.title
    title.text = sheet_name
    title.text_frame.paragraphs[0].font.bold = True
    title.text_frame.paragraphs[0].font.size = Pt(24)

    # Add text box for the org chart
    left = Pt(50)
    top = Pt(100)
    width = Pt(800)
    height = Pt(500)
    text_box = slide.shapes.add_textbox(left, top, width, height)
    text_frame = text_box.text_frame
    text_frame.word_wrap = True
    text_frame.margin_left = Pt(10)
    text_frame.margin_top = Pt(10)

    # Build the org chart text
    text_frame.text = f"Organizational Chart for {sheet_name}\n"
    for manager1, sub_hierarchy in hierarchy.items():
        p = text_frame.add_paragraph()
        p.text = manager1
        p.font.bold = True
        p.font.size = Pt(16)
        p.font.color.rgb = RGBColor(0, 51, 102)  # Dark blue

        for manager2, employees in sub_hierarchy.items():
            p2 = text_frame.add_paragraph()
            p2.text = f"    {manager2}"
            p2.font.bold = False
            p2.font.size = Pt(14)
            p2.font.color.rgb = RGBColor(0, 102, 0)  # Green

            for employee in employees:
                p3 = text_frame.add_paragraph()
                p3.text = f"        {employee}"
                p3.font.size = Pt(12)
                p3.font.color.rgb = RGBColor(0, 0, 0)  # Black

def generate_organizational_ppt(excel_file, ppt_file):
    # Load the Excel workbook
    workbook = openpyxl.load_workbook(excel_file)
    presentation = Presentation()

    # Iterate through sheets and process data
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        hierarchy = extract_hierarchy(sheet)
        add_slide_with_chart(presentation, sheet_name, hierarchy)

    # Save the PowerPoint
    presentation.save(ppt_file)
    print(f"PPT saved as {ppt_file}")

# Input Excel and output PPT file paths
input_excel = "formatted_organization_structure.xlsx"
output_ppt = "organization_structure_presentation.pptx"

# Generate the presentation
generate_organizational_ppt(input_excel, output_ppt)


In [None]:
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter

def format_sheet(sheet):
    # Set the sheet name as the title in cell A1
    sheet_title = sheet.title
    sheet.merge_cells('A1:I1')  # Adjust range based on your data width
    sheet['A1'] = sheet_title
    sheet['A1'].font = Font(bold=True, size=14)
    sheet['A1'].alignment = Alignment(horizontal='center')

    # Style the header row
    header_fill = PatternFill(start_color="ADD8E6", end_color="ADD8E6", fill_type="solid")  # Light blue background
    header_font = Font(bold=True)
    header_alignment = Alignment(horizontal='center', vertical='center')

    # Assuming row 2 is the header row
    for cell in sheet[2]:  # Header row is the second row
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = header_alignment

    # Set fixed column width
    fixed_width = 20  # Set desired width
    for col in sheet.columns:
        col_letter = get_column_letter(col[0].column)  # Get column letter
        sheet.column_dimensions[col_letter].width = fixed_width

def format_workbook(file_path, output_path):
    # Load the workbook
    wb = load_workbook(file_path)

    # Format each sheet
    for sheet in wb.worksheets:
        format_sheet(sheet)

    # Save the formatted workbook
    wb.save(output_path)
    print(f"Formatted file saved as: {output_path}")

# Input and output file paths
input_file = "organization_structure.xlsx"
output_file = "formatted_organization_structure.xlsx"

# Call the function to format the workbook
format_workbook(input_file, output_file)


In [None]:
import pandas as pd
from openpyxl import Workbook

# Load the input Excel file
input_file = "your_input_file.xlsx"  # Replace with your input file path
output_file = "organization_structure.xlsx"  # Output file path

# Read the data
df = pd.read_excel(input_file)

# Ensure consistent column names (case-insensitive matching)
df.columns = df.columns.str.strip()

# Unique countries
countries = df['Work Location Country/Territory Name'].dropna().unique()

# Create a new workbook
wb = Workbook()
wb.remove(wb.active)  # Remove the default sheet

# Function to find subordinates
def find_subordinates(df, manager_name):
    return df[df['Entity Manager Employee Name'] == manager_name]

# Process each country
for country in countries:
    # Filter data for the country
    country_df = df[df['Work Location Country/Territory Name'] == country]
    # Get unique BF Level 4 Names for the country
    bf_level_4_names = country_df['BF Level 4 Name'].dropna().unique()

    # Create a sheet for this country
    for bf_level_4_name in bf_level_4_names:
        # Filter data for the BF Level 4 Name
        bf_df = country_df[country_df['BF Level 4 Name'] == bf_level_4_name]

        # Sort data by Global Career Band (MD > 3 > 4 > ... > 8)
        bf_df['Global Career Band'] = bf_df['Global Career Band'].astype(str)  # Ensure values are strings
        sorted_bf_df = bf_df.sort_values('Global Career Band', key=lambda col: col.map(lambda x: 'MD' if x == 'MD' else int(x) if x.isdigit() else 999))

        # Prepare data for three levels
        rows = []
        for _, level_1 in sorted_bf_df.iterrows():
            level_1_details = [level_1['Employee Name'], level_1['Global Career Band'], level_1['Position Title']]
            level_2_df = find_subordinates(df, level_1['Employee Name'])

            if not level_2_df.empty:
                first_level_1 = True  # To track if Level 1 details have been added
                for _, level_2 in level_2_df.iterrows():
                    level_2_details = [level_2['Employee Name'], level_2['Global Career Band'], level_2['Position Title']]
                    level_3_df = find_subordinates(df, level_2['Employee Name'])

                    if not level_3_df.empty:
                        first_level_2 = True  # To track if Level 2 details have been added
                        for _, level_3 in level_3_df.iterrows():
                            level_3_details = [level_3['Employee Name'], level_3['Global Career Band'], level_3['Position Title']]
                            rows.append((level_1_details if first_level_1 else ["", "", ""]) +
                                        (level_2_details if first_level_2 else ["", "", ""]) +
                                        level_3_details)
                            first_level_1 = False
                            first_level_2 = False
                    else:
                        rows.append((level_1_details if first_level_1 else ["", "", ""]) + level_2_details + ["", "", ""])
                        first_level_1 = False
            else:
                rows.append(level_1_details + ["", "", ""] + ["", "", ""])

        # Write data to the sheet
        sheet_name = f"{bf_level_4_name} - {country}"
        sheet_name = sheet_name[:31]  # Excel sheet names have a max length of 31
        sheet = wb.create_sheet(title=sheet_name)
        # Write headers
        sheet.append(["Level 1 Employee Name", "Global Career Band", "Position Title",
                      "Level 2 Employee Name", "Global Career Band", "Position Title",
                      "Level 3 Employee Name", "Global Career Band", "Position Title"])
        # Write rows
        for row in rows:
            sheet.append(row)

# Save the workbook
wb.save(output_file)

print(f"Organization structure saved to {output_file}")


In [1]:
import pandas as pd
import math
import datetime

# Example source data
data = {
    "FTE": [6.5, -12.3, 3, 0, None, 10],
    "Type": ["BC-TT", "Investments", "Saves", "Other", "Saves", "BC-TT"],
    "GM": [0, 0, 0, 0, 0, 1],
    "MD": [0, 0, 0, 0, 0, 0],
    "3": [0, 0, 0, 0, 0, 2],
    "4": [2.5, -4, 0, 0, 0, 0],
    "5": [0, 0, 0, 0, 0, 0],
    "6": [3, -6, 0, 0, 0, 3],
    "7": [1, -1, 1, 0, 0, 4],
    "8": [0, -1.3, 0, 0, 0, 0],
    "Entity": ["Entity1", "Entity2", "Entity3", "Entity4", "Entity5", "Entity6"],
    "Description": ["Example 1", "Example 2", "Example 3", "Example 4", "Example 5", "Example 6"],
    "Country": ["Country1", "Country2", "Country3", "Country4", "Country5", "Country6"],
    "RTN": ["RTN 123 - Example", "RTN 456 - Sample", "RTN 789 - Test", "RTN 101 - Case", "RTN 112 - Trial", "RTN 113 - Check"]
}

source_df = pd.DataFrame(data)

# Columns to check for GCB values
gcb_columns = ["GM", "MD", "3", "4", "5", "6", "7", "8"]

# Clean and convert FTE column
source_df["FTE"] = pd.to_numeric(source_df["FTE"], errors="coerce")

# Filter rows where FTE is valid and does not equal 0
filtered_df = source_df[
    source_df["FTE"].notnull() & (source_df["FTE"] != 0) & (source_df["Type"].isin(["BC-TT", "Investments", "Saves"]))
]

# Create the template dataframe with required columns
template_columns = [
    "Big Grid Stack", "AOP YEAR", "Business Framework RTN Code",
    "Business Framework", "Business Framework Group", "Country",
    "GCB", "Emp. Type", "Hiring Source", "FTE", "Start Month",
    "WPB", "GBM", "CMB", "Diver", "Entity", "Description"
]
template_df = pd.DataFrame(columns=template_columns)

# Process each row in the filtered dataframe
for _, row in filtered_df.iterrows():
    fte_value = row["FTE"]
    duplicate_fte = -1 if fte_value < 0 else 1  # Determine positive or negative FTE
    abs_fte = abs(fte_value)
    int_part = math.floor(abs_fte)  # Integer part of FTE
    fractional_part = abs_fte - int_part  # Fractional part of FTE

    # Calculate GCB distribution
    gcb_distribution = []
    for col in gcb_columns:
        value = pd.to_numeric(row[col], errors="coerce")  # Convert to numeric, handle non-numeric gracefully
        if not pd.isna(value) and value != 0:
            gcb_distribution.extend([col] * abs(math.ceil(value)))

    # Ensure GCB distribution matches the FTE rows
    total_required_rows = int_part + (1 if fractional_part > 0 else 0)
    if len(gcb_distribution) < total_required_rows:
        gcb_distribution.extend(["na"] * (total_required_rows - len(gcb_distribution)))

    # Dynamically calculate Start Month
    current_year = 2025
    if fte_value > 0:
        start_month = datetime.date(current_year, 1, 31).strftime("%m/%d/%Y")  # Example logic for positive FTE
    elif fte_value < 0:
        start_month = datetime.date(current_year, 2, 28).strftime("%m/%d/%Y")  # Example logic for negative FTE
    else:
        start_month = datetime.date(current_year, 12, 31).strftime("%m/%d/%Y")  # Default fallback

    # Create rows in the template
    for i in range(total_required_rows):
        current_fte = (
            fractional_part if i == total_required_rows - 1 and fractional_part > 0 else 1
        ) * duplicate_fte

        big_grid_stack = (
            "New Perm Position (within FRP) - Staff Drawdown" if current_fte > 0
            else "Forecast program/Other Saves-Saves Forecast Tracker"
        )
        if row["Type"] == "BC-TT":
            big_grid_stack = "Inter Boundary Changes"

        new_row = {
            "Big Grid Stack": big_grid_stack,
            "AOP YEAR": "FY25",
            "Business Framework RTN Code": row["RTN"].split()[0],
            "Business Framework": "",
            "Business Framework Group": "",
            "Country": row["Country"],
            "GCB": gcb_distribution[i],
            "Emp. Type": "FTE",
            "Hiring Source": "External",
            "FTE": current_fte,
            "Start Month": start_month,
            "WPB": "0%",
            "GBM": "100.00%",
            "CMB": "0%",
            "Diver": "",
            "Entity": row["Entity"],
            "Description": row["Description"],
        }
        template_df = pd.concat([template_df, pd.DataFrame([new_row])], ignore_index=True)

# Output the result
print(template_df)


                                       Big Grid Stack AOP YEAR  \
0     New Perm Position (within FRP) - Staff Drawdown     FY25   
1     New Perm Position (within FRP) - Staff Drawdown     FY25   
2     New Perm Position (within FRP) - Staff Drawdown     FY25   
3     New Perm Position (within FRP) - Staff Drawdown     FY25   
4     New Perm Position (within FRP) - Staff Drawdown     FY25   
5     New Perm Position (within FRP) - Staff Drawdown     FY25   
6     New Perm Position (within FRP) - Staff Drawdown     FY25   
7   Forecast program/Other Saves-Saves Forecast Tr...     FY25   
8   Forecast program/Other Saves-Saves Forecast Tr...     FY25   
9   Forecast program/Other Saves-Saves Forecast Tr...     FY25   
10  Forecast program/Other Saves-Saves Forecast Tr...     FY25   
11  Forecast program/Other Saves-Saves Forecast Tr...     FY25   
12  Forecast program/Other Saves-Saves Forecast Tr...     FY25   
13  Forecast program/Other Saves-Saves Forecast Tr...     FY25   
14  Foreca

In [None]:
import os
import pandas as pd

def process_hdpi_file(input_file, month_name):
    """
    Process the HDPI Excel file.
    Modify this function to include your specific processing logic.
    """
    # Load the Excel file
    df = pd.read_excel(input_file)

    # Perform your processing logic here
    # Example: Add a new column for demonstration
    df["Processed"] = f"Processed for {month_name}"

    return df

def process_files_in_folder(ifolder, ofolder):
    """
    Processes all Excel files in the given folder.
    """
    # Ensure output folder exists
    os.makedirs(ofolder, exist_ok=True)

    # Get the list of files in the input folder
    files = [f for f in os.listdir(ifolder) if f.endswith(".xlsx")]

    for file_name in files:
        try:
            # Extract the file path
            input_file_path = os.path.join(ifolder, file_name)

            # Extract the month name (last three letters before ".xlsx")
            month_name = file_name.split()[-1][:3]

            # Load and process the file
            processed_df = process_hdpi_file(input_file_path, month_name)

            # Save the processed file with the desired name in the output folder
            output_file_name = f"HDPI {month_name}.xlsx"
            output_file_path = os.path.join(ofolder, output_file_name)
            processed_df.to_excel(output_file_path, index=False)

            print(f"Processed and saved: {output_file_path}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Define input and output folder paths
ifolder = r"C:\path\to\input\folder"  # Replace with your input folder path
ofolder = r"C:\path\to\output\folder"  # Replace with your output folder path

# Process all files in the folder
process_files_in_folder(ifolder, ofolder)


In [7]:
import win32com.client

def convert_docx_to_pdf(docx_path, pdf_path):
    # Initialize Word application
    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False  # Run in the background

    # Open the .docx file
    doc = word.Documents.Open(docx_path)

    # Save as PDF
    doc.SaveAs(pdf_path, FileFormat=17)  # 17 corresponds to the PDF format

    # Close the document and quit Word
    doc.Close()
    word.Quit()

# Example usage
docx_file = r"C:\\Users\\KS\\Thesis.docx"
pdf_file = r"C:\\Users\\KS\\Thesis_test.pdf"
convert_docx_to_pdf(docx_file, pdf_file)


com_error: (-2147352567, 'Exception occurred.', (0, 'Microsoft Word', 'Command failed', 'wdmain11.chm', 36966, -2146824090), None)

In [27]:
import pandas as pd
from openpyxl import load_workbook

# Sample data for spocdf and olddf (replace these with actual data loading as needed)
spocdf = pd.DataFrame({
    'Billing Contact Name': ['Alice', 'Alice', 'Bob', 'Alice', 'Charlie', 'Charlie'],
    'Billing Contact PS ID': [101, 102, 201, 103, 301, 301]
})

olddf = pd.DataFrame({
    'Billing Contact Name': ['Alice', 'Bob', 'Charlie'],
    'Billing Contact PS ID': [101, 201, 301]  # Assuming these are the correct PS IDs
})

# Step 1: Merge spocdf with olddf to add the correct PS ID as a reference
merged_df = spocdf.merge(
    olddf,
    on='Billing Contact Name',
    how='left',
    suffixes=('', '_correct')
)

# Step 2: Filter for rows where the 'Billing Contact PS ID' does not match the 'Correct Billing Contact PS ID'
# Retain only records where there is a mismatch
mismatched_psid_df = merged_df[merged_df['Billing Contact PS ID'] != merged_df['Billing Contact PS ID_correct']].copy()

# Rename the correct column for clarity
mismatched_psid_df.rename(columns={'Billing Contact PS ID_correct': 'Correct Billing Contact PS ID'}, inplace=True)

# # Step 3: Write the filtered DataFrame with mismatched PS IDs to a new sheet in the workbook
# file_path = 'your_workbook.xlsx'  # Replace with your actual file path
# workbook = load_workbook(file_path)

# # Check if 'Duplicate PSID' sheet exists, and delete if it does
# if 'Duplicate PSID' in workbook.sheetnames:
#     del workbook['Duplicate PSID']
# workbook.save(file_path)  # Save after deleting to ensure it's applied

# # Append the new data to the workbook in 'Duplicate PSID'
# with pd.ExcelWriter(file_path, engine='openpyxl', mode='a') as writer:
#     mismatched_psid_df.to_excel(writer, sheet_name='Duplicate PSID', index=False)
print(mismatched_psid_df)
print("Filtered Duplicate PSID sheet with mismatched PS IDs has been created and saved.")


  Billing Contact Name  Billing Contact PS ID  Correct Billing Contact PS ID
1                Alice                    102                            101
3                Alice                    103                            101
Filtered Duplicate PSID sheet with mismatched PS IDs has been created and saved.


In [19]:
import pandas as pd
import numpy as np

# Sample DataFrames
df1 = pd.DataFrame({
    'A2': [101, 102, 103, 104],
    'C': ['new', 'old', 'new', 'old'],
    'SC': [None, 'value2', None, 'value4']
})

df2 = pd.DataFrame({
    'AA2': [101, 102, 103, 104],
    'SC': ['lookup_value1', 'lookup_value2', 'lookup_value3', 'lookup_value4']
})

# Step 1: Create a mask for rows where 'C' is 'new' and 'SC' is blank (None or NaN)
mask = (df1['C'] == 'new') & (df1['SC'].isna())

# Step 2: Create a lookup dictionary from df2 for easy access
lookup_dict = df2.set_index('AA2')['SC'].to_dict()

# Step 3: Use np.where() to update 'SC' column conditionally
df1['SC'] = np.where(
    mask,  # Only update where the condition is True
    df1['A2'].map(lookup_dict).fillna(df1['SC']),  # Map A2 to the lookup dict and fill NaN with existing SC
    df1['SC']  # Retain the existing SC values
)

# Output the updated df1
print(df1)


    A2    C             SC
0  101  new  lookup_value1
1  102  old         value2
2  103  new  lookup_value3
3  104  old         value4


In [1]:
import pandas as pd

# Sample DataFrame 1 (df1)
data1 = {'A': [101, 102, 103, 104],
         'C': ['X1', 'X2', 'X3', 'X4'],
         'E': ['Y11', 'Y2', 'Y1', 'Y4'],
         'D': [None, None, None, None]}  # Initially empty Column D
df1 = pd.DataFrame(data1)
print(df1)
# Sample DataFrame 2 (df2)
data2 = {'A': [101, 103, 105, 106],
         'C': ['X1', 'X3', 'X5', 'X6'],
         'E': ['Y11', 'Y2', 'Y4', 'Y1'],
         'LookupValue': ['Val1', 'Val3', 'Val5', 'Val6']}  # Column with values to fetch
df2 = pd.DataFrame(data2)
print(df2)

# Step 1: First Lookup - Based on Column 'C'
df1 = pd.merge(df1, df2[['C', 'LookupValue']], how='left', on='C')  # Merge on 'C'
df1['D'] = df1['D'].mask(df1['D'].isna(), df1['LookupValue'])  # Use `mask` to fill NaN in 'D' only
df1.drop(columns=['LookupValue'], inplace=True)  # Drop temporary column

# Step 2: Second Lookup - Based on Column 'E'
df1 = pd.merge(df1, df2[['E', 'LookupValue']], how='left', on='E')  # Merge on 'E'
df1['D'] = df1['D'].mask(df1['D'].isna(), df1['LookupValue'])  # Update only NaNs in 'D'
df1.drop(columns=['LookupValue'], inplace=True)  # Drop temporary column

# Step 3: Third Lookup - Based on Column 'A'
df1 = pd.merge(df1, df2[['A', 'LookupValue']], how='left', on='A')  # Merge on 'A'
df1['D'] = df1['D'].mask(df1['D'].isna(), df1['LookupValue'])  # Update only remaining NaNs in 'D'
df1.drop(columns=['LookupValue'], inplace=True)  # Drop temporary column

# Final Output
print(df1)


     A   C    E     D
0  101  X1  Y11  None
1  102  X2   Y2  None
2  103  X3   Y1  None
3  104  X4   Y4  None
     A   C    E LookupValue
0  101  X1  Y11        Val1
1  103  X3   Y2        Val3
2  105  X5   Y4        Val5
3  106  X6   Y1        Val6
     A   C    E     D
0  101  X1  Y11  Val1
1  102  X2   Y2  Val3
2  103  X3   Y1  Val3
3  104  X4   Y4  Val5


In [2]:
import pandas as pd
import os
from datetime import datetime

# Function to generate columns for all months of the year
def generate_monthly_columns(year_suffix, num_years=1):
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    columns = []
    
    for i in range(num_years):
        for month in months:
            columns.append(f"{month} {year_suffix + i}")
    
    return columns

# Function to update dataframe with monthly data from files
def update_monthly_data(df, folder_path, year_suffix):
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    current_month = datetime.now().month  # Get the current month (1-12)
    
    for month_idx in range(current_month):
        month_name = months[month_idx]  # Get month name
        col_name = f"{month_name} {year_suffix}"  # Column name to update
        
        # Read corresponding file for the current month if exists
        file_path = os.path.join(folder_path, f"{month_name}_{year_suffix}.csv")  # Assuming CSV format
        
        if os.path.exists(file_path):
            monthly_data = pd.read_csv(file_path)  # Adjust if files are in Excel
            
            # Assuming you have a common column like 'Position ID' to merge data
            if 'Position ID' in monthly_data.columns:
                # Merge or update existing data based on 'Position ID'
                df = df.merge(monthly_data[['Position ID', 'Value']], on='Position ID', how='left')
                
                # Assign merged 'Value' to the corresponding column (e.g., "Jan 24")
                df[col_name] = df['Value']
                df = df.drop(columns=['Value'])  # Remove the temporary 'Value' column
    
    return df

# Step 1: Generate columns for one year
columns_2024 = generate_monthly_columns(24)

# Step 2: Create an empty dataframe with 'Position ID' and the generated columns
df = pd.DataFrame(columns=['Position ID'] + columns_2024)

# Step 3: Simulate loading data from previous step (e.g., from GHA)
# Assuming 'Position ID' column is already there
# df = pd.read_csv('previous_step_data.csv')  # Or however you load it

# Step 4: Update DataFrame with monthly data up to the current month
folder_path = 'path_to_your_files'  # Specify the folder where monthly files are stored
df = update_monthly_data(df, folder_path, 24)

# Step 5: Save the final updated DataFrame to Excel or CSV
df.to_csv('final_output.csv', index=False)  # Adjust for Excel if needed


FileNotFoundError: [Errno 2] No such file or directory: 'jan_2024_data.csv'

In [1]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.worksheet.datavalidation import DataValidation

# Sample dataframe
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [24, 30, 22],
        'Gender': ['Female', 'Male', 'Male']}  # We will apply drop-down for Gender

df = pd.DataFrame(data)

# Save DataFrame to an Excel file
output_path = 'output_with_dropdown.xlsx'

# Create a workbook and add the dataframe to the sheet
wb = Workbook()
ws = wb.active

# Write the DataFrame to the worksheet
for row in dataframe_to_rows(df, index=False, header=True):
    ws.append(row)

# Define the list of allowed values for the drop-down
drop_down_list = ['Male', 'Female', 'Other']

# Create a DataValidation object for the drop-down list
dv = DataValidation(type="list", formula1=f'"{",".join(drop_down_list)}"', showDropDown=False)

# Add the data validation to the 'Gender' column (assuming it is in column C, starting from C2)
ws.add_data_validation(dv)
dv.add(f"C2:C{len(df) + 1}")  # Apply validation to the Gender column for all rows

# Save the workbook
wb.save(output_path)

print(f"Excel file saved with drop-down in column 'Gender'.")


Excel file saved with drop-down in column 'Gender'.


In [15]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.worksheet.datavalidation import DataValidation

# Sample dataframe
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [24, 30, 22],
        'Gender': ['Female', 'Male', '']}  # Leave empty so users can select from the dropdown

df = pd.DataFrame(data)

# Create a workbook and add the dataframe to the sheet
wb = Workbook()
ws = wb.active

# Write the DataFrame to the worksheet
for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), 1):
    ws.append(row)

# Define the list of allowed values for the drop-down
drop_down_list = ['Male', 'Female', 'Other']

# Create a DataValidation object for the drop-down list
dv = DataValidation(type="list", formula1=f'"{",".join(drop_down_list)}"', showDropDown=False)

# Optional: Add an input prompt when the user selects the cell
dv.prompt = "Select Gender from the list"
dv.promptTitle = "Gender Selection"

# Apply the data validation to the "Gender" column (assuming it's column C)
ws.add_data_validation(dv)

# Extend the drop-down for all rows in the "Gender" column (starting from C2)
max_row = len(df) + 100  # Adjust the number of rows if needed
dv.add(f"C2:C{max_row}")  # Apply the drop-down from C2 to C{max_row}

# Save the workbook
output_path = 'output_with_dropdown_visible.xlsx'
wb.save(output_path)

print(f"Excel file saved with a drop-down in the 'Gender' column for all rows.")


Excel file saved with a drop-down in the 'Gender' column for all rows.


In [1]:
import pandas as pd

# Assuming your data is in a CSV file or an Excel file
# Replace 'your_file.xlsx' with the path to your Excel file if you're reading from one
# df = pd.read_excel('your_file.xlsx')

# Sample DataFrame to simulate the input
data = {
    'Business Framework': ['Global CMB', 'Other Framework','wpb'],  # Sample values
    'Country R3': ['UK', 'US','IN'],  # Sample values
}

# Create a DataFrame from the sample data
df = pd.DataFrame(data)

# Prepare an empty list to collect the expanded rows
expanded_rows = []

# Loop through each row in the original DataFrame
for index, row in df.iterrows():
    # Repeat each row 8 times with GCB values from 0 to 7
    for gcb_value in range(8):
        # Append the row values with the new GCB value to the list
        expanded_rows.append({
            'Business Framework': row['Business Framework'],
            'Country R3': row['Country R3'],
            'GCB': gcb_value
        })

# Create a new DataFrame from the expanded rows
expanded_df = pd.DataFrame(expanded_rows)

# Save the final DataFrame to a new Excel file or CSV
# expanded_df.to_excel('expanded_output.xlsx', index=False)
print(expanded_df)


   Business Framework Country R3  GCB
0          Global CMB         UK    0
1          Global CMB         UK    1
2          Global CMB         UK    2
3          Global CMB         UK    3
4          Global CMB         UK    4
5          Global CMB         UK    5
6          Global CMB         UK    6
7          Global CMB         UK    7
8     Other Framework         US    0
9     Other Framework         US    1
10    Other Framework         US    2
11    Other Framework         US    3
12    Other Framework         US    4
13    Other Framework         US    5
14    Other Framework         US    6
15    Other Framework         US    7
16                wpb         IN    0
17                wpb         IN    1
18                wpb         IN    2
19                wpb         IN    3
20                wpb         IN    4
21                wpb         IN    5
22                wpb         IN    6
23                wpb         IN    7


In [None]:
from selenium import webdriver
import time

# Step 1: Set up Edge WebDriver using basic options
options = webdriver.EdgeOptions()

# Attempt to launch Edge without specifying the Service
driver = webdriver.Edge(options=options)

# Step 2: Open a blank tab to avoid the slow homepage
driver.execute_script("window.open('');")
time.sleep(1)  # Wait for the new tab to open

# Step 3: Switch to the new tab
driver.switch_to.window(driver.window_handles[1])

# Step 4: Navigate to the target URL in the new tab
target_url = 'https://your-target-website.com'  # Replace with your actual URL
driver.get(target_url)

# Step 5: Close the default homepage tab (which is the first tab)
driver.switch_to.window(driver.window_handles[0])
driver.close()

# Step 6: Switch back to the target website tab
driver.switch_to.window(driver.window_handles[0])

# Step 7: Let the browser stay open for some time
time.sleep(10)

# Finally, close the browser
driver.quit()


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to your Microsoft Edge WebDriver (Make sure it matches your browser version)
edge_driver_path = "path/to/msedgedriver.exe"

# Set up Edge WebDriver
service = Service(edge_driver_path)
driver = webdriver.Edge(service=service)

# Open the URL
driver.get('https://example.com')  # Replace with your actual URL

# Wait for the page to load and for the SSO button to be clickable
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'My organization's single sign-on')]"))
)

# Locate and click the "SSO" button
sso_button = driver.find_element(By.XPATH, "//button[contains(text(), 'My organization's single sign-on')]")
sso_button.click()

# Let the SSO process complete
# You can wait for the page to load after the login
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "some_element_after_login"))  # Replace with an element that appears after login
)

# You can now proceed to interact with the logged-in page


In [None]:
def options_to_dict(options):
    options_dict = {}
    for key in dir(options):
        if not key.startswith('__'):
            options_dict[key] = getattr(options, key)
    return options_dict

edge_options = Options()
edge_options.use_chromium = True

options_dict = options_to_dict(edge_options)
options_json = json.dumps(options_dict)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Set up the browser and SSO settings
driver = webdriver.Chrome()  # Replace with your default browser

# Open the website and log in using default SSO
driver.get("https://example.com")  # Replace with the website URL

# Wait for the login process to complete
WebDriverWait(driver, 10).until(EC.title_contains("Logged in"))

# Navigate to the page with filters
driver.get("https://example.com/filters")  # Replace with the filter page URL

# Identify and interact with the filter elements
filter_elements = driver.find_elements_by_css_selector(".filter-option")
for element in filter_elements:
    # Perform the necessary actions to change the filter options
    element.click()  # Replace with the actual action required

# Click the ellipsis button to download the file
download_button = driver.find_element_by_css_selector(".download-button")
download_button.click()

# Specify the file path and name for the downloaded file
file_path = "/path/to/downloaded/file.csv"  # Replace with the desired file path and name

# Close the browser
driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

# Initialize the browser
driver = webdriver.Chrome(service=Service('/path/to/chromedriver'))

# Open the website
driver.get('https://example.com')  # Replace with the actual URL

# Wait for the page to load
time.sleep(3)

# Find the filter element by its label text (XPath)
filter_element = driver.find_element(By.XPATH, "//em[text()='BF Level 1 Name']")

# Click on the filter element
filter_element.click()

# Additional interaction (if needed)

# Close the browser after operations
driver.quit()


In [None]:
import pandas as pd
from datetime import datetime

# Function to map 'Mxx' to actual periods
def map_month_to_period(month_str, year):
    months_map = {
        "M01": "Jan", "M02": "Feb", "M03": "Mar", "M04": "Apr", "M05": "May", 
        "M06": "Jun", "M07": "Jul", "M08": "Aug", "M09": "Sep", "M10": "Oct", 
        "M11": "Nov", "M12": "Dec"
    }
    month_abbr = months_map.get(month_str)
    return f"{month_abbr}-{year}"

# Process HC sheet and populate the 'Paste' sheet
def process_hc_sheet(df, paste_df, sheet_name, year):
    # Find the last 3 columns with 'Mxx' values for the months
    month_cols = [col for col in df.columns if 'M' in col]
    if len(month_cols) < 3:
        print(f"Not enough 'Mxx' columns found in {sheet_name}")
        return

    # Sort the columns to get the last three
    last_3_months = month_cols[-3:]
    
    for month_col in last_3_months:
        month_num = month_col[:3]  # Extract Mxx (e.g., M07)
        period = map_month_to_period(month_num, year)
        
        # Prepare the 'Paste' DataFrame row by row
        for _, row in df.iterrows():
            paste_row = {
                'File Name': sheet_name,
                'Entity': row['Entity'],
                'Function': row['RTN Level 4'],
                'FTE/Contractor': row[month_col],  # Value from the month column
                'Period': period,
                'Attribute Type': 'MoM',
            }
            # Append to 'Paste' DataFrame
            paste_df = paste_df.append(paste_row, ignore_index=True)
    
    return paste_df

# Main script to process the sheets
def consolidate_hc_sheets(sheets_dict, year):
    # Initialize the 'Paste' DataFrame
    paste_columns = [
        'File Name','Level 3','Level 4','Cost Grouping','Cost Type','Finance Region', 
        'Attribute Type','Period','Cost','FTE/Contractor','Country','Level 3.5','Level 4.5',
        'Entity','Function','Mapped Country','MICA'
    ]
    paste_df = pd.DataFrame(columns=paste_columns)

    # Iterate through the sheets
    for sheet_name, df in sheets_dict.items():
        if 'HC' in sheet_name:
            paste_df = process_hc_sheet(df, paste_df, sheet_name, year)

    return paste_df

# Example usage
# Assuming 'sheets_dict' contains all loaded sheets as DataFrames
sheets_dict = {
    'HC_Sheet1': pd.DataFrame(...),  # Replace with actual loaded sheet data
    'HC_Sheet2': pd.DataFrame(...),
    # Add more sheets as needed
}

# Call the function to process and consolidate HC sheets
final_paste_df = consolidate_hc_sheets(sheets_dict, 2024)
print(final_paste_df.head())


In [None]:
import pandas as pd
from pyxlsb import open_workbook
from openpyxl import Workbook

# Path to the xlsb file (wb2)
wb2_path = 'path_to_wb2.xlsb'

# New Excel workbook to store the sheets that are not excluded
new_wb = Workbook()
new_wb.remove(new_wb.active)  # Remove the default sheet

# Function to find the header row based on 'Entity', 'RTN Level 4', 'Account'
def find_header_row(sheet):
    header_row = None
    # Iterate through rows to find the correct header row
    for row_num, row in enumerate(sheet.rows()):
        values = [item.v for item in row]
        if len(values) >= 3 and values[0] == 'Entity' and values[1] == 'RTN Level 4' and values[2] == 'Account':
            header_row = row_num  # Store the correct header row index (0-based)
            break
    return header_row

# Function to process a sheet and clean it up
def process_sheet(sheet, sheet_name):
    data = []
    header_row = find_header_row(sheet)

    if header_row is None:
        print(f"Header row not found in sheet: {sheet_name}")
        return None

    # Read the sheet into a list of lists (rows of data)
    for row in sheet.rows():
        data.append([item.v for item in row])

    # Convert data to a pandas dataframe
    df = pd.DataFrame(data)

    # Check the values of E15 and F15 (columns with index 4 and 5)
    try:
        E15_val = str(df.iloc[header_row + 1, 4])  # Value in column E (0-indexed, so 4)
        F15_val = str(df.iloc[header_row + 1, 5])  # Value in column F (0-indexed, so 5)
        print(f"Debug: E15 value: {E15_val}, F15 value: {F15_val}")
    except IndexError:
        E15_val = ''
        F15_val = ''
        print(f"Debug: E15 or F15 index out of range")

    # Define possible month headers
    month_headers = ['M{:02d}'.format(i) for i in range(1, 13)]  # Generates 'M01' to 'M12'
    print(f"Debug: Month headers: {month_headers}")

    # Prepare headers for columns A to C from row 15
    headers_A_to_C = df.iloc[header_row + 1, :3].values  # Columns A to C headers from row 15
    print(f"Debug: Headers A to C: {headers_A_to_C}")

    if E15_val in month_headers and F15_val in month_headers:
        # Use row 15 for headers for all columns
        headers_D_onwards = df.iloc[header_row + 1, 3:].values  # Columns D onwards headers from row 15
        headers = list(headers_A_to_C) + list(headers_D_onwards)
        start_data_row = header_row + 2  # Data starts after the header row
        print("Debug: Using headers from row 15 for columns D onwards")
    else:
        # Use row 14 for columns D onwards
        headers_D_onwards = df.iloc[header_row, 3:].values  # Columns D onwards headers from row 14
        headers = list(headers_A_to_C) + list(headers_D_onwards)
        start_data_row = header_row + 1  # Data starts after row 14
        print("Debug: Using headers from row 14 for columns D onwards")

    # Set the correct headers for the dataframe
    df = df.iloc[start_data_row:].reset_index(drop=True)  # Keep only rows after the header
    df.columns = headers

    # Drop rows where all values are NaN or None
    df.dropna(how='all', inplace=True)

    return df

# Open the xlsb workbook and process valid sheets
with open_workbook(wb2_path) as wb2:
    for sheetname in wb2.sheets:
        # Exclude sheets with 'excl', 'Pivot', or 'TM1'
        if any(exclude_word.lower() in sheetname.lower() for exclude_word in ['excl', 'pivot', 'tm1']):
            print(f"Skipping sheet: {sheetname}")
            continue

        with wb2.get_sheet(sheetname) as sheet:
            print(f"Processing sheet: {sheetname}")
            df = process_sheet(sheet, sheetname)

            if df is not None:
                # Save DataFrame to a new sheet in the new workbook
                new_sheet = new_wb.create_sheet(title=sheetname[:31])  # Sheet names max length is 31 characters

                # Write the headers to the sheet
                new_sheet.append(df.columns.tolist())

                # Write the data rows to the sheet
                for row in df.itertuples(index=False, name=None):
                    new_sheet.append(row)

# Save the new workbook with the non-excluded sheets
output_path = 'non_excluded_sheets_with_headers.xlsx'
new_wb.save(output_path)
print(f"Workbook with non-excluded sheets and correct headers has been saved to '{output_path}'")


In [None]:
import pandas as pd
from pyxlsb import open_workbook
from openpyxl import Workbook

# Path to the xlsb file (wb2)
wb2_path = 'path_to_wb2.xlsb'

# New Excel workbook to store the results
new_wb = Workbook()
new_wb.remove(new_wb.active)  # Remove the default sheet

# Function to find the header row based on 'Entity', 'RTN Level 4', 'Account'
def find_header_row(sheet):
    header_row = None
    # Iterate through rows to find the correct header row
    for row_num, row in enumerate(sheet.rows()):
        values = [item.v for item in row]
        if len(values) >= 3 and values[0] == 'Entity' and values[1] == 'RTN Level 4' and values[2] == 'Account':
            header_row = row_num  # Store the correct header row index (0-based)
            break
    return header_row

# Function to process a sheet and clean it up
def process_sheet(sheet, sheet_name):
    data = []
    header_row = find_header_row(sheet)

    if header_row is None:
        print(f"Header row not found in sheet: {sheet_name}")
        return None

    # Read the sheet into a list of lists (rows of data)
    for row in sheet.rows():
        data.append([item.v for item in row])

    # Convert data to a pandas dataframe
    df = pd.DataFrame(data)

    # Check the values of E15 and F15 (row 15 is 0-based index 14 in pandas)
    E15_val = str(df.iloc[header_row + 1, 4])  # Value in column E (0-indexed, so 4)
    F15_val = str(df.iloc[header_row + 1, 5])  # Value in column F (0-indexed, so 5)

    # Define possible month headers
    month_headers = ['M01', 'M02', 'M03', 'M04', 'M05', 'M06', 'M07', 'M08', 'M09', 'M10', 'M11', 'M12']

    # Adjust headers for columns D onwards based on the content of columns E and F
    if E15_val in month_headers and F15_val in month_headers:
        # Use row 15 for headers (index 14 in pandas)
        headers = df.iloc[header_row + 1].values  # Take headers from row 15
        start_data_row = header_row + 2  # Data starts after row 15
    else:
        # Use row 14 for headers (index 13 in pandas)
        headers = df.iloc[header_row].values  # Take headers from row 14
        start_data_row = header_row + 1  # Data starts after row 14

    # Set the correct headers for the dataframe
    df.columns = headers
    df = df[start_data_row:].reset_index(drop=True)  # Keep only rows after the header

    # Drop rows where all values are NaN or None
    df.dropna(how='all', inplace=True)
    
    return df

# Open the xlsb workbook and process valid sheets
with open_workbook(wb2_path) as wb2:
    for sheetname in wb2.sheets:
        # Exclude sheets with 'excl', 'Pivot', or 'TM1'
        if any(exclude_word.lower() in sheetname.lower() for exclude_word in ['excl', 'Pivot', 'TM1']):
            print(f"Skipping sheet: {sheetname}")
            continue

        with wb2.get_sheet(sheetname) as sheet:
            print(f"Processing sheet: {sheetname}")
            df = process_sheet(sheet, sheetname)
            
            if df is not None:
                # Save DataFrame to a new sheet in the new workbook
                new_sheet = new_wb.create_sheet(title=sheetname)
                for r in df.itertuples(index=False, name=None):
                    new_sheet.append(r)

# Save the new workbook with the updated data
output_path = 'updated_wb.xlsx'
new_wb.save(output_path)
print(f"Data has been written to '{output_path}'")


In [7]:
import tkinter as tk
from tkinter import filedialog, messagebox

class ExcelSheetSelector:
    def __init__(self, root):
        self.root = root
        self.root.title("Excel Sheet Selector")

        # Custom labels for each file selection
        self.custom_labels = [
            "Select OFile:", "Select CFile:", "Select DFile:", "Select Fourth File:",
            "Select Fifth File:", "Select Sixth File:", "Select Seventh File:", "Select Eighth File:"
        ]

        # List to hold selected file paths, initially None
        self.selected_files = [None] * 8
        self.labels = []

        # Create custom labels and buttons for selecting files
        for i in range(8):
            # Create a custom label for each file selection
            custom_label = tk.Label(root, text=self.custom_labels[i], width=20, anchor='w')
            custom_label.grid(row=i, column=0, padx=10, pady=5)

            # Create a label to display the selected file path
            label = tk.Label(root, text="Not selected", width=60, anchor='w')
            label.grid(row=i, column=1, padx=10, pady=5)
            self.labels.append(label)

            # Create a button for each file selection
            button = tk.Button(root, text="Browse", command=lambda index=i: self.select_file(index))
            button.grid(row=i, column=2, padx=10, pady=5)

        # Button to confirm file selections
        confirm_button = tk.Button(root, text="Confirm Selection", command=self.confirm_selection)
        confirm_button.grid(row=8, column=0, columnspan=3, pady=20)

        # Predefined variable names for file paths
        self.OFile = None
        self.CFile = None
        self.DFile = None
        self.file4 = None
        self.file5 = None
        self.file6 = None
        self.file7 = None
        self.file8 = None

    def select_file(self, index):
        # Open file dialog to select a file
        file_path = filedialog.askopenfilename(title="Select a File", filetypes=[("All Files", "*.*")])
        
        if file_path:  # If a file is selected
            self.selected_files[index] = file_path  # Save the selected file
            self.labels[index].config(text=file_path)  # Update label with the file path

    def confirm_selection(self):
        # Check how many files have been selected
        num_selected = sum(1 for file in self.selected_files if file)

        if num_selected == 0:
            messagebox.showwarning("Warning", "No files selected.")
        else:
            # Show confirmation dialog with options to Confirm or Select More Files
            result = messagebox.askquestion(
                "Confirmation", 
                f"You have selected {num_selected} files. Do you want to proceed?", 
                icon='question'
            )
            
            if result == 'yes':
                # Directly assign file paths to predefined variables
                self.assign_file_paths()
                messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
                # Print assigned files for verification
                print(self.OFile, self.CFile, self.DFile, self.file4, self.file5, self.file6, self.file7, self.file8)
                # Close the GUI
                self.root.destroy()
            else:
                # Allow the user to select more files
                messagebox.showinfo("Select More", "Please select more files.")

    def assign_file_paths(self):
        """Assigns selected file paths to predefined variables."""
        self.OFile = self.selected_files[0] if len(self.selected_files) > 0 else None
        self.CFile = self.selected_files[1] if len(self.selected_files) > 1 else None
        self.DFile = self.selected_files[2] if len(self.selected_files) > 2 else None
        self.file4 = self.selected_files[3] if len(self.selected_files) > 3 else None
        self.file5 = self.selected_files[4] if len(self.selected_files) > 4 else None
        self.file6 = self.selected_files[5] if len(self.selected_files) > 5 else None
        self.file7 = self.selected_files[6] if len(self.selected_files) > 6 else None
        self.file8 = self.selected_files[7] if len(self.selected_files) > 7 else None

# Initialize the Tkinter window and ExcelSheetSelector class
if __name__ == "__main__":
    root = tk.Tk()
    app = ExcelSheetSelector(root)
    root.mainloop()


C:/Users/KS/data.xlsx None None None None None C:/Users/KS/final_output.xlsx None


In [5]:
import tkinter as tk
from tkinter import filedialog, messagebox

def select_file(index):
    # Open file dialog to select a file
    file_path = filedialog.askopenfilename(title="Select a File", filetypes=[("All Files", "*.*")])
    
    if file_path:  # If a file is selected
        selected_files[index] = file_path  # Save the selected file
        labels[index].config(text=file_path)  # Update label with the file path

def confirm_selection():
    # Check how many files have been selected
    num_selected = sum(1 for file in selected_files if file)
    
    if num_selected == 0:
        messagebox.showwarning("Warning", "No files selected.")
    else:
        # Show confirmation dialog with options to Confirm or Select More Files
        result = messagebox.askquestion("Confirmation", f"You have selected {num_selected} files. Do you want to proceed?", icon='question')
        
        if result == 'yes':
            # Proceed and close the application
            messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
            root.destroy()
        else:
            # Allow the user to select more files
            messagebox.showinfo("Select More", "Please select more files.")

# Initialize the Tkinter window
root = tk.Tk()
root.title("File Selector")

# List to hold selected file paths
selected_files = [None] * 8
labels = []

# Create labels and buttons for selecting files
for i in range(8):
    # Create a label for each file selection
    label = tk.Label(root, text=f"File {i+1}: Not selected", width=100, anchor='w')
    label.grid(row=i, column=0, padx=10, pady=5)
    labels.append(label)
    
    # Create a button for each file selection
    button = tk.Button(root, text="Select File", command=lambda index=i: select_file(index))
    button.grid(row=i, column=1, padx=10, pady=5)

# Button to confirm file selections
confirm_button = tk.Button(root, text="Confirm Selection", command=confirm_selection)
confirm_button.grid(row=8, column=0, columnspan=2, pady=20)

root.mainloop()


In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox

def select_files():
    # Open the file dialog to select files
    file_paths = filedialog.askopenfilenames(title="Select Files", filetypes=[("All Files", "*.*")])
    
    # Update the listbox with selected files
    listbox.delete(0, tk.END)
    for file in file_paths:
        listbox.insert(tk.END, file)
    
    # Save the selected files to the global variable
    selected_files.clear()
    selected_files.extend(file_paths)
    
    # Confirm the number of selected files
    if len(selected_files) == 0:
        messagebox.showinfo("Information", "No files selected.")
    else:
        messagebox.showinfo("Information", f"You have selected {len(selected_files)} files.")

def confirm_selection():
    # Proceed with the application logic
    if len(selected_files) > 0:
        messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
    else:
        messagebox.showwarning("Warning", "No files selected. Proceeding with the application.")

# Initialize the Tkinter window
root = tk.Tk()
root.title("File Selector")

selected_files = []

# Create a Listbox to display selected files
listbox = tk.Listbox(root, width=100, height=10)
listbox.pack(pady=10)

# Add buttons to select files and confirm selection
select_button = tk.Button(root, text="Select Files", command=select_files)
select_button.pack(pady=5)

confirm_button = tk.Button(root, text="Confirm Selection", command=confirm_selection)
confirm_button.pack(pady=5)

root.mainloop()


In [1]:
import pandas as pd

# Sample DataFrame
data = {
    'Record Type': ['EUC', 'EUC', 'EUC', 'Other', 'EUC'],
    'Record ID': [1, 2, 3, 4, 5],
    'Record Name': ['Name1', 'Name2', 'Name3', 'Name4', 'Name5'],
    'Staff ID': [101, 102, 102, 102, 103],
    'Position Changed': ['Left', 'No Change', 'Yes', 'Left', 'No Change']
}
df = pd.DataFrame(data)

# Define a function to concatenate control items and determine control domain
def concatenate_control_items(group):
    control_items = '\n'.join(group['Record Type'] + ' ' + group['Record ID'].astype(str) + ' ' + group['Record Name'])
    record_types = group['Record Type'].unique()
    if len(record_types) > 1:
        control_domain = 'mix'
    elif 'EUC' in record_types:
        control_domain = 'EUC only'
    else:
        control_domain = 'Other'
    return pd.Series({
        'Control Items': control_items,
        'Control Domain': control_domain
    })

# Group by 'Staff ID' and apply the function
result_df = df.groupby('Staff ID').apply(concatenate_control_items).reset_index()

# Merge the result back to the original DataFrame to maintain original rows
final_df = df.drop(columns=['Record Type', 'Record ID', 'Record Name']).drop_duplicates().merge(result_df, on='Staff ID', how='left')

# Print the final DataFrame
print(final_df)

# Save the result to an Excel file if needed
final_df.to_excel('final_output.xlsx', index=False)


   Staff ID Position Changed                            Control Items  \
0       101             Left                              EUC 1 Name1   
1       102        No Change  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
2       102              Yes  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
3       102             Left  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
4       103        No Change                              EUC 5 Name5   

  Control Domain  
0       EUC only  
1            mix  
2            mix  
3            mix  
4       EUC only  


In [6]:
import tkinter as tk
from tkinter import filedialog, messagebox

class ExcelSheetSelector:
    def __init__(self, root):
        self.root = root
        self.root.title("Excel Sheet Selector")

        self.file_paths = [None] * 7
        self.labels = []

        # Specific labels for each file selection
        self.label_texts = [
            "Select Humans file:",
            "Select Animals file:",
            "Select Plants file:",
            "Select Minerals file:",
            "Select Microbes file:",
            "Select Fungi file:",
            "Select Algae file:"
        ]

        # Create labels and buttons for 7 Excel sheets
        for i in range(7):
            label = tk.Label(root, text=self.label_texts[i])
            label.grid(row=i, column=0, padx=10, pady=5)
            
            button = tk.Button(root, text="Browse", command=lambda i=i: self.select_file(i))
            button.grid(row=i, column=1, padx=10, pady=5)
            
            file_label = tk.Label(root, text="No file selected")
            file_label.grid(row=i, column=2, padx=10, pady=5)
            self.labels.append(file_label)
        
        # Button to check and print selected file paths
        self.print_button = tk.Button(root, text="Confirm Selections", command=self.check_and_confirm)
        self.print_button.grid(row=7, column=0, columnspan=3, pady=10)

    def select_file(self, index):
        self.root.withdraw()  # Hide the main window
        file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
        self.root.deiconify()  # Show the main window again
        if file_path:
            self.file_paths[index] = file_path
            self.labels[index].config(text=file_path)

    def check_and_confirm(self):
        if all(self.file_paths):
            self.root.destroy()  # Close the window gracefully
        else:
            messagebox.showwarning("Warning", "Please select all files before proceeding.")

# Create the main window
root = tk.Tk()
app = ExcelSheetSelector(root)
root.mainloop()

# After closing the window, capture the file paths into separate variables
humans_file, animals_file, plants_file, minerals_file, microbes_file, fungi_file, algae_file = app.file_paths

# Print the captured file paths (for verification)
print(f"Humans file: {humans_file}")
print(f"Animals file: {animals_file}")
print(f"Plants file: {plants_file}")
print(f"Minerals file: {minerals_file}")
print(f"Microbes file: {microbes_file}")
print(f"Fungi file: {fungi_file}")
print(f"Algae file: {algae_file}")


Humans file: C:/Users/KS/Start-Data-Analysis.xlsx
Animals file: C:/Users/KS/Source.xlsx
Plants file: C:/Users/KS/sample.xlsx
Minerals file: C:/Users/KS/sam.xlsx
Microbes file: C:/Users/KS/sf2_output.xlsx
Fungi file: C:/Users/KS/modified_workbook1.xlsx
Algae file: C:/Users/KS/modified_workbook.xlsx


In [1]:
import pandas as pd

# Sample data creation for demonstration
data = {
    'Record Type': ['Issue', 'Issue', 'Action', 'Issue', 'Action'],
    'Record ID': [101, 102, 103, 104, 105],
    'Record Name': ['A', 'B', 'C', 'D', 'E'],
    'Record Details': ['Detail1', 'Detail2', 'Detail3', 'Detail4', 'Detail5'],
    'Staff ID': [1, 1, 2, 1, 2],
    'Position Changed': ['Yes', 'No', 'Yes', 'No', 'Yes']
}

# Create DataFrame
df = pd.DataFrame(data)

# Define a function to concatenate the desired fields
def concatenate_records(group):
    return '\n'.join(group['Record Type'] + ' ' + group['Record ID'].astype(str) + ' ' + group['Record Name'])

# Group by 'Staff ID' and apply the concatenation function
grouped_df = df.groupby('Staff ID').apply(lambda x: pd.Series({
    'Control Items': concatenate_records(x),
    'Position Changed': x['Position Changed'].iloc[0]  # Assuming you want to keep the first 'Position Changed' value
})).reset_index()

# Optional: Include any other columns you want to keep from the original DataFrame
# For demonstration, I'm adding 'Record Details' concatenated in the same way
grouped_df['Record Details'] = df.groupby('Staff ID')['Record Details'].apply(lambda x: '\n'.join(x)).reset_index(drop=True)

# Display the final DataFrame
print(grouped_df)

# Save the final DataFrame to an Excel file
# grouped_df.to_excel('path_to_output_file.xlsx', index=False)


   Staff ID                          Control Items Position Changed  \
0         1  Issue 101 A\nIssue 102 B\nIssue 104 D              Yes   
1         2             Action 103 C\nAction 105 E              Yes   

              Record Details  
0  Detail1\nDetail2\nDetail4  
1           Detail3\nDetail5  


In [None]:
import pandas as pd

def lookup_and_merge(file1_path, file2_path, vOld_path):
    # Read the first file into a DataFrame
    df1 = pd.read_excel(file1_path)
    
    # Read the second file into a DataFrame
    df2 = pd.read_excel(file2_path)
    
    # Read the vOld file into a DataFrame
    vOld = pd.read_excel(vOld_path)
    
    # Ensure the Staff ID in both DataFrames are treated as numeric for accurate matching
    df1['Employee ID'] = pd.to_numeric(df1['Employee ID'], errors='coerce').astype('Int64')
    df2['Staff ID'] = pd.to_numeric(df2['Staff ID'], errors='coerce').astype('Int64')
    vOld['Staff ID'] = pd.to_numeric(vOld['Staff ID'], errors='coerce').astype('Int64')
    
    # Perform the lookup by merging the DataFrames on Staff ID and Employee ID
    merged_df = df2.merge(df1[['Employee ID', 'Position Changed']], 
                          left_on='Staff ID', right_on='Employee ID', 
                          how='left')
    
    # Drop the redundant 'Employee ID' column from the merged DataFrame
    merged_df.drop(columns=['Employee ID'], inplace=True)
    
    # Filter the DataFrame to retain only rows where 'Position Changed' is 'Left' or 'Yes'
    filtered_df = merged_df[merged_df['Position Changed'].isin(['Left', 'Yes'])]
    
    # Ensure 'Staff ID' and 'Position Changed' columns are filled properly
    filtered_df['Staff ID'] = filtered_df['Staff ID'].fillna('No ID')
    filtered_df['Position Changed'] = filtered_df['Position Changed'].fillna('No Change')
    
    # Perform the second lookup by merging the filtered_df with vOld on Staff ID
    final_df = filtered_df.merge(vOld[['Staff ID', 'Employee Name', 'Employee Business Email Address', 'Global Career Band']], 
                                 on='Staff ID', 
                                 how='left')
    
    # Rename the fetched columns
    final_df.rename(columns={
        'Employee Name': 'Functional Manager Employee Name',
        'Employee Business Email Address': 'Functional Manager Email',
        'Global Career Band': 'Functional Manager GCB'
    }, inplace=True)
    
    # Return the final DataFrame
    return final_df

# Paths to the Excel files
file1_path = 'path_to_file1.xlsx'  # File with Employee ID, Position Changed, etc.
file2_path = 'path_to_file2.xlsx'  # File with Record ID, Record Name, Record Details, Staff ID
vOld_path = 'path_to_vOld.xlsx'    # File with additional details based on Staff ID

# Perform the lookup, merge, and filter
final_df = lookup_and_merge(file1_path, file2_path, vOld_path)

# Display the final DataFrame
print(final_df)

# Save the final DataFrame to a new Excel file
final_df.to_excel('path_to_output_file.xlsx', index=False)


In [None]:
import pandas as pd

def lookup_and_merge(file1_path, file2_path):
    # Read the first file into a DataFrame
    df1 = pd.read_excel(file1_path)
    
    # Read the second file into a DataFrame
    df2 = pd.read_excel(file2_path)
    
    # Ensure the Staff ID in both DataFrames are strings for accurate matching
    df1['Employee ID'] = df1['Employee ID'].astype(str)
    df2['Staff ID'] = df2['Staff ID'].astype(str)
    
    # Perform the lookup by merging the DataFrames on Staff ID and Employee ID
    merged_df = df2.merge(df1[['Employee ID', 'Position Changed']], 
                          left_on='Staff ID', right_on='Employee ID', 
                          how='left')
    
    # Drop the redundant 'Employee ID' column from the merged DataFrame
    merged_df.drop(columns=['Employee ID'], inplace=True)
    
    # Return the merged DataFrame
    return merged_df

# Paths to the Excel files
file1_path = 'path_to_file1.xlsx'  # File with Employee ID, Position Changed, etc.
file2_path = 'path_to_file2.xlsx'  # File with Record ID, Record Name, Record Details, Staff ID

# Perform the lookup and merge
final_df = lookup_and_merge(file1_path, file2_path)

# Display the final DataFrame
print(final_df)

# If needed, save the final DataFrame to a new Excel file
# final_df.to_excel('path_to_output_file.xlsx', index=False)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Check and print initial dtypes of columns
    print("Initial dtypes of vNew:")
    print(vNew.dtypes)
    print("\nInitial dtypes of vOld:")
    print(vOld.dtypes)
    
    # Function to convert columns to int, skipping invalid rows
    def convert_to_int(df, column):
        df[column] = pd.to_numeric(df[column], errors='coerce')
        df.dropna(subset=[column], inplace=True)
        df[column] = df[column].astype(int)
        return df

    # Convert 'Employee ID' and 'Position Number' to integers
    vNew = convert_to_int(vNew, 'Employee ID')
    vNew = convert_to_int(vNew, 'Position Number')
    vOld = convert_to_int(vOld, 'Employee ID')
    vOld = convert_to_int(vOld, 'Position Number')
    
    # Check and print dtypes of columns after coercion attempt
    print("\nDtypes of vNew after coercion attempt:")
    print(vNew.dtypes)
    print("\nDtypes of vOld after coercion attempt:")
    print(vOld.dtypes)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID using left join
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='left')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID using left join
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='left')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='outer')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

def extract_staff_id(action_owner_col):
    """Extract Staff ID from the given column using regex."""
    if isinstance(action_owner_col, str):
        match = re.search(r'\((\d{8})\)', action_owner_col)
        return match.group(1) if match else None
    return None

def process_single_file(file_path, staff_id_col, issue_status_col, filter_condition,
                        record_id_col, record_name_col, record_details_col, record_type_value):
    """
    Process a single file to extract relevant data and transform it into a standardized format.
    
    Parameters:
        file_path (str): Path to the input Excel file.
        staff_id_col (str): Column name for the Staff ID extraction.
        issue_status_col (str): Column name for filtering records.
        filter_condition (str): Value to filter the records (e.g., 'open').
        record_id_col (str): Column name for 'Record ID'.
        record_name_col (str): Column name for 'Record Name'.
        record_details_col (str): Column name for 'Record Details'.
        record_type_value (str): Value to be placed in 'Record Type' column.
    
    Returns:
        pd.DataFrame: Processed DataFrame with standardized columns.
    """
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    
    # Ensure the staff ID column is treated as string
    df[staff_id_col] = df[staff_id_col].astype(str)
    
    # Filter the DataFrame based on the provided condition
    filtered_df = df[df[issue_status_col] == filter_condition]
    
    # Extract Staff ID from the specified column
    filtered_df['Staff ID'] = filtered_df[staff_id_col].apply(extract_staff_id)
    
    # Add the 'Record Type' column with the specified value
    filtered_df['Record Type'] = record_type_value
    
    # Create the final DataFrame with the required columns
    result_df = filtered_df[[record_id_col, record_name_col, record_details_col]].copy()
    result_df.rename(columns={
        record_id_col: 'Record ID',
        record_name_col: 'Record Name',
        record_details_col: 'Record Details'
    }, inplace=True)
    
    # Add 'Staff ID' and 'Record Type' columns
    result_df['Staff ID'] = filtered_df['Staff ID']
    result_df['Record Type'] = filtered_df['Record Type']
    
    # Return the result DataFrame
    return result_df

def concatenate_files(file_params_list):
    """
    Process multiple files and concatenate the results into a single DataFrame.
    
    Parameters:
        file_params_list (list of dict): List of dictionaries with parameters for each file.
    
    Returns:
        pd.DataFrame: Concatenated DataFrame with results from all files.
    """
    all_dfs = []
    
    for params in file_params_list:
        df = process_single_file(
            file_path=params['file_path'],
            staff_id_col=params['staff_id_col'],
            issue_status_col=params['issue_status_col'],
            filter_condition=params['filter_condition'],
            record_id_col=params['record_id_col'],
            record_name_col=params['record_name_col'],
            record_details_col=params['record_details_col'],
            record_type_value=params['record_type_value']
        )
        all_dfs.append(df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(all_dfs, ignore_index=True)
    
    return final_df

# Example usage
file_params_list = [
    {
        'file_path': 'path_to_file1.xlsx',
        'staff_id_col': 'Action Owner',
        'issue_status_col': 'Issue Status',
        'filter_condition': 'open',
        'record_id_col': 'Issue ID',
        'record_name_col': 'Issue Title',
        'record_details_col': 'Issue Description',
        'record_type_value': 'Issue'
    },
    {
        'file_path': 'path_to_file2.xlsx',
        'staff_id_col': 'Owner',
        'issue_status_col': 'Status',
        'filter_condition': 'open',
        'record_id_col': 'ID',
        'record_name_col': 'Title',
        'record_details_col': 'Description',
        'record_type_value': 'Action Owner'
    },
    # Add parameters for other files here
]

# Process multiple files and get the final DataFrame
final_df = concatenate_files(file_params_list)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

def extract_staff_id(action_owner_col):
    """Extract Staff ID from the given column using regex."""
    if isinstance(action_owner_col, str):
        match = re.search(r'\((\d{8})\)', action_owner_col)
        return match.group(1) if match else None
    return None

def process_file(file_path, staff_id_col, issue_status_col, filter_condition,
                 record_id_col, record_name_col, record_details_col):
    """
    Process a single file to extract relevant data and transform it into a standardized format.
    
    Parameters:
        file_path (str): Path to the input Excel file.
        staff_id_col (str): Column name for the Staff ID extraction.
        issue_status_col (str): Column name for filtering records.
        filter_condition (str): Value to filter the records (e.g., 'open').
        record_id_col (str): Column name for 'Record ID'.
        record_name_col (str): Column name for 'Record Name'.
        record_details_col (str): Column name for 'Record Details'.
    
    Returns:
        pd.DataFrame: Processed DataFrame with standardized columns.
    """
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    
    # Ensure the staff ID column is treated as string
    df[staff_id_col] = df[staff_id_col].astype(str)
    
    # Filter the DataFrame based on the provided condition
    filtered_df = df[df[issue_status_col] == filter_condition]
    
    # Extract Staff ID from the specified column
    filtered_df['Staff ID'] = filtered_df[staff_id_col].apply(extract_staff_id)
    
    # Add the 'Record Type' column with the value 'Issue'
    filtered_df['Record Type'] = 'Issue'
    
    # Create the final DataFrame with the required columns
    result_df = filtered_df[[staff_id_col, record_id_col, record_name_col, record_details_col]].copy()
    result_df.rename(columns={
        staff_id_col: 'Staff ID',
        record_id_col: 'Record ID',
        record_name_col: 'Record Name',
        record_details_col: 'Record Details'
    }, inplace=True)
    
    # Add the 'Record Type' column
    result_df['Record Type'] = 'Issue'
    
    # Return the result DataFrame
    return result_df

def process_multiple_files(file_paths, column_mapping, filter_condition):
    """
    Process multiple files and concatenate the results into a single DataFrame.
    
    Parameters:
        file_paths (list of str): List of paths to input Excel files.
        column_mapping (list of dict): List of dictionaries with column mappings for each file.
        filter_condition (str): Value to filter records (e.g., 'open').
    
    Returns:
        pd.DataFrame: Concatenated DataFrame with results from all files.
    """
    all_dfs = []
    
    for file_path, columns in zip(file_paths, column_mapping):
        df = process_file(
            file_path,
            staff_id_col=columns['staff_id'],
            issue_status_col=columns['issue_status'],
            filter_condition=filter_condition,
            record_id_col=columns['record_id'],
            record_name_col=columns['record_name'],
            record_details_col=columns['record_details']
        )
        all_dfs.append(df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(all_dfs, ignore_index=True)
    
    return final_df

# Example usage
file_paths = [
    'path_to_file1.xlsx',
    'path_to_file2.xlsx',
    'path_to_file3.xlsx',
    'path_to_file4.xlsx',
    'path_to_file5.xlsx'
]

column_mapping = [
    {
        'staff_id': 'Action Owner',  # Column name to extract Staff ID
        'issue_status': 'Issue Status',  # Column name for issue status
        'record_id': 'Issue ID',  # Column name for Record ID
        'record_name': 'Issue Title',  # Column name for Record Name
        'record_details': 'Issue Description'  # Column name for Record Details
    },
    # Add mappings for other files here
]

filter_condition = 'open'  # Condition to filter records

# Process multiple files and get the final DataFrame
final_df = process_multiple_files(file_paths, column_mapping, filter_condition)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

# Function to extract Staff ID using regular expression
def extract_staff_id(action_owner):
    if isinstance(action_owner, str):
        match = re.search(r'\((\d{8})\)', action_owner)
        return match.group(1) if match else None
    return None

# Load the Excel file into a DataFrame
df = pd.read_excel('path_to_your_file.xlsx')

# Ensure "Action Owner" column is treated as string
df['Action Owner'] = df['Action Owner'].astype(str)

# Filter the DataFrame for rows where 'Issue Status' is 'open'
open_issues_df = df[df['Issue Status'] == 'open']

# Extract Staff ID from "Action Owner" column for filtered rows
open_issues_df['Staff ID'] = open_issues_df['Action Owner'].apply(extract_staff_id)

# Add the 'Record Type' column with value 'Issue'
open_issues_df['Record Type'] = 'Issue'

# Create the final DataFrame with the required columns
result_df = open_issues_df[['Staff ID', 'Record Type', 'Issue ID', 'Issue Title', 'Issue Description']]
result_df.rename(columns={
    'Issue ID': 'Record ID',
    'Issue Title': 'Record Name',
    'Issue Description': 'Record Details'
}, inplace=True)

# Display the final DataFrame
print(result_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select relevant columns from vNew and vOld
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge vNew and vOld on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', suffixes=('_new', '_old'), how='left')

# Rename columns to match the required format
merged_df.rename(columns={
    'Position Number_new': 'new Position Number',
    'Level4_new': 'new Level4',
    'Position Number_old': 'old Position Number',
    'Level4_old': 'old Level4'
}, inplace=True)

# Create the "Position Changed" column
merged_df['Position Changed'] = merged_df.apply(
    lambda row: 'New' if pd.isna(row['old Position Number']) or pd.isna(row['old Level4']) else 'No Change', axis=1)

# Fetch records from vOld where Employee ID is not in vNew
not_in_vNew = vOld[~vOld['Employee ID'].isin(vNew['Employee ID'])]

# Rename columns to match the format of merged_df
not_in_vNew.rename(columns={
    'Position Number': 'old Position Number',
    'Level4': 'old Level4'
}, inplace=True)

# Add new columns with default values for records not in vNew
not_in_vNew['new Position Number'] = None
not_in_vNew['new Level4'] = None
not_in_vNew['Position Changed'] = 'Left'

# Reorder columns to match the merged_df structure
not_in_vNew = not_in_vNew[['Employee ID', 'new Position Number', 'new Level4', 'old Position Number', 'old Level4', 'Position Changed']]

# Concatenate the two DataFrames
final_df = pd.concat([merged_df, not_in_vNew], ignore_index=True)

# Display the final DataFrame
print(final_df)


# Display the final DataFrame
print(merged_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select relevant columns from vNew and vOld
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge vNew and vOld on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', suffixes=('_new', '_old'), how='left')

# Rename columns to match the required format
merged_df.rename(columns={'Position Number_new': 'new Position Number',
                          'Level4_new': 'new Level4',
                          'Position Number_old': 'old Position Number',
                          'Level4_old': 'old Level4'}, inplace=True)

# Display the final DataFrame
print(merged_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Ensure column names are properly stripped of leading/trailing spaces
vNew.columns = vNew.columns.str.strip()
vOld.columns = vOld.columns.str.strip()

# Initialize new columns in vOld with default value 'left'
vOld['Pos Check'] = 'left'
vOld['BF4 Check'] = 'left'
vOld['Country Check'] = 'left'

# Perform the lookup and update the values for matching Employee Ids
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Pos Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Position Number'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'BF4 Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['BF Level 4 Name'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Country Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Work Location Country/Territory Name'])

# Create the 'In Scope' column
vOld['In Scope'] = ((vOld['Pos Check'] != vOld['Position Number']) | 
                    (vOld['BF4 Check'] != vOld['BF Level 4 Name']) | 
                    (vOld['Country Check'] != vOld['Work Location Country/Territory Name'])).apply(lambda x: 'Movement' if x else 'No Movement')

# Create 'Position Changed', 'BF Changed', and 'Country Changed' columns
vOld['Position Changed'] = vOld.apply(lambda row: 'Left' if row['Pos Check'] == 'left' else ('Yes' if row['Pos Check'] != row['Position Number'] else 'No'), axis=1)
vOld['BF Changed'] = vOld.apply(lambda row: 'Left' if row['BF4 Check'] == 'left' else ('Yes' if row['BF4 Check'] != row['BF Level 4 Name'] else 'No'), axis=1)
vOld['Country Changed'] = vOld.apply(lambda row: 'Left' if row['Country Check'] == 'left' else ('Yes' if row['Country Check'] != row['Work Location Country/Territory Name'] else 'No'), axis=1)

# Display the updated vOld DataFrame
print(vOld)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Ensure column names are properly stripped of leading/trailing spaces
vNew.columns = vNew.columns.str.strip()
vOld.columns = vOld.columns.str.strip()

# Initialize new columns in vOld with default value 'left'
vOld['Pos Check'] = 'left'
vOld['BF4 Check'] = 'left'
vOld['Country Check'] = 'left'

# Perform the lookup and update the values for matching Employee Ids
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Pos Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Position Number'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'BF4 Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['BF Level 4 Name'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Country Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Work Location Country/Territory Name'])

# Display the updated vOld DataFrame
print(vOld)


In [None]:
import pandas as pd

# Read the new and old data from the Excel workbooks
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select the required columns
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge the dataframes on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', how='outer', suffixes=('_new', '_old'))

# Create the Status column
merged_df['Status'] = merged_df.apply(
    lambda row: 'new' if pd.isna(row['Position Number_old']) and not pd.isna(row['Position Number_new']) else 
                ('left' if pd.isna(row['Position Number_new']) and not pd.isna(row['Position Number_old']) else 
                'existing'), axis=1)

# Save the resulting dataframe to a new Excel file
merged_df.to_excel('path_to_output.xlsx', index=False)


In [None]:
import pandas as pd
from openpyxl import Workbook, load_workbook

# Read the Excel file
file_path = 'your_excel_file.xlsx'  # Replace with your file path
output_file_path = 'grouped_data_with_subtables.xlsx'  # Replace with your desired output file path
excel_data = pd.ExcelFile(file_path)

# Create a Pandas Excel writer object
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    for sheet_name in excel_data.sheet_names:
        # Read each sheet into a DataFrame
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Group the data by 'Mapped L3' and 'Role_temp'
        grouped = df.groupby(['Mapped L3', 'Role_temp'])

        # Count the occurrences of 'High' and 'Low' in the 'High/Low' column
        result = grouped['High/Low'].value_counts().unstack(fill_value=0).fillna(0)

        # Add totals for each row ('Mapped L3', 'Role_temp')
        result['Total'] = result.sum(axis=1)

        # Add a totals row
        total_row = result.sum(axis=0).to_frame().T
        total_row.index = pd.MultiIndex.from_tuples([('Total', '')])
        result = pd.concat([result, total_row])

        # Reset the index to get 'Mapped L3' and 'Role_temp' as columns
        result.reset_index(inplace=True)

        # Ensure 'High' and 'Low' are present, if not add them with default 0 values
        if 'High' not in result.columns:
            result['High'] = 0
        if 'Low' not in result.columns:
            result['Low'] = 0
        result = result[['Mapped L3', 'Role_temp', 'High', 'Low', 'Total']]

        # Write the grouped data to the sheet
        result.to_excel(writer, sheet_name=sheet_name, startrow=0, index=False)
        
        # Load the workbook and access the worksheet
        workbook = writer.book
        worksheet = workbook[sheet_name]

        # Initialize start row for writing subtables
        start_row = len(result) + 2  # Move to the next row after grouped data

        # Create a list to store all subtables
        subtables_list = []

        # Create subtables for each unique 'Mapped L3'
        unique_mapped_l3 = df['Mapped L3'].unique()
        for mapped_l3 in unique_mapped_l3:
            # Skip 'Total' rows if present
            if mapped_l3 == 'Total':
                continue

            # Filter the original DataFrame based on 'Mapped L3'
            mapped_l3_data = df[df['Mapped L3'] == mapped_l3]

            # Insert a heading for unique 'Mapped L3'
            mapped_l3_heading = f"Unique L3 Value: {mapped_l3}"
            subtables_list.append((mapped_l3_heading, None))  # Append heading to list

            # Iterate over each unique 'Role_temp' for the current 'Mapped L3'
            unique_role_temp = mapped_l3_data['Role_temp'].unique()
            for role_temp in unique_role_temp:
                # Filter the data for the current 'Mapped L3' and 'Role_temp'
                subtable_data = mapped_l3_data[mapped_l3_data['Role_temp'] == role_temp]

                # Insert a heading for 'Role_temp'
                role_temp_heading = f"Role Type: {role_temp}"
                subtables_list.append((role_temp_heading, None))  # Append heading to list

                # Append the subtable data to the list
                subtables_list.append((None, subtable_data))

        # Write subtables to Excel with proper gaps
        for item in subtables_list:
            if item[0]:  # If it's a heading
                worksheet.cell(row=start_row, column=1, value=item[0])
                start_row += 2  # 2-line gap before next heading or subtable
            elif item[1] is not None:  # If it's subtable data
                item[1].to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False, header=True)
                start_row += len(item[1]) + 1  # 1-line gap after subtable

print(f"Grouped data with subtables has been saved to {output_file_path}")


In [None]:
import pandas as pd

# Read the Excel file
file_path = 'your_excel_file.xlsx'  # Replace with your file path
output_file_path = 'grouped_data_with_subtables.xlsx'  # Replace with your desired output file path
excel_data = pd.ExcelFile(file_path)

# Create a Pandas Excel writer object
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    for sheet_name in excel_data.sheet_names:
        # Read each sheet into a DataFrame
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Group the data by 'Mapped L3' and 'Role_temp'
        grouped = df.groupby(['Mapped L3', 'Role_temp'])

        # Count the occurrences of 'High' and 'Low' in the 'High/Low' column
        result = grouped['High/Low'].value_counts().unstack(fill_value=0).fillna(0)

        # Add totals for each row ('Mapped L3', 'Role_temp')
        result['Total'] = result.sum(axis=1)

        # Add a totals row
        total_row = result.sum(axis=0).to_frame().T
        total_row.index = pd.MultiIndex.from_tuples([('Total', '')])
        result = pd.concat([result, total_row])

        # Reset the index to get 'Mapped L3' and 'Role_temp' as columns
        result.reset_index(inplace=True)

        # Ensure 'High' and 'Low' are present, if not add them with default 0 values
        if 'High' not in result.columns:
            result['High'] = 0
        if 'Low' not in result.columns:
            result['Low'] = 0
        result = result[['Mapped L3', 'Role_temp', 'Total', 'High', 'Low']]

        # Initialize start row for writing grouped data
        start_row = 0

        # Write the grouped data to the sheet
        result.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)
        start_row += len(result) + 2  # Move to the next row after grouped data

        # Create subtables for each unique 'Mapped L3'
        unique_mapped_l3 = df['Mapped L3'].unique()
        for mapped_l3 in unique_mapped_l3:
            # Skip 'Total' rows if present
            if mapped_l3 == 'Total':
                continue

            # Filter the original DataFrame based on 'Mapped L3'
            mapped_l3_data = df[df['Mapped L3'] == mapped_l3]

            # Insert a heading for unique 'Mapped L3'
            mapped_l3_heading = f"Unique L3 Value: {mapped_l3}"
            worksheet = writer.sheets[sheet_name]
            worksheet.cell(row=start_row, column=1, value=mapped_l3_heading)

            # Move to the next row after writing unique L3 heading
            start_row += 2

            # Iterate over each unique 'Role_temp' for the current 'Mapped L3'
            unique_role_temp = mapped_l3_data['Role_temp'].unique()
            for role_temp in unique_role_temp:
                # Filter the data for the current 'Mapped L3' and 'Role_temp'
                subtable_data = mapped_l3_data[mapped_l3_data['Role_temp'] == role_temp]

                # Insert a heading for 'Role_temp'
                role_temp_heading = f"Role Type: {role_temp}"
                worksheet.cell(row=start_row, column=1, value=role_temp_heading)

                # Move to the next row after writing role_temp heading
                start_row += 2

                # Insert an empty row to separate headings from subtable data
                worksheet.cell(row=start_row, column=1, value="")

                # Move to the next row after inserting empty row
                start_row += 1

                # Write the subtable data to the sheet
                subtable_data.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)

                # Move to the next row after writing subtable data
                start_row += len(subtable_data) + 2  # Add extra space after each subtable

print(f"Grouped data with subtables has been saved to {output_file_path}")


In [12]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4, 7],
    'colB': [10, 20, 30, 40, 50],
    'colS': [None, None, None, None, None]  # Initially None or some default value
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    'colPM': [7, 8, 9, 10, 11, 12]
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA, colR, and colPM are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)
df2['colPM'] = df2['colPM'].astype(str)

# Initial merge using colA and colR
df_merged_initial = pd.merge(df1, df2[['colR', 'colS']], left_on='colA', right_on='colR', how='left')

# Update df1's colS with the values from initial merge
df1['colS'] = df_merged_initial['colS_y']

# Identify rows where colS is still NaN
mask = df1['colS'].isna()

# Perform the second merge using colA and colPM for the NaN rows
df_merged_second = pd.merge(df1[mask], df2[['colPM', 'colS']], left_on='colA', right_on='colPM', how='left')

# Update colS in df1 for the NaN rows with values from the second merge
df1.loc[mask, 'colS'] = df_merged_second['colS_y']

print(df1)


  colA  colB colS
0    1    10  NaN
1    2    20  NaN
2    3    30    a
3    4    40    b
4    7    50    c


In [14]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4, 7],
    'colB': [10, 20, 30, 40, 50],
    'colS': [None, '', ' ', '   ', '']  # Different types of empty or blank values
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    'colPM': [7, 8, 9, 10, 11, 12]
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA, colR, and colPM are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)
df2['colPM'] = df2['colPM'].astype(str)

# Initial merge using colA and colR
df_merged_initial = pd.merge(df1, df2[['colR', 'colS']], left_on='colA', right_on='colR', how='left')

# Update df1's colS with the values from initial merge
df1['colS'] = df_merged_initial['colS_y']

# Identify rows where colS is empty or blank
mask = df1['colS'].apply(lambda x: x == '' or x.isspace() if isinstance(x, str) else False)

# Perform the second merge using colA and colPM for the empty or blank rows
df_merged_second = pd.merge(df1[mask], df2[['colPM', 'colS']], left_on='colA', right_on='colPM', how='left')

# Update colS in df1 for the empty or blank rows with values from the second merge
df1.loc[mask, 'colS'] = df_merged_second['colS_y']

print(df1)


  colA  colB colS
0    1    10  NaN
1    2    20  NaN
2    3    30    a
3    4    40    b
4    7    50    c


In [6]:
df_merged_pm

Unnamed: 0,colA,colB,colS_x,colPM,colS_y
0,1,10,,,
1,2,20,,,
2,7,50,,7.0,a


In [1]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4],
    'colB': [10, 20, 30, 40],
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA and colR are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)

# Select only the required columns from df2
df2_selected = df2[['colR', 'colS']].drop_duplicates(subset='colR')

# Merge df1 with df2_selected on colA and colR
df_merged = pd.merge(df1, df2_selected, left_on='colA', right_on='colR', how='left')

# Assign the values to the new column in df1 and drop any extra columns
df1['colS1'] = df_merged['colS']

# Drop the extra merge column if needed (not strictly necessary, but clean)
df1.drop(columns=['colR'], inplace=True, errors='ignore')

print(df1)


  colA  colB colS1
0    1    10   NaN
1    2    20   NaN
2    3    30     a
3    4    40     c


In [1]:
import pandas as pd

# Sample data for df1
data1 = {'colA': [1, 2, 3, 4]}
df1 = pd.DataFrame(data1)

# Sample data for df2 with non-unique colR values
data2 = {'colR': [3, 3, 4, 5, 6, 6], 'colS': ['a', 'b', 'c', 'd', 'e', 'f']}
df2 = pd.DataFrame(data2)

# Merge df1 with df2 on the condition that df1['colA'] matches df2['colR']
df_merged = pd.merge(df1, df2, left_on='colA', right_on='colR', how='left')

# Select only relevant columns and rename them
df1['colS1'] = df_merged['colS']

print(df1)


   colA colS1
0     1   NaN
1     2   NaN
2     3     a
3     4     b


In [2]:
import pandas as pd

# Sample DataFrames
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'colA': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'colB': ['B1', 'B2', 'B3', 'B4', 'B5'],
    'colC': ['C1', 'C2', 'C3', 'C4', 'C5']
})

df2 = pd.DataFrame({
    'id': [6, 7, 8],
    'colX': ['X6', 'X7', 'X8'],
    'colY': ['Y6', 'Y7', 'Y8'],
    'colZ': ['Z6', 'Z7', 'Z8']
})

# Columns to copy from df1 and their corresponding columns in df2
columns_to_copy = {
    'id': 'id',
    'colB': 'colY',
    'colC': 'colZ'
}

# Create a new DataFrame with the selected columns from df1
new_rows = df1[list(columns_to_copy.keys())].copy()

# Rename the columns in the new DataFrame to match the column names in df2
new_rows.rename(columns=columns_to_copy, inplace=True)

# Append the new DataFrame to df2
df2 = df2.append(new_rows, ignore_index=True)

# Display the updated df2
print(df2)


   id colX colY colZ
0   6   X6   Y6   Z6
1   7   X7   Y7   Z7
2   8   X8   Y8   Z8
3   1  NaN   B1   C1
4   2  NaN   B2   C2
5   3  NaN   B3   C3
6   4  NaN   B4   C4
7   5  NaN   B5   C5
