In [None]:
# Iterate through the current month's data
for index, current_record in current_df.iterrows():
    # Generate the combined emp_pos_ID for current and next data
    current_emp_pos = current_record["emp_pos_ID"]
    next_record = next_df[next_df["emp_pos_ID"] == current_emp_pos]

    # If exact emp_pos_ID doesn't exist in the next month
    if next_record.empty:
        # Check if the Employee ID exists but with a different Position ID
        employee_matches = next_df[next_df["Employee ID"] == current_record["Employee ID"]]
        if not employee_matches.empty:
            # Mobility case: Employee has changed Position ID
            new_pos_id = employee_matches.iloc[0]["Position ID"]
            changes.append({
                "Position ID": current_record["Position ID"],
                "Employee ID": current_record["Employee ID"],
                "Month Changed": month,
                "Change Description": f"Mobility: Position ID changed to {new_pos_id}",
                "Source": "GHA",
            })
            continue

        # Employee ID does not exist in the next month
        changes.append({
            "Position ID": current_record["Position ID"],
            "Employee ID": current_record["Employee ID"],
            "Month Changed": month,
            "Change Description": "Employee ID does not exist, movement out",
            "Source": "GHA",
        })
        continue

    # Convert to a single next record for further comparison
    next_record = next_record.iloc[0]

    # Initialize the change description list
    change_desc = []

    # Compare other important columns for changes
    for col in comparison_columns:
        if pd.notna(current_record[col]) and pd.notna(next_record[col]) and current_record[col] != next_record[col]:
            change_desc.append(f"{col} changed")

    # Append to changes list if any differences are found
    if change_desc:
        changes.append({
            "Position ID": current_record["Position ID"],
            "Employee ID": current_record["Employee ID"],
            "Month Changed": month,
            "Change Description": "; ".join(change_desc),
            "Source": "GHA",
        })


In [None]:
def standardize_columns(df, source_type):
    # Standardize column names
    df = df.rename(columns={
        "Position Number": "Position ID",
        "Employee Global Career Band": "Global Career Band"  # For open positions
    })
    
    # Clean Position ID and Employee ID
    if "Position ID" in df.columns:
        df["Position ID"] = df["Position ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    if "Employee ID" in df.columns:
        df["Employee ID"] = df["Employee ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    
    # Add missing columns if not present
    required_cols = ["Position ID", "Employee ID"]  # Add more as needed
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""
    
    return df

def find_changes_gha(current_df, next_df, cols_to_check, month):
    changes = []
    
    # Standardize column names
    current_df = standardize_columns(current_df, "GHA")
    next_df = standardize_columns(next_df, "GHA")
    
    # Concatenate Employee ID and Position ID to create unique identifiers
    current_df["Emp_Pos_ID"] = current_df["Employee ID"] + "_" + current_df["Position ID"]
    next_df["Emp_Pos_ID"] = next_df["Employee ID"] + "_" + next_df["Position ID"]

    # Compare based on Emp_Pos_ID
    current_set = set(current_df["Emp_Pos_ID"])
    next_set = set(next_df["Emp_Pos_ID"])
    
    # Identify records in current_df but not in next_df
    for emp_pos_id in current_set - next_set:
        record = current_df[current_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        employee_id = record["Employee ID"]
        
        # Check if the Employee ID exists in next_df
        if employee_id not in next_df["Employee ID"].values:
            changes.append({
                "Position ID": record["Position ID"],
                "Employee ID": employee_id,
                "Month Changed": month,
                "Change Description": "Employee ID does not exist, movement out",
                "Source": "GHA"
            })
    
    # Identify records in both but with changes in other columns
    common_ids = current_set & next_set
    for emp_pos_id in common_ids:
        current_record = current_df[current_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        next_record = next_df[next_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        
        # Check for changes in important columns
        change_desc = []
        for col in cols_to_check:
            if col in current_record and col in next_record:
                if current_record[col] != next_record[col]:
                    change_desc.append(f"{col} changed")
        
        if change_desc:
            changes.append({
                "Position ID": current_record["Position ID"],
                "Employee ID": current_record["Employee ID"],
                "Month Changed": month,
                "Change Description": "; ".join(change_desc),
                "Source": "GHA"
            })
    
    return pd.DataFrame(changes)

def find_changes_open_positions(current_df, next_df, cols_to_check, month):
    changes = []
    
    # Standardize column names for open positions
    current_df = standardize_columns(current_df, "Open Positions")
    next_df = standardize_columns(next_df, "Open Positions")
    
    # Compare based on Position ID
    current_set = set(current_df["Position ID"])
    next_set = set(next_df["Position ID"])
    
    # Identify records in current_df but not in next_df
    for pos_id in current_set - next_set:
        record = current_df[current_df["Position ID"] == pos_id].iloc[0]
        changes.append({
            "Position ID": pos_id,
            "Month Changed": month,
            "Change Description": "Position ID removed",
            "Source": "Open Positions"
        })
    
    # Identify records in both but with changes in other columns
    common_ids = current_set & next_set
    for pos_id in common_ids:
        current_record = current_df[current_df["Position ID"] == pos_id].iloc[0]
        next_record = next_df[next_df["Position ID"] == pos_id].iloc[0]
        
        # Check for changes in important columns
        change_desc = []
        for col in cols_to_check:
            if col in current_record and col in next_record:
                if current_record[col] != next_record[col]:
                    change_desc.append(f"{col} changed")
        
        if change_desc:
            changes.append({
                "Position ID": pos_id,
                "Month Changed": month,
                "Change Description": "; ".join(change_desc),
                "Source": "Open Positions"
            })
    
    return pd.DataFrame(changes)

def process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos):
    changes_df = pd.DataFrame()
    
    months = list(monthly_files.keys())
    for i in range(len(months) - 1):
        current_month = months[i]
        next_month = months[i + 1]
        
        gha_current_file, open_pos_current_file = monthly_files[current_month]
        gha_next_file, open_pos_next_file = monthly_files[next_month]
        
        # Read files (assuming single sheet for simplicity)
        gha_current_df = pd.read_excel(gha_current_file, sheet_name="Headcount - Employee Detail")
        gha_next_df = pd.read_excel(gha_next_file, sheet_name="Headcount - Employee Detail")
        open_pos_current_df = pd.read_excel(open_pos_current_file)
        open_pos_next_df = pd.read_excel(open_pos_next_file)
        
        # Process GHA
        gha_changes = find_changes_gha(gha_current_df, gha_next_df, cols_to_check_gha, next_month)
        changes_df = pd.concat([changes_df, gha_changes], ignore_index=True)
        
        # Process Open Positions (use your existing logic here)
        open_pos_changes = find_changes_open_positions(open_pos_current_df, open_pos_next_df, cols_to_check_open_pos, next_month)
        changes_df = pd.concat([changes_df, open_pos_changes], ignore_index=True)
    
    return changes_df

# Example usage
monthly_files = {
    "Jan 24": ("gha_jan.xlsx", "open_pos_jan.xlsx"),
    "Feb 24": ("gha_feb.xlsx", "open_pos_feb.xlsx"),
    # Add other months...
}

cols_to_check_gha = ["Global Career Band", "Employee FTE", "Employee Type", "Work Location City"]
cols_to_check_open_pos = ["Global Career Band", "Work Location City"]

changes_df = process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos)
changes_df.to_excel("changes_output.xlsx", index=False)


In [None]:
def standardize_columns(df, source_type):
    # Standardize column names
    df = df.rename(columns={
        "Position Number": "Position ID",
        "Employee Global Career Band": "Global Career Band"  # For open positions
    })
    
    # Clean Position ID and Employee ID
    if "Position ID" in df.columns:
        df["Position ID"] = df["Position ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    if "Employee ID" in df.columns:
        df["Employee ID"] = df["Employee ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    
    # Add missing columns if not present
    required_cols = ["Position ID", "Employee ID"]  # Add more as needed
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""
    
    return df

def find_changes_gha(current_df, next_df, cols_to_check, month):
    changes = []
    
    # Standardize column names
    current_df = standardize_columns(current_df, "GHA")
    next_df = standardize_columns(next_df, "GHA")
    
    # Concatenate Employee ID and Position ID to create unique identifiers
    current_df["Emp_Pos_ID"] = current_df["Employee ID"] + "_" + current_df["Position ID"]
    next_df["Emp_Pos_ID"] = next_df["Employee ID"] + "_" + next_df["Position ID"]

    # Compare based on Emp_Pos_ID
    current_set = set(current_df["Emp_Pos_ID"])
    next_set = set(next_df["Emp_Pos_ID"])
    
    # Identify records in current_df but not in next_df
    for emp_pos_id in current_set - next_set:
        record = current_df[current_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        employee_id = record["Employee ID"]
        
        # Check if the Employee ID exists in next_df
        if employee_id not in next_df["Employee ID"].values:
            changes.append({
                "Position ID": record["Position ID"],
                "Employee ID": employee_id,
                "Month Changed": month,
                "Change Description": "Employee ID does not exist, movement out",
                "Source": "GHA"
            })
    
    # Identify records in both but with changes in other columns
    common_ids = current_set & next_set
    for emp_pos_id in common_ids:
        current_record = current_df[current_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        next_record = next_df[next_df["Emp_Pos_ID"] == emp_pos_id].iloc[0]
        
        # Check for changes in important columns
        change_desc = []
        for col in cols_to_check:
            if col in current_record and col in next_record:
                if current_record[col] != next_record[col]:
                    change_desc.append(f"{col} changed")
        
        if change_desc:
            changes.append({
                "Position ID": current_record["Position ID"],
                "Employee ID": current_record["Employee ID"],
                "Month Changed": month,
                "Change Description": "; ".join(change_desc),
                "Source": "GHA"
            })
    
    return pd.DataFrame(changes)

def process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos):
    changes_df = pd.DataFrame()
    
    months = list(monthly_files.keys())
    for i in range(len(months) - 1):
        current_month = months[i]
        next_month = months[i + 1]
        
        gha_current_file, open_pos_current_file = monthly_files[current_month]
        gha_next_file, open_pos_next_file = monthly_files[next_month]
        
        # Read files (assuming single sheet for simplicity)
        gha_current_df = pd.read_excel(gha_current_file, sheet_name="Headcount - Employee Detail")
        gha_next_df = pd.read_excel(gha_next_file, sheet_name="Headcount - Employee Detail")
        open_pos_current_df = pd.read_excel(open_pos_current_file)
        open_pos_next_df = pd.read_excel(open_pos_next_file)
        
        # Process GHA
        gha_changes = find_changes_gha(gha_current_df, gha_next_df, cols_to_check_gha, next_month)
        changes_df = pd.concat([changes_df, gha_changes], ignore_index=True)
        
        # Process Open Positions (unchanged)
        open_pos_changes = find_changes(open_pos_current_df, open_pos_next_df, cols_to_check_open_pos, next_month, "Open Positions")
        changes_df = pd.concat([changes_df, open_pos_changes], ignore_index=True)
    
    return changes_df

# Example usage
monthly_files = {
    "Jan 24": ("gha_jan.xlsx", "open_pos_jan.xlsx"),
    "Feb 24": ("gha_feb.xlsx", "open_pos_feb.xlsx"),
    # Add other months...
}

cols_to_check_gha = ["Global Career Band", "Employee FTE", "Employee Type", "Work Location City"]
cols_to_check_open_pos = ["Global Career Band", "Work Location City"]

changes_df = process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos)
changes_df.to_excel("changes_output.xlsx", index=False)


In [None]:
import pandas as pd

def standardize_columns(df, source_type):
    # Standardize column names
    df = df.rename(columns={
        "Position Number": "Position ID",
        "Employee Global Career Band": "Global Career Band"  # For open positions
    })
    
    # Clean Position ID and Employee ID
    if "Position ID" in df.columns:
        df["Position ID"] = df["Position ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    if "Employee ID" in df.columns:
        df["Employee ID"] = df["Employee ID"].astype(str).str.replace(r"\.0$", "", regex=True).str.strip()
    
    # Add missing columns if not present
    required_cols = ["Position ID", "Employee ID"]  # Add more as needed
    for col in required_cols:
        if col not in df.columns:
            df[col] = ""
    
    return df

def find_changes(current_df, next_df, cols_to_check, month, source_type):
    changes = []
    
    # Standardize column names
    current_df = standardize_columns(current_df, source_type)
    next_df = standardize_columns(next_df, source_type)

    # Merge on Position ID (and Employee ID if GHA)
    merge_cols = ["Position ID"]
    if source_type == "GHA":
        merge_cols.append("Employee ID")
    
    merged = pd.merge(current_df, next_df, on=merge_cols, how="outer", suffixes=("_current", "_next"), indicator=True)
    
    for _, row in merged.iterrows():
        if row["_merge"] == "right_only":
            # New record in the next month, skip it
            continue
        
        if row["_merge"] == "left_only":
            # Position/Employee no longer present
            continue
        
        # Check for changes in important columns
        change_desc = []
        for col in cols_to_check:
            col_current = f"{col}_current"
            col_next = f"{col}_next"
            if col_current in row and col_next in row and row[col_current] != row[col_next]:
                change_desc.append(f"{col} changed")
        
        if change_desc:
            changes.append({
                "Position ID": row["Position ID"],
                "Employee ID": row.get("Employee ID", ""),
                "Month Changed": month,
                "Change Description": "; ".join(change_desc),
                "Source": source_type
            })
    
    return pd.DataFrame(changes)

def process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos):
    changes_df = pd.DataFrame()
    
    months = list(monthly_files.keys())
    for i in range(len(months) - 1):
        current_month = months[i]
        next_month = months[i + 1]
        
        gha_current_file, open_pos_current_file = monthly_files[current_month]
        gha_next_file, open_pos_next_file = monthly_files[next_month]
        
        # Read files (assuming single sheet for simplicity)
        gha_current_df = pd.read_excel(gha_current_file, sheet_name="Headcount - Employee Detail")
        gha_next_df = pd.read_excel(gha_next_file, sheet_name="Headcount - Employee Detail")
        open_pos_current_df = pd.read_excel(open_pos_current_file)
        open_pos_next_df = pd.read_excel(open_pos_next_file)
        
        # Process GHA
        gha_changes = find_changes(gha_current_df, gha_next_df, cols_to_check_gha, next_month, "GHA")
        changes_df = pd.concat([changes_df, gha_changes], ignore_index=True)
        
        # Process Open Positions
        open_pos_changes = find_changes(open_pos_current_df, open_pos_next_df, cols_to_check_open_pos, next_month, "Open Positions")
        changes_df = pd.concat([changes_df, open_pos_changes], ignore_index=True)
    
    return changes_df

# Example usage
monthly_files = {
    "Jan 24": ("gha_jan.xlsx", "open_pos_jan.xlsx"),
    "Feb 24": ("gha_feb.xlsx", "open_pos_feb.xlsx"),
    # Add other months...
}

cols_to_check_gha = ["Global Career Band", "Employee FTE", "Employee Type", "Work Location City"]
cols_to_check_open_pos = ["Global Career Band", "Work Location City"]

changes_df = process_monthly_files(monthly_files, cols_to_check_gha, cols_to_check_open_pos)
changes_df.to_excel("changes_output.xlsx", index=False)


In [None]:
import pandas as pd

def standardize_ids(df):
    """
    Standardize 'Position ID' and 'Emp`loyee ID' columns.
    Removes extra decimal points, leading/trailing spaces, 
    and converts IDs to string format with uniform length.
    """
    if "Position ID" in df.columns:
        df["Position ID"] = df["Position ID"].astype(str).str.split('.').str[0].str.strip()
    if "Employee ID" in df.columns:
        df["Employee ID"] = df["Employee ID"].astype(str).str.split('.').str[0].str.strip()
    return df

def find_new_records(file_df, master_df, month, source):
    """
    Find new records in the current month's file compared to the master dataframe.
    """
    # Standardize IDs for comparison
    file_df = standardize_ids(file_df)
    master_df = standardize_ids(master_df)

    # Add missing columns to file_df to match master_df
    for col in master_df.columns:
        if col not in file_df.columns:
            file_df[col] = ""

    # Drop columns not in master_df
    file_df = file_df[master_df.columns]

    # Identify new records (not in master_df)
    new_records = file_df[~file_df["Position ID"].isin(master_df["Position ID"])]
    new_records["Month Added"] = month
    new_records["Source"] = source

    # Ensure new_records matches the structure of new_df
    new_records = new_records.reindex(columns=master_df.columns.tolist() + ["Month Added", "Source"], fill_value="")

    return new_records

def find_changed_records(current_df, next_df, cols_to_check, month, source):
    """
    Find records with changes in the next month's file compared to the current month's file.
    """
    # Standardize IDs for comparison
    current_df = standardize_ids(current_df)
    next_df = standardize_ids(next_df)

    # Merge both dataframes on 'Position ID' and 'Employee ID' for comparison
    merged = pd.merge(current_df, next_df, on=["Position ID", "Employee ID"], suffixes=("_current", "_next"), how="inner")

    changed_records = []
    for _, row in merged.iterrows():
        changed_cols = [
            col for col in cols_to_check
            if row[f"{col}_current"] != row[f"{col}_next"]
        ]
        if changed_cols:
            updated_row = row.to_dict()
            updated_row["Month Changed"] = month
            updated_row["Source"] = source
            updated_row["Cols Changed"] = ", ".join(changed_cols)
            changed_records.append(updated_row)

    # Convert changed records to a DataFrame
    if changed_records:
        changes_df = pd.DataFrame(changed_records)
        changes_df = changes_df.reindex(columns=current_df.columns.tolist() + ["Month Changed", "Source", "Cols Changed"], fill_value="")
        return changes_df
    else:
        return pd.DataFrame(columns=current_df.columns.tolist() + ["Month Changed", "Source", "Cols Changed"])

def process_monthly_files(master_df, monthly_files, cols_to_check_gha, cols_to_check_open_pos):
    """
    Process monthly files to find new and changed records.
    """
    new_df = pd.DataFrame(columns=master_df.columns.tolist() + ["Month Added", "Source"])
    changes_df = pd.DataFrame(columns=master_df.columns.tolist() + ["Month Changed", "Source", "Cols Changed"])

    for i in range(len(monthly_files) - 1):
        current_month, current_file = monthly_files[i]
        next_month, next_file = monthly_files[i + 1]

        # Process GHA files
        if "gha" in current_file.lower():
            current_df = pd.read_excel(current_file)
            next_df = pd.read_excel(next_file)

            # Find new records
            new_df = pd.concat([new_df, find_new_records(current_df, master_df, current_month, "GHA")], ignore_index=True)

            # Find changed records
            changes_df = pd.concat([changes_df, find_changed_records(current_df, next_df, cols_to_check_gha, next_month, "GHA")], ignore_index=True)

        # Process Open Position files
        if "open position" in current_file.lower():
            current_df = pd.read_excel(current_file)
            next_df = pd.read_excel(next_file)

            # Find new records
            new_df = pd.concat([new_df, find_new_records(current_df, master_df, current_month, "Open Position")], ignore_index=True)

            # Find changed records
            changes_df = pd.concat([changes_df, find_changed_records(current_df, next_df, cols_to_check_open_pos, next_month, "Open Position")], ignore_index=True)

    return new_df, changes_df

# Example usage:
if __name__ == "__main__":
    # Load master file
    master_df = pd.read_excel("master_file.xlsx")

    # List of monthly files with format [(month, file_path), ...]
    monthly_files = [
        ("Jan 24", "jan_24_gha.xlsx"),
        ("Feb 24", "feb_24_gha.xlsx"),
        ("Mar 24", "mar_24_gha.xlsx"),
        # Add more files as needed
    ]

    # Columns to check for GHA and Open Position files
    cols_to_check_gha = ["BF Level 3", "BF Level 4", "Cost Center"]
    cols_to_check_open_pos = ["Position Name", "Job Level"]

    # Process the files
    new_df, changes_df = process_monthly_files(master_df, monthly_files, cols_to_check_gha, cols_to_check_open_pos)

    # Save results
    new_df.to_excel("new_records.xlsx", index=False)
    changes_df.to_excel("changed_records.xlsx", index=False)


In [None]:
def process_monthly_file(file_df, master_df, month, source_type, updated_records, new_records):
    """
    Process the monthly file and compare it with the master dataframe to identify changes or additions.

    Args:
        file_df (DataFrame): The dataframe for the current month's data.
        master_df (DataFrame): The master dataframe containing previous records.
        month (str): The current month being processed (e.g., "Feb 24").
        source_type (str): The type of data source (e.g., "GHA", "Open Position").
        updated_records (list): List to store records that have been updated.
        new_records (list): List to store records that are new.

    Returns:
        None: Updates `updated_records` and `new_records` in place.
    """
    # Normalize blanks and NaN in both dataframes
    file_df = file_df.fillna("").replace("nan", "")
    master_df = master_df.fillna("").replace("nan", "")

    for _, row in file_df.iterrows():
        if row["Position ID"] in master_df["Position ID"].values:
            # Fetch the corresponding row in the master file
            master_row = master_df[master_df["Position ID"] == row["Position ID"]].iloc[0]

            # Identify columns that have changed
            cols_changed = []
            for col in file_df.columns:
                if col in master_df.columns and row[col] != master_row[col]:
                    cols_changed.append(col)

            if cols_changed:
                # Record the updated row
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = ", ".join(cols_changed)
                updated_row["Source"] = source_type
                updated_records.append(updated_row)
        else:
            # Check if Employee ID exists in master but Position ID has changed
            if "Employee ID" in row and not master_df[master_df["Employee ID"] == row["Employee ID"]].empty:
                # Fetch the corresponding row in master where Employee ID matches
                master_row = master_df[master_df["Employee ID"] == row["Employee ID"]].iloc[0]

                # Check if Position ID has actually changed
                if master_row["Position ID"] != row["Position ID"]:
                    updated_row = row.to_dict()
                    updated_row["Month Changed"] = month
                    updated_row["Cols Changed"] = "Position ID"
                    updated_row["Source"] = source_type
                    updated_records.append(updated_row)
            else:
                # Handle cases where both Employee ID and Position ID are blank or unchanged
                if (
                    "Employee ID" in row
                    and row["Employee ID"] == ""  # Current file has blank Employee ID
                    and not master_df[(master_df["Position ID"] == row["Position ID"]) & (master_df["Employee ID"] == "")].empty
                ):
                    # Skip adding to updated_records because nothing has changed
                    continue

                # Add new record if no match found
                new_row = row.to_dict()
                new_row["Month Added"] = month
                new_row["Source"] = source_type
                new_records.append(new_row)


In [None]:
def file_function_and_new_records(file_df, master_df, month, source_type):
    """
    Process the monthly file and compare it with the master dataframe to identify changes or additions.

    Args:
        file_df (DataFrame): The dataframe for the current month's data.
        master_df (DataFrame): The master dataframe containing previous records.
        month (str): The current month being processed (e.g., "Feb 24").
        source_type (str): The type of data source (e.g., "GHA", "Open Position").

    Returns:
        updated_records (list): List of records that have been updated.
        new_records (list): List of records that are new.
    """
    updated_records = []
    new_records = []

    # Normalize blanks and NaN in both dataframes
    file_df = file_df.fillna("").replace("nan", "")
    master_df = master_df.fillna("").replace("nan", "")

    for _, row in file_df.iterrows():
        if row["Position ID"] in master_df["Position ID"].values:
            # Fetch the corresponding row in the master file
            master_row = master_df[master_df["Position ID"] == row["Position ID"]].iloc[0]

            # Identify columns that have changed
            cols_changed = []
            for col in file_df.columns:
                if col in master_df.columns and row[col] != master_row[col]:
                    cols_changed.append(col)

            if cols_changed:
                # Record the updated row
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = ", ".join(cols_changed)
                updated_row["Source"] = source_type
                updated_records.append(updated_row)
        else:
            # Check if Employee ID exists in master but Position ID has changed
            if "Employee ID" in row and not master_df[master_df["Employee ID"] == row["Employee ID"]].empty:
                # Fetch the corresponding row in master where Employee ID matches
                master_row = master_df[master_df["Employee ID"] == row["Employee ID"]].iloc[0]

                # Check if Position ID has actually changed
                if master_row["Position ID"] != row["Position ID"]:
                    updated_row = row.to_dict()
                    updated_row["Month Changed"] = month
                    updated_row["Cols Changed"] = "Position ID"
                    updated_row["Source"] = source_type
                    updated_records.append(updated_row)
            else:
                # Handle cases where both Employee ID and Position ID are blank or unchanged
                if (
                    "Employee ID" in row
                    and row["Employee ID"] == ""  # Current file has blank Employee ID
                    and not master_df[(master_df["Position ID"] == row["Position ID"]) & (master_df["Employee ID"] == "")].empty
                ):
                    # Skip adding to updated_records because nothing has changed
                    continue

                # Add new record if no match found
                new_row = row.to_dict()
                new_row["Month Added"] = month
                new_row["Source"] = source_type
                new_records.append(new_row)

    return updated_records, new_records


In [None]:
else:
    # Check if Employee ID exists in master but Position ID has changed
    if "Employee ID" in row and not master_df[master_df["Employee ID"] == row["Employee ID"]].empty:
        # Fetch the corresponding row in master where Employee ID matches
        master_row = master_df[master_df["Employee ID"] == row["Employee ID"]].iloc[0]
        
        # Check if Position ID has actually changed
        if master_row["Position ID"] != row["Position ID"]:
            updated_row = row.to_dict()
            updated_row["Month Changed"] = month
            updated_row["Cols Changed"] = "Position ID"
            updated_row["Source"] = source_type
            updated_records.append(updated_row)
    else:
        # Handle cases where both Employee ID and Position ID are blank or unchanged
        if (
            "Employee ID" in row
            and row["Employee ID"] == ""  # Current file has blank Employee ID
            and not master_df[(master_df["Position ID"] == row["Position ID"]) & (master_df["Employee ID"] == "")].empty
        ):
            # Skip adding to updated_records because nothing has changed
            continue

        # Add new record if no match found
        new_row = row.to_dict()
        new_row["Month Added"] = month
        new_row["Source"] = source_type
        new_records.append(new_row)


In [44]:
import pandas as pd

# Load data
master_df = pd.read_excel(r"Input\\" + "master_file.xlsx")
feb24_gha_df = pd.read_excel(r"Input\\" + "Feb24_gha.xlsx")
feb24_open_df = pd.read_excel(r"Input\\" + "Feb24_open.xlsx")
mar24_gha_df = pd.read_excel(r"Input\\" + "Mar24_gha.xlsx")
mar24_open_df = pd.read_excel(r"Input\\" + "Mar24_open.xlsx")

# Strip spaces from co`lumn names to avoid mismatch
master_df.columns = master_df.columns.str.strip()
feb24_gha_df.columns = feb24_gha_df.columns.str.strip()
feb24_open_df.columns = feb24_open_df.columns.str.strip()
mar24_gha_df.columns = mar24_gha_df.columns.str.strip()
mar24_open_df.columns = mar24_open_df.columns.str.strip()

# Part 1: Find updated records (matching Position ID and Employee ID, but with changes in other columns)
def find_updated_records(current_df, master_df):
    # Merge dataframes on Position ID and Employee ID to compare current vs master
    merged_df = current_df.merge(master_df, on=['Position ID', 'Employee ID'], how='left', suffixes=('_current', '_master'))
    
    # Find columns that have been updated (skip Position ID and Employee ID)
    updated_columns = [col for col in current_df.columns if col not in ['Position ID', 'Employee ID']]
    
    # Create a mask to find rows where any column (other than Position ID and Employee ID) has changed
    updated_mask = merged_df.apply(lambda row: any(row[col + '_current'] != row[col + '_master'] for col in updated_columns), axis=1)
    
    # Filter out rows that have been updated
    updated_records = merged_df[updated_mask]
    
    return updated_records

# Part 2: Find new records (Position ID and Employee ID exist only in the current data, not in the master data)
def find_new_records(current_df, master_df):
    # Merge dataframes on Position ID and Employee ID
    merged_df = current_df.merge(master_df, on=['Position ID', 'Employee ID'], how='left', suffixes=('_current', '_master'))
    
    # Filter rows where Position ID and Employee ID do not have a match in the master data (i.e., new records)
    new_records = merged_df[merged_df['Position ID_master'].isna() & merged_df['Employee ID_master'].isna()]
    
    return new_records

# Find updated records in Feb24 GHA data
updated_gha_records = find_updated_records(feb24_gha_df, master_df)
print("Updated GHA Records:")
print(updated_gha_records)

# Find new records in Feb24 GHA data
new_gha_records = find_new_records(feb24_gha_df, master_df)
print("New GHA Records:")
print(new_gha_records)


Updated GHA Records:
   Position ID Employee ID Col1_current  Col2_current Col3_current  \
1           20        E002    B_Updated           250            Y   
2           40        E004            D           400            W   

  Col1_master  Col2_master Col3_master ColZ  
1           B        200.0           Y  NaN  
2         NaN          NaN         NaN  NaN  


KeyError: 'Position ID_master'

In [40]:
import pandas as pd

# Function to check and process changes for GHA and Open Positions files
def process_changes(file_df, source, cols_to_check, month_year):
    global updateddf

    # Strip any leading/trailing spaces in column names to avoid mismatch
    file_df.columns = file_df.columns.str.strip()
    master_df.columns = master_df.columns.str.strip()

    # Debug: Print column names of file_df and master_df to ensure 'Position ID' and 'Employee ID' are present
    print(f"\nColumns in {source} DataFrame:")
    print(file_df.columns)
    
    print("\nMaster DataFrame Columns for Merge:")
    print(master_df.columns)

    # Ensure the correct columns are present before proceeding
    if 'Position ID' not in file_df.columns:
        print(f"Error: 'Position ID' not found in {source} data.")
        return
    if 'Employee ID' not in file_df.columns and source != 'Open Position':
        print(f"Error: 'Employee ID' not found in {source} data.")
        return

    # Show a sample of the first few rows to understand the structure
    print("\nSample data from file_df:")
    print(file_df.head())
    
    print("\nSample data from master_df:")
    print(master_df.head())

    # Merge based on 'Position ID' and 'Employee ID' (for GHA) or just 'Position ID' (for Open Position)
    if source == 'Open Position':
        print("\nMerging on 'Position ID' only for Open Position")
        merged_df = file_df.merge(master_df, on=['Position ID'], how='left', suffixes=('_current', '_master'))
    else:
        print("\nMerging on both 'Position ID' and 'Employee ID' for GHA")
        merged_df = file_df.merge(master_df, on=['Position ID', 'Employee ID'], how='left', suffixes=('_current', '_master'))

    # Print the columns of the merged DataFrame to debug
    print(f"\nMerged DataFrame columns for {source}:")
    print(merged_df.columns)

    # Process the merged DataFrame
    for col in cols_to_check:
        current_col = f'{col}_current'
        master_col = f'{col}_master'

        # Ensure both current and master columns exist
        if current_col in merged_df.columns and master_col in merged_df.columns:
            merged_df[f'{col}_changed'] = merged_df[current_col] != merged_df[master_col]
            changed_rows = merged_df[merged_df[f'{col}_changed'] == True]
            changed_rows['Reason Changed'] = f'{col} Changed'
            changed_rows['Month Changed'] = month_year
            changed_rows['Source'] = source
            updateddf = pd.concat([updateddf, changed_rows[['Position ID', 'Employee ID', 'Month Changed', 'Reason Changed', 'Source']]])

        elif current_col in merged_df.columns:
            merged_df[f'{col}_changed'] = False  # Defaulting to False since master column is missing
            changed_rows = merged_df[merged_df[f'{col}_changed'] == True]
            changed_rows['Reason Changed'] = f'{col} Changed'
            changed_rows['Month Changed'] = month_year
            changed_rows['Source'] = source
            updateddf = pd.concat([updateddf, changed_rows[['Position ID', 'Employee ID', 'Month Changed', 'Reason Changed', 'Source']]])

    # Handle new records (those not found in the master file)
    for index, row in merged_df.iterrows():
        if pd.isna(row.get('Position ID_master')):  # No 'Employee ID' for Open Position source
            new_record = row.copy()
            new_record['Month Added'] = month_year
            new_record['Source'] = source
            updateddf = pd.concat([updateddf, new_record[['Position ID', 'Employee ID', 'Month Added', 'Source']]])

# Assuming you have defined the file paths for the respective files
# feb24_gha_df = pd.read_excel('feb24_gha.xlsx')
# feb24_open_df = pd.read_excel('feb24_open.xlsx')
# master_df = pd.read_excel('master_file.xlsx')

# Define file paths
master_df = pd.read_excel(r"Input\\" + "master_file.xlsx")
feb24_gha_df = pd.read_excel(r"Input\\" + "Feb24_gha.xlsx")
feb24_open_df =pd.read_excel( r"Input\\" + "Feb24_open.xlsx")
mar24_gha_df = pd.read_excel(r"Input\\" + "Mar24_gha.xlsx")
mar24_open_df = pd.read_excel(r"Input\\" + "Mar24_open.xlsx")

# List of columns to check for changes
cols_to_check_gha = ['Col1', 'Col2', 'Col3']  # Adjust columns to actual ones
cols_to_check_open_pos = ['Col1', 'Col2', 'ColZ']  # Adjust columns to actual ones

# Initialize empty dataframe to store updates
updateddf = pd.DataFrame()

# Process changes for February 24th GHA and Open Position files
month_year = 'Feb 24'
process_changes(feb24_gha_df, 'GHA', cols_to_check_gha, month_year)
process_changes(feb24_open_df, 'Open Position', cols_to_check_open_pos, month_year)

# If you want to save the updated DataFrame to an Excel file
updateddf.to_excel("Updated_Records.xlsx", index=False)

# Output the updated records for review
print(updateddf)



Columns in GHA DataFrame:
Index(['Position ID', 'Employee ID', 'Col1', 'Col2', 'Col3'], dtype='object')

Master DataFrame Columns for Merge:
Index(['Position ID', 'Employee ID', 'Col1', 'Col2', 'Col3', 'ColZ'], dtype='object')

Sample data from file_df:
   Position ID Employee ID       Col1  Col2 Col3
0           10        E001          A   100    X
1           20        E002  B_Updated   250    Y
2           40        E004          D   400    W

Sample data from master_df:
   Position ID Employee ID Col1  Col2 Col3 ColZ
0           10        E001    A   100    X  NaN
1           20        E002    B   200    Y  NaN
2           30        E003    C   300    Z  NaN
3           31         NaN   DD  9000  NaN   SS
4           50         NaN    P   500  NaN    U

Merging on both 'Position ID' and 'Employee ID' for GHA

Merged DataFrame columns for GHA:
Index(['Position ID', 'Employee ID', 'Col1_current', 'Col2_current',
       'Col3_current', 'Col1_master', 'Col2_master', 'Col3_master', 'Co

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_rows['Reason Changed'] = f'{col} Changed'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_rows['Month Changed'] = month_year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_rows['Source'] = source
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [13]:
import pandas as pd

# Function to load the master file and compare it with a new file
def load_and_compare(master_file_path, new_file_path, has_employee_id=True):
    # Load the master file and new file into dataframes
    master_df = pd.read_excel(master_file_path)
    new_df = pd.read_excel(new_file_path)
    
    # Ensure Position ID is treated as a string for consistency
    master_df['Position ID'] = master_df['Position ID'].astype(str)
    new_df['Position ID'] = new_df['Position ID'].astype(str)

    # Check if Employee ID exists in both files, adjust accordingly
    if has_employee_id:
        master_df['Employee ID'] = master_df['Employee ID'].astype(str)
        new_df['Employee ID'] = new_df['Employee ID'].astype(str)
        new_records_df = new_df[~new_df[['Position ID', 'Employee ID']].isin(master_df[['Position ID', 'Employee ID']]).all(axis=1)]
        changed_records_df = new_df.merge(master_df, on=['Position ID', 'Employee ID'], how='inner', suffixes=('_new', '_master'))
    else:
        # If Employee ID is not present, compare based on 'Position ID' only
        new_records_df = new_df[~new_df['Position ID'].isin(master_df['Position ID'])]
        changed_records_df = new_df.merge(master_df, on='Position ID', how='inner', suffixes=('_new', '_master'))

    # For changed records, we check for differences in columns that exist in both files
    compare_columns = [col for col in new_df.columns if col != 'Position ID' and col != 'Employee ID']
    
    # Only compare columns that exist in both new_df and master_df
    compare_columns = [col for col in compare_columns if f'{col}_new' in changed_records_df.columns and f'{col}_master' in changed_records_df.columns]

    # Compare the columns and filter the changed records
    for col in compare_columns:
        changed_records_df = changed_records_df[changed_records_df[f'{col}_new'] != changed_records_df[f'{col}_master']]

    # Remove rows with no differences in the specified columns
    changed_records_df = changed_records_df.dropna(subset=[f'{col}_new' for col in compare_columns])
    
    return new_records_df, changed_records_df

# Define file paths
master_file_path = r"Input\\" + "master_file.xlsx"
feb24_gha_file = r"Input\\" + "Feb24_gha.xlsx"
feb24_open_file = r"Input\\" + "Feb24_open.xlsx"
mar24_gha_file = r"Input\\" + "Mar24_gha.xlsx"
mar24_open_file = r"Input\\" + "Mar24_open.xlsx"

# Compare Feb24 GHA file with the master file (Employee ID is present)
new_records_feb24_gha, changed_records_feb24_gha = load_and_compare(master_file_path, feb24_gha_file, has_employee_id=True)

# Compare Feb24 Open file with the master file (No Employee ID in open file)
new_records_feb24_open, changed_records_feb24_open = load_and_compare(master_file_path, feb24_open_file, has_employee_id=False)

# Compare Mar24 GHA file with the master file (Employee ID is present)
new_records_mar24_gha, changed_records_mar24_gha = load_and_compare(master_file_path, mar24_gha_file, has_employee_id=True)

# Compare Mar24 Open file with the master file (No Employee ID in open file)
new_records_mar24_open, changed_records_mar24_open = load_and_compare(master_file_path, mar24_open_file, has_employee_id=False)

# Print results (or you can save them to a new file)
print("New Records (Feb24 GHA):")
print(new_records_feb24_gha)

print("Changed Records (Feb24 GHA):")
print(changed_records_feb24_gha)

print("New Records (mar24 GHA):")
print(new_records_mar24_gha)

print("Changed Records (mar24 GHA):")
print(changed_records_mar24_gha)

print("New Records (Feb24 open):")
print(new_records_feb24_open)

print("Changed Records (Feb24 open):")
print(changed_records_feb24_open)

print("New Records (MAr24 open):")
print(new_records_mar24_open)

print("Changed Records (MAr24 open):")
print(changed_records_mar24_open)



# Repeat for other files (Feb24 Open, Mar24 GHA, etc.)


New Records (Feb24 GHA):
  Position ID Employee ID Col1  Col2 Col3
2          40        E004    D   400    W
Changed Records (Feb24 GHA):
Empty DataFrame
Columns: [Position ID, Employee ID, Col1_new, Col2_new, Col3_new, Col1_master, Col2_master, Col3_master]
Index: []
New Records (mar24 GHA):
  Position ID Employee ID       Col1  Col2 Col3
1          40        E004  D_Updated   400    W
2          50        E005          E   500    Z
Changed Records (mar24 GHA):
Empty DataFrame
Columns: [Position ID, Employee ID, Col1_new, Col2_new, Col3_new, Col1_master, Col2_master, Col3_master]
Index: []
New Records (Feb24 open):
  Position ID ColX  ColY ColZ
0          50    P   500    U
1          60    Q   600    V
2          70    R   700    T
Changed Records (Feb24 open):
Empty DataFrame
Columns: [Position ID, ColX, ColY, ColZ, Employee ID, Col1, Col2, Col3]
Index: []
New Records (MAr24 open):
  Position ID       ColX  ColY ColZ
0          50          P   500    U
1          60  Q_Updated   650

In [4]:
import pandas as pd

# Sample function to process GHA and Open Positions files
def process_month_files(master_df, month_files, cols_to_check_gha, cols_to_check_positions):
    newdf = pd.DataFrame()  # DataFrame to hold new records
    changesdf = pd.DataFrame()  # DataFrame to hold changed records
    
    # Iterate through each month's GHA and Open Positions files
    for month, files in month_files.items():
        gha_file, positions_file = files
        
        # Load the GHA and Open Positions files
        gha_df = pd.read_excel(gha_file)
        positions_df = pd.read_excel(positions_file)

        # Standardize column names if necessary (e.g., rename 'Position ID' to 'Position Number')
        gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(6)  # Ensure 'Position ID' is a string with leading zeros
        positions_df['Position ID'] = positions_df['Position ID'].astype(str).str.zfill(6)

        # Checking new records and changes for GHA
        new_gha_records = gha_df[~gha_df['Position ID'].isin(master_df['Position ID'])]
        changed_gha_records = gha_df[gha_df['Position ID'].isin(master_df['Position ID'])]

        # Identify changes in GHA based on the specified columns
        for index, row in changed_gha_records.iterrows():
            master_row = master_df[master_df['Position ID'] == row['Position ID']].iloc[0]
            changed_columns = [col for col in cols_to_check_gha if row[col] != master_row[col]]
            
            if changed_columns:
                row['Change Month'] = month
                row['Changed Attributes'] = ', '.join(changed_columns)
                row['Source Name'] = 'GHA'
                changesdf = pd.concat([changesdf, row.to_frame().T], ignore_index=True)

        # Add new records from GHA
        new_gha_records['Month Added'] = month
        new_gha_records['Source Name'] = 'GHA'
        newdf = pd.concat([newdf, new_gha_records], ignore_index=True)

        # Checking new records and changes for Open Positions
        new_positions_records = positions_df[~positions_df['Position ID'].isin(master_df['Position ID'])]
        changed_positions_records = positions_df[positions_df['Position ID'].isin(master_df['Position ID'])]

        # Identify changes in Open Positions based on the specified columns
        for index, row in changed_positions_records.iterrows():
            master_row = master_df[master_df['Position ID'] == row['Position ID']].iloc[0]
            changed_columns = [col for col in cols_to_check_positions if row[col] != master_row[col]]
            
            if changed_columns:
                row['Change Month'] = month
                row['Changed Attributes'] = ', '.join(changed_columns)
                row['Source Name'] = 'Open Positions'
                changesdf = pd.concat([changesdf, row.to_frame().T], ignore_index=True)

        # Add new records from Open Positions
        new_positions_records['Month Added'] = month
        new_positions_records['Source Name'] = 'Open Positions'
        newdf = pd.concat([newdf, new_positions_records], ignore_index=True)

    # Return the new and changed records dataframes
    return newdf, changesdf

# Define the columns to check for GHA and Open Positions (you can modify these lists)
cols_to_check_gha = ['Col1', 'Col2', 'Col3']  # Replace with the actual columns for GHA
cols_to_check_positions = ['ColX', 'ColY', 'ColZ']  # Replace with the actual columns for Open Positions

# Example of files for the month comparison (make sure to adjust paths and file names)
month_files = {
    'Feb 24': [r"Input\\"+'Feb24_gha.xlsx', r"Input\\"+'Feb24_open.xlsx'],
    'Mar 24': [r"Input\\"+'Mar24_gha.xlsx', r"Input\\"+'Mar24_open.xlsx']
}

# Load the master file (adjust the file path as needed)
master_df = pd.read_excel(r"Input\\"+'master_file.xlsx')

# Process the files and get the new and changed records
newdf, changesdf = process_month_files(master_df, month_files, cols_to_check_gha, cols_to_check_positions)

# Optionally, save the new and changed records to Excel files for further analysis
newdf.to_excel(r"Input\\"+'New_Records.xlsx', index=False)
changesdf.to_excel(r"Input\\"+'Changed_Records.xlsx', index=False)

# Print the new and changed records dataframes for review
print("New Records DataFrame:")
print(newdf.head())

print("\nChanged Records DataFrame:")
print(changesdf.head())


New Records DataFrame:
  Position ID Employee ID       Col1   Col2 Col3 Month Added     Source Name  \
0      000010        E001          A  100.0    X      Feb 24             GHA   
1      000020        E002  B_Updated  250.0    Y      Feb 24             GHA   
2      000040        E004          D  400.0    W      Feb 24             GHA   
3      000050         NaN        NaN    NaN  NaN      Feb 24  Open Positions   
4      000060         NaN        NaN    NaN  NaN      Feb 24  Open Positions   

  ColX   ColY ColZ  
0  NaN    NaN  NaN  
1  NaN    NaN  NaN  
2  NaN    NaN  NaN  
3    P  500.0    U  
4    Q  600.0    V  

Changed Records DataFrame:
Empty DataFrame
Columns: []
Index: []


In [None]:
import pandas as pd

# List of columns to track for changes
columns_to_check = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name', 'Work Location City']

def track_changes_across_months(master_df, mom_gha_file, mom_open_positions_file, month):
    # Ensure consistency in data types
    master_df['Position ID'] = master_df['Position ID'].astype(str)
    mom_gha_file['Position ID'] = mom_gha_file['Position ID'].astype(str)
    mom_gha_file['Employee ID'] = mom_gha_file['Employee ID'].astype(str)
    mom_open_positions_file['Position ID'] = mom_open_positions_file['Position ID'].astype(str)

    # Initialize DataFrames to store changed and updated records
    changed_rows = []
    updated_rows = []

    # Combine the merged dataframe with previous month's data
    merged_df = master_df.copy()

    # Track changes in GHA file (source == 'gha')
    for index, row in merged_df.iterrows():
        pos_id = row['Position ID']
        emp_id = row['Employee ID']
        
        # Filter the mom_gha_file for the matching Position ID and Employee ID
        gha_match = mom_gha_file[(mom_gha_file['Position ID'] == pos_id) & (mom_gha_file['Employee ID'] == emp_id)]

        if not gha_match.empty:
            changes = []
            for col in columns_to_check:
                if row[col] != gha_match.iloc[0][col]:  # Compare values for the specified columns
                    changes.append(f"{col} Changed")

            if changes:
                updated_row = row.copy()
                updated_row['Description'] = '; '.join(changes)
                updated_row['Month Changed'] = month
                updated_rows.append(updated_row)

            # If combination of Position ID and Employee ID has changed (new employee or position), track as changed
            if len(gha_match) > 1:  # More than one match, indicating a position/employee mismatch
                changed_row = row.copy()
                changed_row['Description'] = 'Position/Employee ID mismatch'
                changed_row['Month Changed'] = month
                changed_rows.append(changed_row)

    # Track changes in Open Positions file (source == 'open positions')
    for index, row in merged_df.iterrows():
        pos_id = row['Position ID']

        # Filter the mom_open_positions_file for the matching Position ID
        open_pos_match = mom_open_positions_file[mom_open_positions_file['Position ID'] == pos_id]

        if not open_pos_match.empty:
            changes = []
            for col in columns_to_check:
                # Exclude Work Location City and Employee ID from the comparison for Open Position
                if col != 'Work Location City' and col != 'Employee ID' and row[col] != open_pos_match.iloc[0][col]:
                    changes.append(f"{col} Changed")

            if changes:
                updated_row = row.copy()
                updated_row['Description'] = '; '.join(changes)
                updated_row['Month Changed'] = month
                updated_rows.append(updated_row)

    # Convert lists to DataFrames
    changed_rows_df = pd.DataFrame(changed_rows)
    updated_rows_df = pd.DataFrame(updated_rows)

    return changed_rows_df, updated_rows_df

# Example usage:
current_month = 'Mar-24'
changed_rows_df, updated_rows_df = track_changes_across_months(master_df, mom_gha_file, mom_open_positions_file, current_month)

# Display the result
print("Changed Rows DataFrame:")
print(changed_rows_df)

print("\nUpdated Rows DataFrame:")
print(updated_rows_df)


In [None]:
# Initialize new_records with the same columns as master_df plus 'Month_Added'
new_records = pd.DataFrame(columns=master_df.columns.tolist() + ['Month_Added'])

# Define function to process each month
def process_single_month(master_df, month, gha_file, open_pos_file, new_records):
    max_length = master_df['Position ID'].str.len().max()

    # Standardize Position IDs
    gha_file['Position ID'] = gha_file['Position ID'].str.zfill(max_length)
    open_pos_file['Position ID'] = open_pos_file['Position ID'].str.zfill(max_length)
    
    # Update master_df with 1/0 based on presence in GHA and Open Positions
    master_df[month] = np.where(
        master_df['Source'] == 'GHA', 
        master_df['Position ID'].apply(lambda x: 1 if x in gha_file['Position ID'].values else 0),
        np.where(
            master_df['Source'] == 'Open Positions', 
            master_df['Position ID'].apply(lambda x: 1 if x in open_pos_file['Position ID'].values else 0),
            np.nan
        )
    )
    
    # Identify new records in GHA not in master_df or previously in new_records
    gha_new_records = gha_file[~gha_file['Position ID'].isin(pd.concat([master_df['Position ID'], new_records['Position ID']]))]
    gha_new_records = gha_new_records.assign(Source='GHA', Month_Added=month)

    # Identify new records in Open Positions not in master_df or previously in new_records
    open_pos_new_records = open_pos_file[~open_pos_file['Position ID'].isin(pd.concat([master_df['Position ID'], new_records['Position ID']]))]
    open_pos_new_records = open_pos_new_records.assign(Source='Open Positions', Month_Added=month)

    # Combine new records found this month into one DataFrame
    new_month_records = pd.concat([gha_new_records, open_pos_new_records], ignore_index=True)

    # Add presence columns for each month in new_records, filling prior months with 0
    for mth in master_df.columns[2:]:  # Assuming month columns start from index 2
        if mth < month:
            new_month_records[mth] = 0  # Set prior months to 0
        elif mth == month:
            new_month_records[mth] = new_month_records.apply(
                lambda row: 1 if row['Position ID'] in gha_file['Position ID'].values or row['Position ID'] in open_pos_file['Position ID'].values else 0,
                axis=1
            )
        else:
            new_month_records[mth] = np.nan

    # Trim new_month_records to only include columns in master_df + 'Month_Added'
    new_month_records = new_month_records[master_df.columns.tolist() + ['Month_Added']]

    # Append to new_records DataFrame
    new_records = pd.concat([new_records, new_month_records], ignore_index=True)

    return master_df, new_records

# Process each month
for month, gha_file in gha_monthly_files.items():
    open_pos_file = open_pos_monthly_files[month]
    master_df, new_records = process_single_month(master_df, month, gha_file, open_pos_file, new_records)

# Fill any remaining NaN values in new_records with 0 for months not reached yet
for col in master_df.columns[2:]:  # Month columns assumed to start from index 2
    new_records[col].fillna(0, inplace=True)

# Display the updated DataFrames
print("Updated Master DataFrame:\n", master_df)
print("\nNew Records DataFrame:\n", new_records)


In [2]:
# Declare new_records with relevant columns
new_records = pd.DataFrame(columns=['Position ID', 'Source', 'Month Added'])

def process_monthly_data(master_df, gha_monthly_files, open_pos_monthly_files):
    global new_records
    max_length = master_df['Position ID'].str.len().max()
    
    for month in gha_monthly_files.keys():
        gha_df = gha_monthly_files[month].copy()
        open_pos_df = open_pos_monthly_files[month].copy()

        # Standardize Position IDs
        gha_df['Position ID'] = gha_df['Position ID'].str.zfill(max_length)
        open_pos_df['Position ID'] = open_pos_df['Position ID'].str.zfill(max_length)
        
        # Update master_df with 1/0 based on presence in GHA and Open Positions
        master_df[month] = np.where(
            master_df['Source'] == 'GHA', 
            master_df['Position ID'].apply(lambda x: 1 if x in gha_df['Position ID'].values else 0),
            np.where(
                master_df['Source'] == 'Open Positions', 
                master_df['Position ID'].apply(lambda x: 1 if x in open_pos_df['Position ID'].values else 0),
                np.nan
            )
        )
        
        # Identify new records in GHA not in master_df or previously in new_records
        gha_new_records = gha_df[~gha_df['Position ID'].isin(pd.concat([master_df['Position ID'], new_records['Position ID']]))]
        gha_new_records = gha_new_records.assign(Source='GHA', Month_Added=month)

        # Identify new records in Open Positions not in master_df or previously in new_records
        open_pos_new_records = open_pos_df[~open_pos_df['Position ID'].isin(pd.concat([master_df['Position ID'], new_records['Position ID']]))]
        open_pos_new_records = open_pos_new_records.assign(Source='Open Positions', Month_Added=month)

        # Append unique new records from both GHA and Open Positions
        new_records = pd.concat([new_records, gha_new_records, open_pos_new_records], ignore_index=True)
    
    return master_df, new_records

# Call the function and get updated master_df and new_records
master_df, new_records = process_monthly_data(master_df, gha_monthly_files, open_pos_monthly_files)

# Display the resulting DataFrames
print("Updated Master DataFrame:\n", master_df)
print("\nNew Records DataFrame:\n", new_records)


Updated Master DataFrame:
   Position ID          Source  Jan 24  Feb 24  Mar 24
0      000123             GHA     1.0     0.0     1.0
1      000456  Open Positions     1.0     0.0     0.0
2      000789             GHA     0.0     1.0     0.0
3      001001  Open Positions     0.0     1.0     1.0

New Records DataFrame:
   Position ID          Source Month Added Month_Added
0      002002             GHA         NaN      Feb 24
1      004004  Open Positions         NaN      Feb 24
2      003003             GHA         NaN      Mar 24
3      005005  Open Positions         NaN      Mar 24


In [None]:
import pandas as pd

# Additional columns to track for changes and their descriptions
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Initialize changes DataFrame
changes_df = pd.DataFrame(columns=master_df.columns.tolist() + ['Month', 'Description'])

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load GHA and Open Position data for the current month
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)
    
    # Rename 'Position Number' to 'Position ID' to match master_df
    gha_df.rename(columns={'Position Number': 'Position ID'}, inplace=True)
    open_pos_df.rename(columns={'Position Number': 'Position ID'}, inplace=True)

    # Standardize Position ID and Employee ID to strings, with zero-padding for Position ID
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    gha_df['Employee ID'] = gha_df['Employee ID'].astype(str)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)

    # Print column names to verify alignment
    print("Debug: Columns in master_df:", master_df.columns)
    print("Debug: Columns in gha_df:", gha_df.columns)
    
    # Merge on Position ID and Employee ID, with unique suffixes
    merged_df = pd.merge(
        master_df, gha_df,
        on=['Position ID', 'Employee ID'],
        how='outer',
        suffixes=('_master', '_gha')
    )
    
    # Check merged columns to verify expected columns are present
    print("Debug: Columns in merged_df after merging:", merged_df.columns)
    
    # If expected columns are missing, print a warning and skip the iteration
    if 'Employee ID_master' not in merged_df.columns or 'Employee ID_gha' not in merged_df.columns:
        print("Warning: Expected columns 'Employee ID_master' and 'Employee ID_gha' not found.")
        continue
    
    # Track changes in Position-Employee combinations
    for _, row in merged_df.iterrows():
        pos_id = row['Position ID']
        master_emp_id, gha_emp_id = row.get('Employee ID_master'), row.get('Employee ID_gha')
        
        # Detect Employee ID changes for the same Position ID
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            changes_df = changes_df.append({
                **row[['Position ID', 'Employee ID_master']],
                'Month': month,
                'Description': 'Position-Employee Combination Changed'
            }, ignore_index=True)

        # Detect changes in additional columns
        for col in additional_cols:
            master_value, gha_value = row.get(f"{col}_master"), row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                changes_df = changes_df.append({
                    **row[['Position ID', 'Employee ID_master']],
                    'Month': month,
                    'Description': additional_cols_descriptions[col]
                }, ignore_index=True)

# Drop duplicates and keep the first occurrence
changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'], inplace=True)

# Output the changes DataFrame
print("Final changes_df:\n", changes_df)


In [None]:
import pandas as pd

# Additional columns and descriptions for changes
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Initialize the changes DataFrame with the required columns
changes_df = pd.DataFrame(columns=master_df.columns.tolist() + ['Month', 'Description'])

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load data
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)
    
    # Standardize 'Position ID' formatting
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)
    
    # Filter out rows with unspecified or empty Position IDs or Employee IDs in GHA data
    gha_df = gha_df[(gha_df['Position ID'] != 'unspecified') & (gha_df['Position ID'] != '')]
    gha_df = gha_df[(gha_df['Employee ID'] != 'unspecified') & (gha_df['Employee ID'] != '')]

    # Merge GHA data with master data
    print("Debug: GHA Columns before merging:", gha_df.columns)
    print("Debug: Master Columns before merging:", master_df.columns)
    
    merged_df = pd.merge(
        master_df, gha_df, 
        on=['Position ID', 'Employee ID'], 
        how='outer', 
        suffixes=('_master', '_gha')
    )
    
    # Debug column names after merging
    print("Debug: Columns in merged_df after merging:", merged_df.columns)
    
    # If columns aren't named as expected, skip further processing in this loop iteration
    if 'Employee ID_master' not in merged_df.columns or 'Employee ID_gha' not in merged_df.columns:
        print("Warning: Expected columns 'Employee ID_master' and 'Employee ID_gha' not found.")
        continue
    
    # Detect changes
    for _, row in merged_df.iterrows():
        pos_id, emp_id = row['Position ID'], row['Employee ID']
        master_emp_id, gha_emp_id = row['Employee ID_master'], row['Employee ID_gha']
        
        # Record changes in Position ID and Employee ID combinations
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            changes_df = changes_df.append({
                **row[['Position ID', 'Employee ID_master']],
                'Month': month,
                'Description': 'Position-Employee Combination Changed'
            }, ignore_index=True)

        # Check for additional column changes
        for col in additional_cols:
            master_value, gha_value = row.get(f"{col}_master"), row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                changes_df = changes_df.append({
                    **row[['Position ID', 'Employee ID_master']],
                    'Month': month,
                    'Description': additional_cols_descriptions[col]
                }, ignore_index=True)

# Remove duplicates
changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'], inplace=True)

# Output the changes DataFrame
print(changes_df)


In [1]:
#option 1
import pandas as pd

# Define maximum digits for 'Position ID' formatting, based on master file creation
max_digits = max(len(str(pos_id)) for pos_id in master_df['Position ID'].astype(str))

# Define additional columns to check for changes, with corresponding descriptions
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Initialize empty DataFrame to store changes
changes_df = pd.DataFrame(columns=master_df.columns.tolist() + ['Month', 'Description'])

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load and prepare GHA and Open Position data for the current month
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)
    
    # Ensure consistent 'Position ID' format (use zfill if needed)
    if 'Position Number' in gha_df.columns:
        gha_df.rename(columns={'Position Number': 'Position ID'}, inplace=True)
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)
    
    # Filter out records where 'Position ID' or 'Employee ID' is unspecified or blank
    gha_df = gha_df[(gha_df['Position ID'] != 'unspecified') & (gha_df['Position ID'] != '')]
    gha_df = gha_df[(gha_df['Employee ID'] != 'unspecified') & (gha_df['Employee ID'] != '')]
    
    # Merge GHA data with master_df based on Position ID and Employee ID
    merged_df = pd.merge(master_df, gha_df, on=['Position ID', 'Employee ID'], how='outer', suffixes=('_master', '_gha'))

    # Identify changes in 'Position ID' and 'Employee ID' combinations, as well as additional columns
    for _, row in merged_df.iterrows():
        pos_id, emp_id = row['Position ID'], row['Employee ID']
        master_emp_id, gha_emp_id = row['Employee ID_master'], row['Employee ID_gha']
        
        # Check for changes in Position ID and Employee ID combinations
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            # Log change details if Position ID has a different Employee ID
            changes_df = changes_df.append({
                **row[['Position ID', 'Employee ID_master']],
                'Month': month,
                'Description': 'Position-Employee Combination Changed'
            }, ignore_index=True)

        # Check for changes in additional columns
        for col in additional_cols:
            master_value, gha_value = row[f"{col}_master"], row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                changes_df = changes_df.append({
                    **row[['Position ID', 'Employee ID_master']],
                    'Month': month,
                    'Description': additional_cols_descriptions[col]
                }, ignore_index=True)

# Drop duplicates to avoid recording repeated changes in subsequent months
changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'], inplace=True)

# View or save changes_df to verify the output
print(changes_df)


  Position ID Employee ID  Jan 24  Feb 24  Mar 24
0      000123        E001       1       1       0
1      000456        E002       1       0       0
2      000789        E003       0       0       1


In [48]:
import pandas as pd

# Load data
master_df = pd.read_excel(r"Input\\" + "master_file.xlsx")
feb24_gha_df = pd.read_excel(r"Input\\" + "Feb24_gha.xlsx")
feb24_open_df = pd.read_excel(r"Input\\" + "Feb24_open.xlsx")
mar24_gha_df = pd.read_excel(r"Input\\" + "Mar24_gha.xlsx")
mar24_open_df = pd.read_excel(r"Input\\" + "Mar24_open.xlsx")

# Strip spaces from column names to avoid mismatch
master_df.columns = master_df.columns.str.strip()
feb24_gha_df.columns = feb24_gha_df.columns.str.strip()
feb24_open_df.columns = feb24_open_df.columns.str.strip()
mar24_gha_df.columns = mar24_gha_df.columns.str.strip()
mar24_open_df.columns = mar24_open_df.columns.str.strip()

# Function to find updated records
def find_updated_records(current_df, master_df):
    updated_records = []

    # Loop through current dataframe and compare each row with the master dataframe
    for index, row in current_df.iterrows():
        # Check if the row has a corresponding row in the master dataframe
        match = master_df[(master_df['Position ID'] == row['Position ID']) & 
                          (master_df['Employee ID'] == row['Employee ID'])]

        if not match.empty:
            # Compare columns (excluding Position ID and Employee ID)
            for col in row.index:
                if col not in ['Position ID', 'Employee ID']:
                    if row[col] != match[col].values[0]:
                        updated_records.append(row)
                        break
    
    # Convert updated records to a DataFrame
    updated_df = pd.DataFrame(updated_records)
    return updated_df

# Function to find new records
def find_new_records(current_df, master_df):
    # Find rows in current_df that don't exist in master_df
    new_records = current_df[~current_df['Position ID'].isin(master_df['Position ID']) | 
                             ~current_df['Employee ID'].isin(master_df['Employee ID'])]
    return new_records

# Part 1: Find updated records in Feb24 GHA data
updated_gha_records = find_updated_records(feb24_gha_df, master_df)
print("Updated GHA Records:")
print(updated_gha_records)

# Part 2: Find new records in Feb24 GHA data
new_gha_records = find_new_records(feb24_gha_df, master_df)
print("New GHA Records:")
print(new_gha_records)


Updated GHA Records:
   Position ID Employee ID       Col1  Col2 Col3
1           20        E002  B_Updated   250    Y
New GHA Records:
   Position ID Employee ID Col1  Col2 Col3
2           40        E004    D   400    W


In [49]:
import pandas as pd
import os

# Load data
master_df = pd.read_excel(r"Input\\" + "master_file.xlsx")
files = {
    "Feb24_gha": pd.read_excel(r"Input\\" + "Feb24_gha.xlsx"),
    "Mar24_gha": pd.read_excel(r"Input\\" + "Mar24_gha.xlsx"),
    "Feb24_open": pd.read_excel(r"Input\\" + "Feb24_open.xlsx"),
    "Mar24_open": pd.read_excel(r"Input\\" + "Mar24_open.xlsx")
}

# Ensure consistent column names
for key in files.keys():
    files[key].columns = files[key].columns.str.strip()

master_df.columns = master_df.columns.str.strip()

# Function to find updated records
def find_updated_records(current_df, reference_df, month_year, source, key_cols):
    updated_records = []

    for index, row in current_df.iterrows():
        # Find matching row in reference_df based on key_cols
        match = reference_df
        for key in key_cols:
            match = match[match[key] == row[key]]
        
        if not match.empty:
            cols_changed = []
            for col in current_df.columns:
                if col not in key_cols and col in reference_df.columns:
                    if row[col] != match[col].values[0]:
                        cols_changed.append(col)
            
            if cols_changed:
                row_data = row.to_dict()
                row_data["Month Changed"] = month_year
                row_data["Cols Changed"] = "; ".join(cols_changed)
                row_data["Source"] = source
                updated_records.append(row_data)

    updated_df = pd.DataFrame(updated_records)
    return updated_df

# Function to find new records
def find_new_records(current_df, reference_df, month_year, key_cols):
    new_records = current_df[
        ~current_df[key_cols].apply(tuple, axis=1).isin(reference_df[key_cols].apply(tuple, axis=1))
    ]
    new_records = new_records.copy()
    new_records["Month Added"] = month_year
    return new_records

# Process all GHA files
gha_key_cols = ["Position ID", "Employee ID"]
open_pos_key_cols = ["Position ID"]  # No Employee ID for Open Position

all_updated_records = []
all_new_records = []

for file_name, df in files.items():
    month_year = file_name.split("_")[0]  # Extract month from filename
    source_type = file_name.split("_")[1]  # Extract source type from filename

    if source_type == "gha":
        # Process GHA files
        updated_records = find_updated_records(df, master_df, month_year, source_type, gha_key_cols)
        new_records = find_new_records(df, master_df, month_year, gha_key_cols)
    elif source_type == "open":
        # Process Open Position files
        updated_records = find_updated_records(df, master_df, month_year, source_type, open_pos_key_cols)
        new_records = find_new_records(df, master_df, month_year, open_pos_key_cols)
    else:
        continue

    # Add to the master results
    if not updated_records.empty:
        all_updated_records.append(updated_records)
    if not new_records.empty:
        all_new_records.append(new_records)

# Combine all results into final DataFrames
final_updated_records = pd.concat(all_updated_records, ignore_index=True) if all_updated_records else pd.DataFrame()
final_new_records = pd.concat(all_new_records, ignore_index=True) if all_new_records else pd.DataFrame()

# Save results
output_dir = "Input\\"
os.makedirs(output_dir, exist_ok=True)

final_updated_records.to_excel(output_dir + "Updated_Records.xlsx", index=False)
final_new_records.to_excel(output_dir + "New_Records.xlsx", index=False)

print("Processing complete. Results saved in the 'Output' folder.")


Processing complete. Results saved in the 'Output' folder.


In [50]:
import pandas as pd
import os

# Load data
master_df = pd.read_excel(r"Input\\" + "master_file.xlsx")
gha_files = ["Feb24_gha.xlsx", "Mar24_gha.xlsx"]  # Extend this list as new files arrive
open_files = ["Feb24_open.xlsx", "Mar24_open.xlsx"]  # Extend similarly for open position files

# Strip column names
master_df.columns = master_df.columns.str.strip()

# Ensure column name consistency
def load_and_prepare(file_path):
    df = pd.read_excel(file_path)
    df.columns = df.columns.str.strip()
    return df

# Function to find updated records
def find_updated_records(current_df, reference_df, month_year, source, key_cols):
    updated_records = []
    
    for index, row in current_df.iterrows():
        match = reference_df
        for key in key_cols:
            match = match[match[key] == row[key]]
        
        if not match.empty:
            cols_changed = []
            for col in current_df.columns:
                if col not in key_cols and col in reference_df.columns:
                    if row[col] != match[col].values[0]:
                        cols_changed.append(col)
            
            if cols_changed:
                row_data = row.to_dict()
                row_data["Month Changed"] = month_year
                row_data["Cols Changed"] = "; ".join(cols_changed)
                row_data["Source"] = source
                updated_records.append(row_data)

    updated_df = pd.DataFrame(updated_records)
    return updated_df

# Function to find new records
def find_new_records(current_df, reference_df, month_year, key_cols):
    new_records = current_df[
        ~current_df[key_cols].apply(tuple, axis=1).isin(reference_df[key_cols].apply(tuple, axis=1))
    ]
    new_records = new_records.copy()
    new_records["Month Added"] = month_year
    return new_records

# Main function to process GHA files
def process_gha_files(master_df, files, key_cols):
    all_updated_records = []
    all_new_records = []

    previous_df = master_df  # Start with master as the reference
    for file in files:
        file_path = f"Input\\{file}"
        current_df = load_and_prepare(file_path)

        # Extract month and source
        month_year = file.split("_")[0]
        source_type = "gha"

        # Find updated and new records
        updated_records = find_updated_records(current_df, previous_df, month_year, source_type, key_cols)
        new_records = find_new_records(current_df, previous_df, month_year, key_cols)

        # Store results
        if not updated_records.empty:
            all_updated_records.append(updated_records)
        if not new_records.empty:
            all_new_records.append(new_records)

        # Update reference for next iteration
        previous_df = current_df

    # Combine results
    final_updated_records = pd.concat(all_updated_records, ignore_index=True) if all_updated_records else pd.DataFrame()
    final_new_records = pd.concat(all_new_records, ignore_index=True) if all_new_records else pd.DataFrame()

    return final_updated_records, final_new_records

# Process GHA files
gha_key_cols = ["Position ID", "Employee ID"]
final_updated_gha, final_new_gha = process_gha_files(master_df, gha_files, gha_key_cols)

# Save results for GHA
output_dir = "Input\\"
os.makedirs(output_dir, exist_ok=True)

final_updated_gha.to_excel(output_dir + "Updated_GHA_Records.xlsx", index=False)
final_new_gha.to_excel(output_dir + "New_GHA_Records.xlsx", index=False)

print("GHA processing complete. Results saved in the 'Output' folder.")

# Extend for Open Position files
def process_open_position_files(master_df, files, key_cols):
    all_updated_records = []
    all_new_records = []

    previous_df = master_df  # Start with master as the reference
    for file in files:
        file_path = f"Input\\{file}"
        current_df = load_and_prepare(file_path)

        # Extract month and source
        month_year = file.split("_")[0]
        source_type = "open"

        # Find updated and new records
        updated_records = find_updated_records(current_df, previous_df, month_year, source_type, key_cols)
        new_records = find_new_records(current_df, previous_df, month_year, key_cols)

        # Store results
        if not updated_records.empty:
            all_updated_records.append(updated_records)
        if not new_records.empty:
            all_new_records.append(new_records)

        # Update reference for next iteration
        previous_df = current_df

    # Combine results
    final_updated_records = pd.concat(all_updated_records, ignore_index=True) if all_updated_records else pd.DataFrame()
    final_new_records = pd.concat(all_new_records, ignore_index=True) if all_new_records else pd.DataFrame()

    return final_updated_records, final_new_records

# Process Open Position files
open_key_cols = ["Position ID"]  # No Employee ID for Open Position
final_updated_open, final_new_open = process_open_position_files(master_df, open_files, open_key_cols)

# Save results for Open Position
final_updated_open.to_excel(output_dir + "Updated_Open_Position_Records.xlsx", index=False)
final_new_open.to_excel(output_dir + "New_Open_Position_Records.xlsx", index=False)

print("Open Position processing complete. Results saved in the 'Output' folder.")


GHA processing complete. Results saved in the 'Output' folder.
Open Position processing complete. Results saved in the 'Output' folder.


In [51]:
# Combine updated and new records into a final DataFrame
def create_final_dataframe(updated_df, new_df):
    # Add missing columns to ensure consistent structure
    required_columns = ["Position ID", "Employee ID", "Month Changed", "Cols Changed", "Source", "Month Added"]
    
    for col in required_columns:
        if col not in updated_df.columns:
            updated_df[col] = None
        if col not in new_df.columns:
            new_df[col] = None

    # Align column order
    updated_df = updated_df[required_columns]
    new_df = new_df[required_columns]
    
    # Concatenate both DataFrames
    final_df = pd.concat([updated_df, new_df], ignore_index=True)
    
    return final_df

# Create final DataFrames for GHA and Open Position files
final_gha_df = create_final_dataframe(final_updated_gha, final_new_gha)
final_open_df = create_final_dataframe(final_updated_open, final_new_open)

# Save the final combined data
final_gha_df.to_excel(output_dir + "Final_GHA_Data.xlsx", index=False)
final_open_df.to_excel(output_dir + "Final_Open_Position_Data.xlsx", index=False)

print("Final combined data saved successfully.")


Final combined data saved successfully.


In [52]:
# Combine GHA and Open Position Data into a Single DataFrame
def create_combined_dataframe(updated_gha, new_gha, updated_open, new_open):
    # Concatenate updated and new records for GHA and Open Position
    combined_gha = pd.concat([updated_gha, new_gha], ignore_index=True)
    combined_open = pd.concat([updated_open, new_open], ignore_index=True)

    # Add missing columns to ensure consistent structure
    required_columns = [
        "Position ID", "Employee ID", "Month Changed", "Cols Changed",
        "Source", "Month Added", "Col1", "Col2", "Col3", "ColZ"
    ]

    for col in required_columns:
        if col not in combined_gha.columns:
            combined_gha[col] = None
        if col not in combined_open.columns:
            combined_open[col] = None

    # Align column order
    combined_gha = combined_gha[required_columns]
    combined_open = combined_open[required_columns]

    # Add a 'Data Type' column to differentiate records
    combined_gha['Data Type'] = 'GHA'
    combined_open['Data Type'] = 'Open Position'

    # Concatenate GHA and Open Position data into one DataFrame
    final_combined_df = pd.concat([combined_gha, combined_open], ignore_index=True)

    return final_combined_df


# Create final combined DataFrame
final_combined_df = create_combined_dataframe(
    final_updated_gha, final_new_gha, final_updated_open, final_new_open
)

# Save the final combined data
output_file = output_dir + "Final_Combined_Data.xlsx"
final_combined_df.to_excel(output_file, index=False)

print(f"Final combined data saved to {output_file}.")


Final combined data saved to Input\Final_Combined_Data.xlsx.


In [None]:
import pandas as pd

# Dictionary of monthly files
monthly_files = {
    'Jan 24': ("Input\\gha_jan_24.xlsx", "Input\\jan_24 open pos.xlsx"),
    'Feb 24': ("Input\\global headcount_02_24.xlsx", "Input\\feb open pos.xlsx"),
    # Add more entries as needed...
}

# Create combined DataFrame for all months
def process_all_months(master_file_path, monthly_files, output_file):
    # Load master file
    master_df = pd.read_excel(master_file_path)

    # Initialize empty DataFrames for final output
    final_gha_combined = pd.DataFrame()
    final_open_combined = pd.DataFrame()

    for month, (gha_file, open_pos_file) in monthly_files.items():
        # Load GHA and Open Position files for the month
        gha_df = pd.read_excel(gha_file)
        open_pos_df = pd.read_excel(open_pos_file)

        # Process GHA and Open Position files against the master
        updated_gha, new_gha = find_updated_and_new_records(gha_df, master_df, month, 'GHA')
        updated_open, new_open = find_updated_and_new_records(open_pos_df, master_df, month, 'Open Position')

        # Append processed data to the final DataFrames
        final_gha_combined = pd.concat([final_gha_combined, updated_gha, new_gha], ignore_index=True)
        final_open_combined = pd.concat([final_open_combined, updated_open, new_open], ignore_index=True)

        # Update master for the next iteration
        master_df = gha_df  # Assuming GHA is the master dataset

    # Combine GHA and Open Position into a single DataFrame
    final_combined_df = combine_all_records(final_gha_combined, final_open_combined)

    # Save to Excel
    final_combined_df.to_excel(output_file, index=False)
    print(f"Final combined data saved to {output_file}.")

# Function to find updated and new records
def find_updated_and_new_records(file_df, master_df, month, source_type):
    # Ensure Position ID column exists
    if "Position ID" not in file_df.columns:
        raise KeyError(f"'Position ID' not found in {source_type} data for {month}.")

    # Identify updated and new records
    updated_records = []
    new_records = []
    for _, row in file_df.iterrows():
        position_id = row.get("Position ID")
        master_row = master_df[master_df["Position ID"] == position_id]

        if not master_row.empty:
            # Check for updated columns
            changes = []
            for col in file_df.columns:
                if col in master_row and not pd.isna(row[col]) and row[col] != master_row[col].values[0]:
                    changes.append(col)
            if changes:
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = "; ".join(changes)
                updated_row["Source"] = source_type
                updated_records.append(updated_row)
        else:
            # Add new record
            new_row = row.to_dict()
            new_row["Month Added"] = month
            new_row["Source"] = source_type
            new_records.append(new_row)

    # Convert to DataFrame
    updated_df = pd.DataFrame(updated_records)
    new_df = pd.DataFrame(new_records)

    return updated_df, new_df

# Function to combine GHA and Open Position records
def combine_all_records(final_gha_combined, final_open_combined):
    # Add missing columns for consistency
    required_columns = [
        "Position ID", "Employee ID", "Month Changed", "Cols Changed",
        "Source", "Month Added", "Col1", "Col2", "Col3", "ColZ"
    ]

    for col in required_columns:
        if col not in final_gha_combined.columns:
            final_gha_combined[col] = None
        if col not in final_open_combined.columns:
            final_open_combined[col] = None

    # Align column order
    final_gha_combined = final_gha_combined[required_columns]
    final_open_combined = final_open_combined[required_columns]

    # Add a 'Data Type' column
    final_gha_combined["Data Type"] = "GHA"
    final_open_combined["Data Type"] = "Open Position"

    # Combine all records
    final_combined_df = pd.concat([final_gha_combined, final_open_combined], ignore_index=True)

    return final_combined_df


# Specify master file and output file paths
master_file_path = "Input\\master_file.xlsx"
output_file = "Output\\Final_Combined_Data.xlsx"

# Process all files
process_all_months(master_file_path, monthly_files, output_file)


In [None]:
import pandas as pd

# Dictionary of monthly files
monthly_files = {
    'Jan 24': ("Input\\gha_jan_24.xlsx", "Input\\jan_24 open pos.xlsx"),
    'Feb 24': ("Input\\global headcount_02_24.xlsx", "Input\\feb open pos.xlsx"),
    # Add more entries as needed...
}

# Columns for comparison by file type
comparison_columns = {
    'GHA': ["Employee ID", "Position ID", "Col1", "Col2"],  # Specify GHA-specific columns
    'Open Position': ["Position ID", "Col1", "ColZ"]        # Specify Open Position-specific columns
}

# Create combined DataFrame for all months
def process_all_months(master_file_path, monthly_files, comparison_columns, output_file):
    # Load master file
    master_df = pd.read_excel(master_file_path)

    # Initialize empty DataFrames for final output
    final_gha_combined = pd.DataFrame()
    final_open_combined = pd.DataFrame()

    for month, (gha_file, open_pos_file) in monthly_files.items():
        # Load GHA and Open Position files for the month
        gha_df = pd.read_excel(gha_file)
        open_pos_df = pd.read_excel(open_pos_file)

        # Process GHA and Open Position files against the master
        updated_gha, new_gha = find_updated_and_new_records(
            gha_df, master_df, month, 'GHA', comparison_columns['GHA']
        )
        updated_open, new_open = find_updated_and_new_records(
            open_pos_df, master_df, month, 'Open Position', comparison_columns['Open Position']
        )

        # Append processed data to the final DataFrames
        final_gha_combined = pd.concat([final_gha_combined, updated_gha, new_gha], ignore_index=True)
        final_open_combined = pd.concat([final_open_combined, updated_open, new_open], ignore_index=True)

        # Update master for the next iteration
        master_df = gha_df  # Assuming GHA is the master dataset

    # Combine GHA and Open Position into a single DataFrame
    final_combined_df = combine_all_records(final_gha_combined, final_open_combined)

    # Save to Excel
    final_combined_df.to_excel(output_file, index=False)
    print(f"Final combined data saved to {output_file}.")

# Function to find updated and new records
def find_updated_and_new_records(file_df, master_df, month, source_type, comparison_cols):
    # Ensure Position ID column exists
    if "Position ID" not in file_df.columns:
        raise KeyError(f"'Position ID' not found in {source_type} data for {month}.")

    # Identify updated and new records
    updated_records = []
    new_records = []
    for _, row in file_df.iterrows():
        position_id = row.get("Position ID")
        master_row = master_df[master_df["Position ID"] == position_id]

        if not master_row.empty:
            # Check for updated columns
            changes = []
            for col in comparison_cols:
                if col in master_row and not pd.isna(row[col]) and row[col] != master_row[col].values[0]:
                    changes.append(col)
            if changes:
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = "; ".join(changes)
                updated_row["Source"] = source_type
                updated_records.append(updated_row)
        else:
            # Add new record
            new_row = row.to_dict()
            new_row["Month Added"] = month
            new_row["Source"] = source_type
            new_records.append(new_row)

    # Convert to DataFrame
    updated_df = pd.DataFrame(updated_records)
    new_df = pd.DataFrame(new_records)

    return updated_df, new_df

# Function to combine GHA and Open Position records
def combine_all_records(final_gha_combined, final_open_combined):
    # Add missing columns for consistency
    required_columns = [
        "Position ID", "Employee ID", "Month Changed", "Cols Changed",
        "Source", "Month Added", "Col1", "Col2", "Col3", "ColZ"
    ]

    for col in required_columns:
        if col not in final_gha_combined.columns:
            final_gha_combined[col] = None
        if col not in final_open_combined.columns:
            final_open_combined[col] = None

    # Align column order
    final_gha_combined = final_gha_combined[required_columns]
    final_open_combined = final_open_combined[required_columns]

    # Add a 'Data Type' column
    final_gha_combined["Data Type"] = "GHA"
    final_open_combined["Data Type"] = "Open Position"

    # Combine all records
    final_combined_df = pd.concat([final_gha_combined, final_open_combined], ignore_index=True)

    return final_combined_df


# Specify master file and output file paths
master_file_path = "Input\\master_file.xlsx"
output_file = "Output\\Final_Combined_Data.xlsx"

# Process all files
process_all_months(master_file_path, monthly_files, comparison_columns, output_file)


In [None]:
import pandas as pd

# Dictionary of monthly files
monthly_files = {
    'Jan 24': ("Input\\gha_jan_24.xlsx", "Input\\jan_24 open pos.xlsx"),
    'Feb 24': ("Input\\global headcount_02_24.xlsx", "Input\\feb open pos.xlsx"),
    # Add more entries as needed...
}

# Columns for comparison by file type
comparison_columns = {
    'GHA': ["Employee ID", "Position ID", "Col1", "Col2"],  # Specify GHA-specific columns
    'Open Position': ["Position ID", "Col1", "ColZ"]        # Specify Open Position-specific columns
}

# Create combined DataFrame for all months
def process_all_months(master_file_path, monthly_files, comparison_columns, output_file):
    # Load master file
    master_df = pd.read_excel(master_file_path)

    # Initialize empty DataFrames for final output
    final_gha_combined = pd.DataFrame()
    final_open_combined = pd.DataFrame()

    for month, (gha_file, open_pos_file) in monthly_files.items():
        # Load GHA and Open Position files for the month
        gha_df = pd.read_excel(gha_file)
        open_pos_df = pd.read_excel(open_pos_file)

        # Process GHA and Open Position files against the master
        updated_gha, new_gha = find_updated_and_new_records(
            gha_df, master_df, month, 'GHA', comparison_columns
        )
        updated_open, new_open = find_updated_and_new_records(
            open_pos_df, master_df, month, 'Open Position', comparison_columns
        )

        # Append processed data to the final DataFrames
        final_gha_combined = pd.concat([final_gha_combined, updated_gha, new_gha], ignore_index=True)
        final_open_combined = pd.concat([final_open_combined, updated_open, new_open], ignore_index=True)

        # Update master for the next iteration
        master_df = gha_df  # Assuming GHA is the master dataset

    # Combine GHA and Open Position into a single DataFrame
    final_combined_df = combine_all_records(final_gha_combined, final_open_combined)

    # Save to Excel
    final_combined_df.to_excel(output_file, index=False)
    print(f"Final combined data saved to {output_file}.")

# Function to find updated and new records
def find_updated_and_new_records(file_df, master_df, month, source_type, comparison_columns):
    # Ensure Position ID column exists
    if "Position ID" not in file_df.columns:
        raise KeyError(f"'Position ID' not found in {source_type} data for {month}.")

    # Identify updated and new records
    updated_records = []
    new_records = []
    for _, row in file_df.iterrows():
        # Determine the actual source dynamically (if column exists)
        actual_source = row.get("Source", source_type)

        # Get comparison columns based on the source type
        cols_to_check = comparison_columns.get(actual_source, [])

        # Perform record comparisons
        position_id = row.get("Position ID")
        master_row = master_df[master_df["Position ID"] == position_id]

        if not master_row.empty:
            # Check for updated columns
            changes = []
            for col in cols_to_check:
                if col in master_row and not pd.isna(row[col]) and row[col] != master_row[col].values[0]:
                    changes.append(col)
            if changes:
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = "; ".join(changes)
                updated_row["Source"] = actual_source
                updated_records.append(updated_row)
        else:
            # Add new record
            new_row = row.to_dict()
            new_row["Month Added"] = month
            new_row["Source"] = actual_source
            new_records.append(new_row)

    # Convert to DataFrame
    updated_df = pd.DataFrame(updated_records)
    new_df = pd.DataFrame(new_records)

    return updated_df, new_df

# Function to combine GHA and Open Position records
def combine_all_records(final_gha_combined, final_open_combined):
    # Add missing columns for consistency
    required_columns = [
        "Position ID", "Employee ID", "Month Changed", "Cols Changed",
        "Source", "Month Added", "Col1", "Col2", "Col3", "ColZ"
    ]

    for col in required_columns:
        if col not in final_gha_combined.columns:
            final_gha_combined[col] = None
        if col not in final_open_combined.columns:
            final_open_combined[col] = None

    # Align column order
    final_gha_combined = final_gha_combined[required_columns]
    final_open_combined = final_open_combined[required_columns]

    # Add a 'Data Type' column
    final_gha_combined["Data Type"] = "GHA"
    final_open_combined["Data Type"] = "Open Position"

    # Combine all records
    final_combined_df = pd.concat([final_gha_combined, final_open_combined], ignore_index=True)

    return final_combined_df


# Specify master file and output file paths
master_file_path = "Input\\master_file.xlsx"
output_file = "Output\\Final_Combined_Data.xlsx"

# Process all files
process_all_months(master_file_path, monthly_files, comparison_columns, output_file)


In [None]:
import pandas as pd

# Dictionary of monthly files
monthly_files = {
    'Jan 24': ("Input\\gha_jan_24.xlsx", "Input\\jan_24 open pos.xlsx"),
    'Feb 24': ("Input\\global headcount_02_24.xlsx", "Input\\feb open pos.xlsx"),
    # Add more entries as needed...
}

# Columns for comparison by file type
comparison_columns = {
    'GHA': ["Employee ID", "Position ID", "Col1", "Col2"],  # Specify GHA-specific columns
    'Open Position': ["Position ID", "Col1", "ColZ"]        # Specify Open Position-specific columns
}

# Create combined DataFrame for all months
def process_all_months(master_file_path, monthly_files, comparison_columns, output_file):
    # Load master file
    master_df = pd.read_excel(master_file_path)

    # Initialize empty DataFrames for final output
    final_gha_combined = pd.DataFrame()
    final_open_combined = pd.DataFrame()

    for month, (gha_file, open_pos_file) in monthly_files.items():
        # Load GHA and Open Position files for the month
        gha_df = pd.read_excel(gha_file)
        open_pos_df = pd.read_excel(open_pos_file)

        # Process GHA and Open Position files against the master
        updated_gha, new_gha = find_updated_and_new_records(
            gha_df, master_df, month, 'GHA', comparison_columns
        )
        updated_open, new_open = find_updated_and_new_records(
            open_pos_df, master_df, month, 'Open Position', comparison_columns
        )

        # Append processed data to the final DataFrames
        final_gha_combined = pd.concat([final_gha_combined, updated_gha, new_gha], ignore_index=True)
        final_open_combined = pd.concat([final_open_combined, updated_open, new_open], ignore_index=True)

        # Update master for the next iteration
        master_df = gha_df  # Assuming GHA is the master dataset

    # Combine GHA and Open Position into a single DataFrame
    final_combined_df = combine_all_records(final_gha_combined, final_open_combined)

    # Save to Excel
    final_combined_df.to_excel(output_file, index=False)
    print(f"Final combined data saved to {output_file}.")

# Function to find updated and new records
def find_updated_and_new_records(file_df, master_df, month, source_type, comparison_columns):
    # Ensure Position ID column exists
    if "Position ID" not in file_df.columns:
        raise KeyError(f"'Position ID' not found in {source_type} data for {month}.")

    # Identify updated and new records
    updated_records = []
    new_records = []
    for _, row in file_df.iterrows():
        # Determine the actual source dynamically (if column exists)
        actual_source = row.get("Source", source_type)

        # Get comparison columns based on the source type
        cols_to_check = comparison_columns.get(actual_source, [])

        # Perform record comparisons
        position_id = row.get("Position ID")
        master_row = master_df[master_df["Position ID"] == position_id]

        if not master_row.empty:
            # Check for updated columns
            changes = []
            for col in cols_to_check:
                if col in master_row and not pd.isna(row[col]) and row[col] != master_row[col].values[0]:
                    changes.append(col)
            if changes:
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = "; ".join(changes)
                updated_row["Source"] = actual_source
                updated_records.append(updated_row)
        else:
            # Add new record
            new_row = row.to_dict()
            new_row["Month Added"] = month
            new_row["Source"] = actual_source
            new_records.append(new_row)

    # Convert to DataFrame
    updated_df = pd.DataFrame(updated_records)
    new_df = pd.DataFrame(new_records)

    return updated_df, new_df

# Function to combine GHA and Open Position records
def combine_all_records(final_gha_combined, final_open_combined):
    # Add missing columns for consistency
    required_columns = [
        "Position ID", "Employee ID", "Month Changed", "Cols Changed",
        "Source", "Month Added", "Col1", "Col2", "Col3", "ColZ"
    ]

    for col in required_columns:
        if col not in final_gha_combined.columns:
            final_gha_combined[col] = None
        if col not in final_open_combined.columns:
            final_open_combined[col] = None

    # Align column order
    final_gha_combined = final_gha_combined[required_columns]
    final_open_combined = final_open_combined[required_columns]

    # Add a 'Data Type' column
    final_gha_combined["Data Type"] = "GHA"
    final_open_combined["Data Type"] = "Open Position"

    # Combine all records
    final_combined_df = pd.concat([final_gha_combined, final_open_combined], ignore_index=True)

    return final_combined_df


# Specify master file and output file paths
master_file_path = "Input\\master_file.xlsx"
output_file = "Output\\Final_Combined_Data.xlsx"

# Process all files
process_all_months(master_file_path, monthly_files, comparison_columns, output_file)


In [None]:
def find_updated_and_new_records(file_df, reference_df, month, source_type, comparison_columns):
    # Filter records based on the 'Source' column
    file_df = file_df[file_df['Source'] == source_type]
    reference_df = reference_df[reference_df['Source'] == source_type]

    # Ensure Position ID exists
    if "Position ID" not in file_df.columns:
        raise KeyError(f"'Position ID' not found in {source_type} data for {month}.")

    # Retrieve the list of columns for comparison from the dictionary
    cols_to_compare = comparison_columns[source_type]

    # Identify updated and new records
    updated_records = []
    new_records = []

    for _, row in file_df.iterrows():
        position_id = row["Position ID"]
        ref_row = reference_df[reference_df["Position ID"] == position_id]

        if not ref_row.empty:
            # Check for changes in the comparison columns
            changes = []
            for col in cols_to_compare:
                if col in ref_row.columns and row[col] != ref_row[col].values[0]:
                    changes.append(col)

            if changes:
                updated_row = row.to_dict()
                updated_row["Month Changed"] = month
                updated_row["Cols Changed"] = "; ".join(changes)
                updated_records.append(updated_row)
        else:
            # Add new record
            new_row = row.to_dict()
            new_row["Month Added"] = month
            new_records.append(new_row)

    # Convert to DataFrame
    updated_df = pd.DataFrame(updated_records)
    new_df = pd.DataFrame(new_records)

    return updated_df, new_df


In [None]:
for month, (gha_file, open_pos_file) in monthly_files.items():
    # Load GHA and Open Position files for the month
    gha_df = pd.read_csv(gha_file)  # Using CSV
    open_pos_df = pd.read_csv(open_pos_file)

    # Process GHA and Open Position files against the master
    updated_gha, new_gha = find_updated_and_new_records(
        gha_df, master_df, month, 'GHA', comparison_columns
    )
    updated_open, new_open = find_updated_and_new_records(
        open_pos_df, master_df, month, 'Open Position', comparison_columns
    )

    # Append processed data to the final DataFrames
    final_gha_combined = pd.concat([final_gha_combined, updated_gha, new_gha], ignore_index=True)
    final_open_combined = pd.concat([final_open_combined, updated_open, new_open], ignore_index=True)

    # Update master for the next iteration
    master_df = pd.concat([master_df, gha_df, open_pos_df], ignore_index=True).drop_duplicates(
        subset=["Position ID"], keep="last"
    )
