In [1]:
def identify_new_records(master_df, gha_df, open_pos_df, month, new_records):
    """Collect positions that appear in this month's feeds but not in the master.

    Parameters
    ----------
    master_df : DataFrame with a 'Position ID' column holding the known positions.
    gha_df : DataFrame with a 'Position ID' column (GHA feed for the month).
    open_pos_df : DataFrame with a 'Position ID' column (Open Positions feed).
    month : value written into the 'Month Added' column of every new row.
    new_records : DataFrame of previously collected new records.

    Returns
    -------
    DataFrame
        ``new_records`` followed by the unseen GHA rows and then the unseen
        Open Positions rows, re-indexed from 0. The input frames are not
        modified.
    """
    known_position_ids = master_df['Position ID']

    def _unseen_rows(feed_df, source_label):
        # Work on a copy so tagging the rows never writes back into the feed.
        is_known = feed_df['Position ID'].isin(known_position_ids)
        unseen = feed_df[~is_known].copy()
        unseen.loc[:, 'Source'] = source_label
        unseen.loc[:, 'Month Added'] = month
        return unseen

    pieces = [
        new_records,
        _unseen_rows(gha_df, 'GHA'),
        _unseen_rows(open_pos_df, 'Open Positions'),
    ]
    return pd.concat(pieces, ignore_index=True)


Updated Master DataFrame:
   Position ID          Source  Jan 24  Feb 24  Mar 24
0      000123             GHA     1.0     0.0     1.0
1      000456  Open Positions     1.0     0.0     0.0
2      000789             GHA     0.0     1.0     0.0
3      001001  Open Positions     0.0     1.0     1.0

New Records DataFrame:
   Position ID          Source Month Added
0      002002             GHA      Feb 24
1      004004  Open Positions      Feb 24
2      003003             GHA      Mar 24
3      005005  Open Positions      Mar 24


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gha_new_records['Source'] = 'GHA'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gha_new_records['Month Added'] = month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  open_pos_new_records['Source'] = 'Open Positions'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

In [None]:
import pandas as pd

# Additional columns to track for changes and their descriptions
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Column layout of the change log: the master columns plus the bookkeeping
# fields written for every detected change. dict.fromkeys removes repeats
# while keeping the order.
change_columns = list(dict.fromkeys(
    master_df.columns.tolist() + ['Month', 'Description', 'Employee ID_master']
))

# Detected changes are gathered as plain dicts and converted to a DataFrame
# once, after the loop. DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
change_records = []

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load GHA and Open Position data for the current month
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)  # loaded for parity with the other cells; not used below

    # Rename 'Position Number' to 'Position ID' to match master_df
    # (rename is a no-op when the column is absent)
    gha_df = gha_df.rename(columns={'Position Number': 'Position ID'})
    open_pos_df = open_pos_df.rename(columns={'Position Number': 'Position ID'})

    # Standardize Position ID and Employee ID to strings, with zero-padding for Position ID.
    # NOTE: max_digits is not defined in this cell; it must already exist in the kernel.
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    gha_df['Employee ID'] = gha_df['Employee ID'].astype(str)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)

    # Print column names to verify alignment
    print("Debug: Columns in master_df:", master_df.columns)
    print("Debug: Columns in gha_df:", gha_df.columns)

    # Merge on Position ID ONLY. Suffixes are applied to overlapping columns
    # that are not merge keys, so 'Employee ID' must stay out of `on` for
    # 'Employee ID_master' / 'Employee ID_gha' to exist. With it in `on`, those
    # columns are never created and the guard below skips every month.
    # An inner join is enough: rows present on one side only have NaN on the
    # other side and would be ignored by the notna checks anyway.
    merged_df = pd.merge(
        master_df, gha_df,
        on='Position ID',
        how='inner',
        suffixes=('_master', '_gha')
    )

    # Check merged columns to verify expected columns are present
    print("Debug: Columns in merged_df after merging:", merged_df.columns)

    # If expected columns are missing (one of the frames has no 'Employee ID'),
    # print a warning and skip the iteration
    if 'Employee ID_master' not in merged_df.columns or 'Employee ID_gha' not in merged_df.columns:
        print("Warning: Expected columns 'Employee ID_master' and 'Employee ID_gha' not found.")
        continue

    # Track changes in Position-Employee combinations
    for _, row in merged_df.iterrows():
        master_emp_id, gha_emp_id = row.get('Employee ID_master'), row.get('Employee ID_gha')

        # Fields shared by every change logged for this row
        change_base = {
            'Position ID': row['Position ID'],
            'Employee ID_master': master_emp_id,
            'Month': month,
        }

        # Detect Employee ID changes for the same Position ID.
        # NOTE(review): the GHA value is a string here; confirm master_df's
        # 'Employee ID' is stored as a string too, otherwise every row differs.
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            change_records.append({
                **change_base,
                'Description': 'Position-Employee Combination Changed'
            })

        # Detect changes in additional columns. A tracked column only gets
        # suffixed when both frames contain it; row.get returns None otherwise
        # and the notna checks skip it.
        for col in additional_cols:
            master_value, gha_value = row.get(f"{col}_master"), row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                change_records.append({
                    **change_base,
                    'Description': additional_cols_descriptions[col]
                })

# Build the change log once. Passing `columns` guarantees the expected columns
# exist even when nothing was detected, so drop_duplicates cannot raise KeyError.
changes_df = pd.DataFrame(change_records, columns=change_columns)

# Drop duplicates and keep the first occurrence
changes_df = changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'])

# Output the changes DataFrame
print("Final changes_df:\n", changes_df)


In [None]:
import pandas as pd

# Additional columns and descriptions for changes
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Column layout of the change log: the master columns plus the bookkeeping
# fields written for every detected change (order kept, repeats removed).
change_columns = list(dict.fromkeys(
    master_df.columns.tolist() + ['Month', 'Description', 'Employee ID_master']
))

# Changes are collected as dicts and converted to a DataFrame once at the end;
# DataFrame.append no longer exists in pandas 2.0+.
change_records = []

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load data
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)  # loaded for parity with the other cells; not used below

    # Align the key column name with master_df (no-op when already 'Position ID')
    gha_df = gha_df.rename(columns={'Position Number': 'Position ID'})
    open_pos_df = open_pos_df.rename(columns={'Position Number': 'Position ID'})

    # Filter out rows with unspecified or empty Position IDs or Employee IDs in GHA data.
    # This has to happen BEFORE zero-padding: after zfill a blank ID is a run of
    # zeros and a missing one is '000nan', so a later `!= ''` test never matches.
    for id_col in ('Position ID', 'Employee ID'):
        id_text = gha_df[id_col].astype(str).str.strip()
        gha_df = gha_df[gha_df[id_col].notna() & ~id_text.isin(['', 'unspecified'])]
    gha_df = gha_df.copy()  # detach from the filtered slice before assigning columns

    # Standardize 'Position ID' formatting.
    # NOTE: max_digits is not defined in this cell; it must already exist in the kernel.
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)

    # Merge GHA data with master data
    print("Debug: GHA Columns before merging:", gha_df.columns)
    print("Debug: Master Columns before merging:", master_df.columns)

    # Merge on Position ID ONLY. Merge keys never receive suffixes, so keeping
    # 'Employee ID' in `on` means 'Employee ID_master' / 'Employee ID_gha' are
    # never created and every month is skipped by the guard below.
    # Inner join: rows found on one side only carry NaN on the other side and
    # would be ignored by the notna checks anyway.
    merged_df = pd.merge(
        master_df, gha_df,
        on='Position ID',
        how='inner',
        suffixes=('_master', '_gha')
    )

    # Debug column names after merging
    print("Debug: Columns in merged_df after merging:", merged_df.columns)

    # If columns aren't named as expected (a frame lacks 'Employee ID'),
    # skip further processing in this loop iteration
    if 'Employee ID_master' not in merged_df.columns or 'Employee ID_gha' not in merged_df.columns:
        print("Warning: Expected columns 'Employee ID_master' and 'Employee ID_gha' not found.")
        continue

    # Detect changes
    for _, row in merged_df.iterrows():
        master_emp_id, gha_emp_id = row['Employee ID_master'], row['Employee ID_gha']

        # Fields shared by every change logged for this row
        change_base = {
            'Position ID': row['Position ID'],
            'Employee ID_master': master_emp_id,
            'Month': month,
        }

        # Record changes in Position ID and Employee ID combinations.
        # NOTE(review): values are compared as loaded; confirm both files store
        # Employee ID with the same dtype.
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            change_records.append({
                **change_base,
                'Description': 'Position-Employee Combination Changed'
            })

        # Check for additional column changes. row.get returns None when a
        # tracked column is missing from either frame, which notna rejects.
        for col in additional_cols:
            master_value, gha_value = row.get(f"{col}_master"), row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                change_records.append({
                    **change_base,
                    'Description': additional_cols_descriptions[col]
                })

# Build the change log once; `columns` keeps the schema stable when it is empty
changes_df = pd.DataFrame(change_records, columns=change_columns)

# Remove duplicates
changes_df = changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'])

# Output the changes DataFrame
print(changes_df)


In [1]:
#option 1
import pandas as pd

# Define maximum digits for 'Position ID' formatting, based on master file creation
max_digits = max(len(str(pos_id)) for pos_id in master_df['Position ID'].astype(str))

# Define additional columns to check for changes, with corresponding descriptions
additional_cols = ['Global Career Band', 'BF Level 4 Name', 'Work Location Country/Territory Name']
additional_cols_descriptions = {col: f"{col} Changed" for col in additional_cols}

# Column layout of the change log: the master columns plus the bookkeeping
# fields written for every detected change (order kept, repeats removed).
change_columns = list(dict.fromkeys(
    master_df.columns.tolist() + ['Month', 'Description', 'Employee ID_master']
))

# Changes are accumulated as dicts and converted to a DataFrame once at the
# end; DataFrame.append was removed in pandas 2.0.
change_records = []

for month, gha_file, open_file in zip(months, gha_files, open_files):
    # Load and prepare GHA and Open Position data for the current month
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    open_pos_df = pd.read_excel(open_file)  # loaded for parity with the other cells; not used below

    # Align the key column name with master_df (rename is a no-op when absent)
    gha_df = gha_df.rename(columns={'Position Number': 'Position ID'})
    open_pos_df = open_pos_df.rename(columns={'Position Number': 'Position ID'})

    # Filter out records where 'Position ID' or 'Employee ID' is unspecified or blank.
    # Done BEFORE zero-padding: once padded, a blank ID is a run of zeros and a
    # missing one is '000nan', so neither can be recognised any more.
    for id_col in ('Position ID', 'Employee ID'):
        id_text = gha_df[id_col].astype(str).str.strip()
        gha_df = gha_df[gha_df[id_col].notna() & ~id_text.isin(['', 'unspecified'])]
    gha_df = gha_df.copy()  # detach from the filtered slice before assigning columns

    # Ensure consistent 'Position ID' format
    gha_df['Position ID'] = gha_df['Position ID'].astype(str).str.zfill(max_digits)
    open_pos_df['Position ID'] = open_pos_df['Position ID'].astype(str).str.zfill(max_digits)

    # Merge GHA data with master_df on Position ID ONLY. Merge keys never get
    # suffixes, so with 'Employee ID' in `on` the 'Employee ID_master' /
    # 'Employee ID_gha' columns read below would not exist (KeyError).
    # Inner join: rows present on one side only have NaN on the other side and
    # the notna checks would discard them anyway.
    merged_df = pd.merge(master_df, gha_df, on='Position ID', how='inner', suffixes=('_master', '_gha'))

    # Both frames must carry 'Employee ID' for the suffixed columns to exist
    if 'Employee ID_master' not in merged_df.columns or 'Employee ID_gha' not in merged_df.columns:
        print("Warning: Expected columns 'Employee ID_master' and 'Employee ID_gha' not found.")
        continue

    # Identify changes in 'Position ID' and 'Employee ID' combinations, as well as additional columns
    for _, row in merged_df.iterrows():
        master_emp_id, gha_emp_id = row['Employee ID_master'], row['Employee ID_gha']

        # Fields shared by every change logged for this row
        change_base = {
            'Position ID': row['Position ID'],
            'Employee ID_master': master_emp_id,
            'Month': month,
        }

        # Check for changes in Position ID and Employee ID combinations.
        # NOTE(review): values are compared as loaded; confirm both files store
        # Employee ID with the same dtype.
        if pd.notna(master_emp_id) and pd.notna(gha_emp_id) and master_emp_id != gha_emp_id:
            # Log change details if Position ID has a different Employee ID
            change_records.append({
                **change_base,
                'Description': 'Position-Employee Combination Changed'
            })

        # Check for changes in additional columns. row.get (instead of row[...])
        # tolerates tracked columns that are missing from either frame.
        for col in additional_cols:
            master_value, gha_value = row.get(f"{col}_master"), row.get(f"{col}_gha")
            if pd.notna(master_value) and pd.notna(gha_value) and master_value != gha_value:
                change_records.append({
                    **change_base,
                    'Description': additional_cols_descriptions[col]
                })

# Build the change log once; `columns` keeps the schema stable when it is empty
changes_df = pd.DataFrame(change_records, columns=change_columns)

# Drop duplicates to avoid recording repeated changes in subsequent months
changes_df = changes_df.drop_duplicates(subset=['Position ID', 'Employee ID_master', 'Description'])

# View or save changes_df to verify the output
print(changes_df)


  Position ID Employee ID  Jan 24  Feb 24  Mar 24
0      000123        E001       1       1       0
1      000456        E002       1       0       0
2      000789        E003       0       0       1


In [None]:
# option 2
import pandas as pd

# Month-over-month inputs. The `...` entries are placeholders to be replaced
# with real values; the three lists are consumed position-by-position.
gha_files = [...]  # GHA MoM file paths
open_pos_files = [...]  # Open Position MoM file paths
months = ["Jan 24", "Feb 24", "Mar 24", "Apr 24", ...]  # labels for the MoM columns

# Read the master workbook and store 'Position ID' as zero-padded 6-character text
master_df = pd.read_excel('master_file.xlsx').assign(
    **{'Position ID': lambda frame: frame['Position ID'].astype(str).str.zfill(6)}
)

# Empty change log: every master column plus the two bookkeeping columns
changes_df = pd.DataFrame(columns=[*master_df.columns, 'Month', 'Description'])

# Attributes compared between the master and each monthly GHA file
additional_cols = [
    'Global Career Band',
    'BF Level 4 Name',
    'Work Location Country/Territory Name',
]
# Define a helper function to standardize Position IDs across all files
def standardize_position_ids(df, width=None):
    """Convert ``df['Position ID']`` to zero-padded text, in place.

    Parameters
    ----------
    df : DataFrame with a 'Position ID' column. The column is overwritten on
        the frame that is passed in.
    width : int, optional
        Target length for the padded IDs. When omitted, the longest
        'Position ID' in the module-level ``master_df`` is used, so that frame
        must already hold string IDs.

    Returns
    -------
    DataFrame
        The same ``df`` object, returned for call chaining.
    """
    if width is None:
        # Default: match the widest ID already present in the master file
        width = master_df['Position ID'].str.len().max()
    df['Position ID'] = df['Position ID'].astype(str).str.zfill(width)
    return df

def _values_differ(old, new):
    """Return True when two scalar cell values differ; two missing values count as equal."""
    # NaN != NaN is True in Python, which would report a change for every
    # attribute that is blank in both files.
    if pd.isna(old) and pd.isna(new):
        return False
    return old != new


# One master row per Position ID (the first one, as before), indexed for direct
# lookup instead of re-scanning master_df for every GHA row.
master_by_position = master_df.drop_duplicates(subset='Position ID').set_index('Position ID')

# Change rows are gathered as dicts and converted to a DataFrame once;
# Series.append and DataFrame.append were both removed in pandas 2.0.
change_rows = []

# Loop through each month and each file to detect changes
for month, gha_file, open_pos_file in zip(months, gha_files, open_pos_files):
    # Load the GHA file for the current month (open_pos_file is not read here)
    gha_df = pd.read_excel(gha_file, sheet_name='Headcount - Employee Detail')
    gha_df = standardize_position_ids(gha_df)

    # Apply checks only if Position ID is specified
    gha_df_filtered = gha_df[gha_df['Position ID'] != 'unspecified']

    # Iterate through each row in gha_df_filtered to check against the master
    for _, row in gha_df_filtered.iterrows():
        pos_id = row['Position ID']
        emp_id = row['Employee ID'] if 'Employee ID' in row else 'unspecified'
        descriptions = []

        if pos_id in master_by_position.index:
            master_record = master_by_position.loc[pos_id]

            # Check for a change in Employee ID
            master_emp_id = master_record['Employee ID']
            if emp_id != 'unspecified' and _values_differ(master_emp_id, emp_id):
                descriptions.append(f"Employee ID Changed from {master_emp_id} to {emp_id}")

            # Check for changes in additional columns; a column missing from
            # either side is skipped rather than raising KeyError
            for col in additional_cols:
                if col in row and col in master_by_position.columns:
                    master_value = master_record[col]
                    if _values_differ(master_value, row[col]):
                        descriptions.append(f"{col} Changed from {master_value} to {row[col]}")
        else:
            # If no master record exists for this Position ID, it's a new addition
            descriptions.append("New Position ID Entry")

        # One change row per detected difference: the GHA row plus bookkeeping fields
        for change_desc in descriptions:
            change_rows.append({**row.to_dict(), 'Month': month, 'Description': change_desc})

# Final step: Append the change rows to the master file.
# Only master columns are kept, so 'Month' and 'Description' are not written out.
# NOTE(review): this adds a second row for Position IDs already in the master
# rather than updating them in place - confirm that is intended.
changes_df = pd.DataFrame(change_rows).reindex(columns=master_df.columns)
final_master_df = pd.concat([master_df, changes_df], ignore_index=True)

# Save the updated master file
final_master_df.to_excel('updated_master_file.xlsx', index=False)
