In [None]:
import pandas as pd
from pyxlsb import open_workbook

# Load the xlsb file
wb2_path = 'path_to_wb2.xlsb'  # Update with the actual path to wb2

# Open the 'HC - Actual' sheet
with open_workbook(wb2_path) as wb:
    sheet = wb.get_sheet('HC - Actual')

    # Initialize variables
    E15 = None
    F15 = None
    data = []

    # Iterate over rows and extract values
    for row in sheet.rows():
        row_index = row[0].r  # Row index starts at 1 in xlsb

        # Check and capture values in row 15
        if row_index == 15:
            E15 = row[4].v  # E15
            F15 = row[5].v  # F15

        # Capture all data starting from row 14 (header candidates and data)
        if row_index >= 14:
            data.append([item.v for item in row])

# Identify if row 14 or 15 should be headers based on E15 and F15 values
month_headers = ['M01', 'M02', 'M03', 'M04', 'M05', 'M06', 'M07', 'M08', 'M09', 'M10', 'M11', 'M12']

if E15 in month_headers and F15 in month_headers:
    header_row_index = 15  # Headers are in row 15
    # Use row 15 headers for columns A, B, C and month headers from row 15 for D onwards
    headers = data[1][:3] + data[1][3:]  # Combine headers from row 15 for D onwards
    start_data_row = 2     # Data starts from the row after headers (row 16)
else:
    header_row_index = 14  # Headers are in row 14
    # Use row 15 headers for columns A, B, C and month headers from row 14 for D onwards
    headers = data[1][:3] + data[0][3:]  # Use headers from row 15 for A, B, C, and row 14 for D onwards
    start_data_row = 2     # Data starts from row 15

# Extract the data from the relevant rows starting from row 15 (data rows)
final_data = data[start_data_row:]

# Create a DataFrame using pandas with the correct headers and data
df = pd.DataFrame(final_data, columns=headers)

# Output the DataFrame
print(df)

# Save the DataFrame to an Excel file if needed
output_path = 'final_data.xlsx'  # Update the output path as needed
df.to_excel(output_path, index=False)


In [7]:
import tkinter as tk
from tkinter import filedialog, messagebox

class ExcelSheetSelector:
    def __init__(self, root):
        self.root = root
        self.root.title("Excel Sheet Selector")

        # Custom labels for each file selection
        self.custom_labels = [
            "Select OFile:", "Select CFile:", "Select DFile:", "Select Fourth File:",
            "Select Fifth File:", "Select Sixth File:", "Select Seventh File:", "Select Eighth File:"
        ]

        # List to hold selected file paths, initially None
        self.selected_files = [None] * 8
        self.labels = []

        # Create custom labels and buttons for selecting files
        for i in range(8):
            # Create a custom label for each file selection
            custom_label = tk.Label(root, text=self.custom_labels[i], width=20, anchor='w')
            custom_label.grid(row=i, column=0, padx=10, pady=5)

            # Create a label to display the selected file path
            label = tk.Label(root, text="Not selected", width=60, anchor='w')
            label.grid(row=i, column=1, padx=10, pady=5)
            self.labels.append(label)

            # Create a button for each file selection
            button = tk.Button(root, text="Browse", command=lambda index=i: self.select_file(index))
            button.grid(row=i, column=2, padx=10, pady=5)

        # Button to confirm file selections
        confirm_button = tk.Button(root, text="Confirm Selection", command=self.confirm_selection)
        confirm_button.grid(row=8, column=0, columnspan=3, pady=20)

        # Predefined variable names for file paths
        self.OFile = None
        self.CFile = None
        self.DFile = None
        self.file4 = None
        self.file5 = None
        self.file6 = None
        self.file7 = None
        self.file8 = None

    def select_file(self, index):
        # Open file dialog to select a file
        file_path = filedialog.askopenfilename(title="Select a File", filetypes=[("All Files", "*.*")])
        
        if file_path:  # If a file is selected
            self.selected_files[index] = file_path  # Save the selected file
            self.labels[index].config(text=file_path)  # Update label with the file path

    def confirm_selection(self):
        # Check how many files have been selected
        num_selected = sum(1 for file in self.selected_files if file)

        if num_selected == 0:
            messagebox.showwarning("Warning", "No files selected.")
        else:
            # Show confirmation dialog with options to Confirm or Select More Files
            result = messagebox.askquestion(
                "Confirmation", 
                f"You have selected {num_selected} files. Do you want to proceed?", 
                icon='question'
            )
            
            if result == 'yes':
                # Directly assign file paths to predefined variables
                self.assign_file_paths()
                messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
                # Print assigned files for verification
                print(self.OFile, self.CFile, self.DFile, self.file4, self.file5, self.file6, self.file7, self.file8)
                # Close the GUI
                self.root.destroy()
            else:
                # Allow the user to select more files
                messagebox.showinfo("Select More", "Please select more files.")

    def assign_file_paths(self):
        """Assigns selected file paths to predefined variables."""
        self.OFile = self.selected_files[0] if len(self.selected_files) > 0 else None
        self.CFile = self.selected_files[1] if len(self.selected_files) > 1 else None
        self.DFile = self.selected_files[2] if len(self.selected_files) > 2 else None
        self.file4 = self.selected_files[3] if len(self.selected_files) > 3 else None
        self.file5 = self.selected_files[4] if len(self.selected_files) > 4 else None
        self.file6 = self.selected_files[5] if len(self.selected_files) > 5 else None
        self.file7 = self.selected_files[6] if len(self.selected_files) > 6 else None
        self.file8 = self.selected_files[7] if len(self.selected_files) > 7 else None

# Initialize the Tkinter window and ExcelSheetSelector class
if __name__ == "__main__":
    root = tk.Tk()
    app = ExcelSheetSelector(root)
    root.mainloop()


C:/Users/KS/data.xlsx None None None None None C:/Users/KS/final_output.xlsx None


In [5]:
import tkinter as tk
from tkinter import filedialog, messagebox

def select_file(index):
    # Open file dialog to select a file
    file_path = filedialog.askopenfilename(title="Select a File", filetypes=[("All Files", "*.*")])
    
    if file_path:  # If a file is selected
        selected_files[index] = file_path  # Save the selected file
        labels[index].config(text=file_path)  # Update label with the file path

def confirm_selection():
    # Check how many files have been selected
    num_selected = sum(1 for file in selected_files if file)
    
    if num_selected == 0:
        messagebox.showwarning("Warning", "No files selected.")
    else:
        # Show confirmation dialog with options to Confirm or Select More Files
        result = messagebox.askquestion("Confirmation", f"You have selected {num_selected} files. Do you want to proceed?", icon='question')
        
        if result == 'yes':
            # Proceed and close the application
            messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
            root.destroy()
        else:
            # Allow the user to select more files
            messagebox.showinfo("Select More", "Please select more files.")

# Initialize the Tkinter window
root = tk.Tk()
root.title("File Selector")

# List to hold selected file paths
selected_files = [None] * 8
labels = []

# Create labels and buttons for selecting files
for i in range(8):
    # Create a label for each file selection
    label = tk.Label(root, text=f"File {i+1}: Not selected", width=100, anchor='w')
    label.grid(row=i, column=0, padx=10, pady=5)
    labels.append(label)
    
    # Create a button for each file selection
    button = tk.Button(root, text="Select File", command=lambda index=i: select_file(index))
    button.grid(row=i, column=1, padx=10, pady=5)

# Button to confirm file selections
confirm_button = tk.Button(root, text="Confirm Selection", command=confirm_selection)
confirm_button.grid(row=8, column=0, columnspan=2, pady=20)

root.mainloop()


In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox

def select_files():
    # Open the file dialog to select files
    file_paths = filedialog.askopenfilenames(title="Select Files", filetypes=[("All Files", "*.*")])
    
    # Update the listbox with selected files
    listbox.delete(0, tk.END)
    for file in file_paths:
        listbox.insert(tk.END, file)
    
    # Save the selected files to the global variable
    selected_files.clear()
    selected_files.extend(file_paths)
    
    # Confirm the number of selected files
    if len(selected_files) == 0:
        messagebox.showinfo("Information", "No files selected.")
    else:
        messagebox.showinfo("Information", f"You have selected {len(selected_files)} files.")

def confirm_selection():
    # Proceed with the application logic
    if len(selected_files) > 0:
        messagebox.showinfo("Proceed", "Files are selected. Proceeding with the application.")
    else:
        messagebox.showwarning("Warning", "No files selected. Proceeding with the application.")

# Initialize the Tkinter window
root = tk.Tk()
root.title("File Selector")

selected_files = []

# Create a Listbox to display selected files
listbox = tk.Listbox(root, width=100, height=10)
listbox.pack(pady=10)

# Add buttons to select files and confirm selection
select_button = tk.Button(root, text="Select Files", command=select_files)
select_button.pack(pady=5)

confirm_button = tk.Button(root, text="Confirm Selection", command=confirm_selection)
confirm_button.pack(pady=5)

root.mainloop()


In [1]:
import pandas as pd

# Sample DataFrame
data = {
    'Record Type': ['EUC', 'EUC', 'EUC', 'Other', 'EUC'],
    'Record ID': [1, 2, 3, 4, 5],
    'Record Name': ['Name1', 'Name2', 'Name3', 'Name4', 'Name5'],
    'Staff ID': [101, 102, 102, 102, 103],
    'Position Changed': ['Left', 'No Change', 'Yes', 'Left', 'No Change']
}
df = pd.DataFrame(data)

# Define a function to concatenate control items and determine control domain
def concatenate_control_items(group):
    control_items = '\n'.join(group['Record Type'] + ' ' + group['Record ID'].astype(str) + ' ' + group['Record Name'])
    record_types = group['Record Type'].unique()
    if len(record_types) > 1:
        control_domain = 'mix'
    elif 'EUC' in record_types:
        control_domain = 'EUC only'
    else:
        control_domain = 'Other'
    return pd.Series({
        'Control Items': control_items,
        'Control Domain': control_domain
    })

# Group by 'Staff ID' and apply the function
result_df = df.groupby('Staff ID').apply(concatenate_control_items).reset_index()

# Merge the result back to the original DataFrame to maintain original rows
final_df = df.drop(columns=['Record Type', 'Record ID', 'Record Name']).drop_duplicates().merge(result_df, on='Staff ID', how='left')

# Print the final DataFrame
print(final_df)

# Save the result to an Excel file if needed
final_df.to_excel('final_output.xlsx', index=False)


   Staff ID Position Changed                            Control Items  \
0       101             Left                              EUC 1 Name1   
1       102        No Change  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
2       102              Yes  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
3       102             Left  EUC 2 Name2\nEUC 3 Name3\nOther 4 Name4   
4       103        No Change                              EUC 5 Name5   

  Control Domain  
0       EUC only  
1            mix  
2            mix  
3            mix  
4       EUC only  


In [6]:
import tkinter as tk
from tkinter import filedialog, messagebox

class ExcelSheetSelector:
    def __init__(self, root):
        self.root = root
        self.root.title("Excel Sheet Selector")

        self.file_paths = [None] * 7
        self.labels = []

        # Specific labels for each file selection
        self.label_texts = [
            "Select Humans file:",
            "Select Animals file:",
            "Select Plants file:",
            "Select Minerals file:",
            "Select Microbes file:",
            "Select Fungi file:",
            "Select Algae file:"
        ]

        # Create labels and buttons for 7 Excel sheets
        for i in range(7):
            label = tk.Label(root, text=self.label_texts[i])
            label.grid(row=i, column=0, padx=10, pady=5)
            
            button = tk.Button(root, text="Browse", command=lambda i=i: self.select_file(i))
            button.grid(row=i, column=1, padx=10, pady=5)
            
            file_label = tk.Label(root, text="No file selected")
            file_label.grid(row=i, column=2, padx=10, pady=5)
            self.labels.append(file_label)
        
        # Button to check and print selected file paths
        self.print_button = tk.Button(root, text="Confirm Selections", command=self.check_and_confirm)
        self.print_button.grid(row=7, column=0, columnspan=3, pady=10)

    def select_file(self, index):
        self.root.withdraw()  # Hide the main window
        file_path = filedialog.askopenfilename(filetypes=[("Excel files", "*.xlsx *.xls")])
        self.root.deiconify()  # Show the main window again
        if file_path:
            self.file_paths[index] = file_path
            self.labels[index].config(text=file_path)

    def check_and_confirm(self):
        if all(self.file_paths):
            self.root.destroy()  # Close the window gracefully
        else:
            messagebox.showwarning("Warning", "Please select all files before proceeding.")

# Create the main window
root = tk.Tk()
app = ExcelSheetSelector(root)
root.mainloop()

# After closing the window, capture the file paths into separate variables
humans_file, animals_file, plants_file, minerals_file, microbes_file, fungi_file, algae_file = app.file_paths

# Print the captured file paths (for verification)
print(f"Humans file: {humans_file}")
print(f"Animals file: {animals_file}")
print(f"Plants file: {plants_file}")
print(f"Minerals file: {minerals_file}")
print(f"Microbes file: {microbes_file}")
print(f"Fungi file: {fungi_file}")
print(f"Algae file: {algae_file}")


Humans file: C:/Users/KS/Start-Data-Analysis.xlsx
Animals file: C:/Users/KS/Source.xlsx
Plants file: C:/Users/KS/sample.xlsx
Minerals file: C:/Users/KS/sam.xlsx
Microbes file: C:/Users/KS/sf2_output.xlsx
Fungi file: C:/Users/KS/modified_workbook1.xlsx
Algae file: C:/Users/KS/modified_workbook.xlsx


In [1]:
import pandas as pd

# Sample data creation for demonstration
data = {
    'Record Type': ['Issue', 'Issue', 'Action', 'Issue', 'Action'],
    'Record ID': [101, 102, 103, 104, 105],
    'Record Name': ['A', 'B', 'C', 'D', 'E'],
    'Record Details': ['Detail1', 'Detail2', 'Detail3', 'Detail4', 'Detail5'],
    'Staff ID': [1, 1, 2, 1, 2],
    'Position Changed': ['Yes', 'No', 'Yes', 'No', 'Yes']
}

# Create DataFrame
df = pd.DataFrame(data)

# Define a function to concatenate the desired fields
def concatenate_records(group):
    return '\n'.join(group['Record Type'] + ' ' + group['Record ID'].astype(str) + ' ' + group['Record Name'])

# Group by 'Staff ID' and apply the concatenation function
grouped_df = df.groupby('Staff ID').apply(lambda x: pd.Series({
    'Control Items': concatenate_records(x),
    'Position Changed': x['Position Changed'].iloc[0]  # Assuming you want to keep the first 'Position Changed' value
})).reset_index()

# Optional: Include any other columns you want to keep from the original DataFrame
# For demonstration, I'm adding 'Record Details' concatenated in the same way
grouped_df['Record Details'] = df.groupby('Staff ID')['Record Details'].apply(lambda x: '\n'.join(x)).reset_index(drop=True)

# Display the final DataFrame
print(grouped_df)

# Save the final DataFrame to an Excel file
# grouped_df.to_excel('path_to_output_file.xlsx', index=False)


   Staff ID                          Control Items Position Changed  \
0         1  Issue 101 A\nIssue 102 B\nIssue 104 D              Yes   
1         2             Action 103 C\nAction 105 E              Yes   

              Record Details  
0  Detail1\nDetail2\nDetail4  
1           Detail3\nDetail5  


In [None]:
import pandas as pd

def lookup_and_merge(file1_path, file2_path, vOld_path):
    # Read the first file into a DataFrame
    df1 = pd.read_excel(file1_path)
    
    # Read the second file into a DataFrame
    df2 = pd.read_excel(file2_path)
    
    # Read the vOld file into a DataFrame
    vOld = pd.read_excel(vOld_path)
    
    # Ensure the Staff ID in both DataFrames are treated as numeric for accurate matching
    df1['Employee ID'] = pd.to_numeric(df1['Employee ID'], errors='coerce').astype('Int64')
    df2['Staff ID'] = pd.to_numeric(df2['Staff ID'], errors='coerce').astype('Int64')
    vOld['Staff ID'] = pd.to_numeric(vOld['Staff ID'], errors='coerce').astype('Int64')
    
    # Perform the lookup by merging the DataFrames on Staff ID and Employee ID
    merged_df = df2.merge(df1[['Employee ID', 'Position Changed']], 
                          left_on='Staff ID', right_on='Employee ID', 
                          how='left')
    
    # Drop the redundant 'Employee ID' column from the merged DataFrame
    merged_df.drop(columns=['Employee ID'], inplace=True)
    
    # Filter the DataFrame to retain only rows where 'Position Changed' is 'Left' or 'Yes'
    filtered_df = merged_df[merged_df['Position Changed'].isin(['Left', 'Yes'])]
    
    # Ensure 'Staff ID' and 'Position Changed' columns are filled properly
    filtered_df['Staff ID'] = filtered_df['Staff ID'].fillna('No ID')
    filtered_df['Position Changed'] = filtered_df['Position Changed'].fillna('No Change')
    
    # Perform the second lookup by merging the filtered_df with vOld on Staff ID
    final_df = filtered_df.merge(vOld[['Staff ID', 'Employee Name', 'Employee Business Email Address', 'Global Career Band']], 
                                 on='Staff ID', 
                                 how='left')
    
    # Rename the fetched columns
    final_df.rename(columns={
        'Employee Name': 'Functional Manager Employee Name',
        'Employee Business Email Address': 'Functional Manager Email',
        'Global Career Band': 'Functional Manager GCB'
    }, inplace=True)
    
    # Return the final DataFrame
    return final_df

# Paths to the Excel files
file1_path = 'path_to_file1.xlsx'  # File with Employee ID, Position Changed, etc.
file2_path = 'path_to_file2.xlsx'  # File with Record ID, Record Name, Record Details, Staff ID
vOld_path = 'path_to_vOld.xlsx'    # File with additional details based on Staff ID

# Perform the lookup, merge, and filter
final_df = lookup_and_merge(file1_path, file2_path, vOld_path)

# Display the final DataFrame
print(final_df)

# Save the final DataFrame to a new Excel file
final_df.to_excel('path_to_output_file.xlsx', index=False)


In [None]:
import pandas as pd

def lookup_and_merge(file1_path, file2_path):
    # Read the first file into a DataFrame
    df1 = pd.read_excel(file1_path)
    
    # Read the second file into a DataFrame
    df2 = pd.read_excel(file2_path)
    
    # Ensure the Staff ID in both DataFrames are strings for accurate matching
    df1['Employee ID'] = df1['Employee ID'].astype(str)
    df2['Staff ID'] = df2['Staff ID'].astype(str)
    
    # Perform the lookup by merging the DataFrames on Staff ID and Employee ID
    merged_df = df2.merge(df1[['Employee ID', 'Position Changed']], 
                          left_on='Staff ID', right_on='Employee ID', 
                          how='left')
    
    # Drop the redundant 'Employee ID' column from the merged DataFrame
    merged_df.drop(columns=['Employee ID'], inplace=True)
    
    # Return the merged DataFrame
    return merged_df

# Paths to the Excel files
file1_path = 'path_to_file1.xlsx'  # File with Employee ID, Position Changed, etc.
file2_path = 'path_to_file2.xlsx'  # File with Record ID, Record Name, Record Details, Staff ID

# Perform the lookup and merge
final_df = lookup_and_merge(file1_path, file2_path)

# Display the final DataFrame
print(final_df)

# If needed, save the final DataFrame to a new Excel file
# final_df.to_excel('path_to_output_file.xlsx', index=False)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Check and print initial dtypes of columns
    print("Initial dtypes of vNew:")
    print(vNew.dtypes)
    print("\nInitial dtypes of vOld:")
    print(vOld.dtypes)
    
    # Function to convert columns to int, skipping invalid rows
    def convert_to_int(df, column):
        df[column] = pd.to_numeric(df[column], errors='coerce')
        df.dropna(subset=[column], inplace=True)
        df[column] = df[column].astype(int)
        return df

    # Convert 'Employee ID' and 'Position Number' to integers
    vNew = convert_to_int(vNew, 'Employee ID')
    vNew = convert_to_int(vNew, 'Position Number')
    vOld = convert_to_int(vOld, 'Employee ID')
    vOld = convert_to_int(vOld, 'Position Number')
    
    # Check and print dtypes of columns after coercion attempt
    print("\nDtypes of vNew after coercion attempt:")
    print(vNew.dtypes)
    print("\nDtypes of vOld after coercion attempt:")
    print(vOld.dtypes)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID using left join
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='left')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID using left join
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='left')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd

def process_files(vnew_path, vold_path):
    # Read the Excel files into DataFrames
    vNew = pd.read_excel(vnew_path)
    vOld = pd.read_excel(vold_path)
    
    # Select necessary columns from vNew
    vNew_selected = vNew[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vNew_selected.rename(columns={
        'Position Number': 'new Position Number',
        'BF Level 4': 'new BF Level 4'
    }, inplace=True)
    
    # Select necessary columns from vOld
    vOld_selected = vOld[['Employee ID', 'Position Number', 'BF Level 4']].copy()
    vOld_selected.rename(columns={
        'Position Number': 'old Position Number',
        'BF Level 4': 'old BF Level 4'
    }, inplace=True)
    
    # Merge the DataFrames on Employee ID
    merged_df = pd.merge(vOld_selected, vNew_selected, on='Employee ID', how='outer')
    
    # Create the Position Changed column based on the specified conditions
    def determine_position_changed(row):
        if pd.isna(row['new Position Number']) and pd.isna(row['new BF Level 4']):
            return 'Left'
        elif (row['old Position Number'] != row['new Position Number']) or (row['old BF Level 4'] != row['new BF Level 4']):
            return 'Yes'
        else:
            return 'No Change'
    
    merged_df['Position Changed'] = merged_df.apply(determine_position_changed, axis=1)
    
    return merged_df

# Example usage
vnew_path = 'path_to_vNew.xlsx'
vold_path = 'path_to_vOld.xlsx'

final_df = process_files(vnew_path, vold_path)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

def extract_staff_id(action_owner_col):
    """Extract Staff ID from the given column using regex."""
    if isinstance(action_owner_col, str):
        match = re.search(r'\((\d{8})\)', action_owner_col)
        return match.group(1) if match else None
    return None

def process_single_file(file_path, staff_id_col, issue_status_col, filter_condition,
                        record_id_col, record_name_col, record_details_col, record_type_value):
    """
    Process a single file to extract relevant data and transform it into a standardized format.
    
    Parameters:
        file_path (str): Path to the input Excel file.
        staff_id_col (str): Column name for the Staff ID extraction.
        issue_status_col (str): Column name for filtering records.
        filter_condition (str): Value to filter the records (e.g., 'open').
        record_id_col (str): Column name for 'Record ID'.
        record_name_col (str): Column name for 'Record Name'.
        record_details_col (str): Column name for 'Record Details'.
        record_type_value (str): Value to be placed in 'Record Type' column.
    
    Returns:
        pd.DataFrame: Processed DataFrame with standardized columns.
    """
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    
    # Ensure the staff ID column is treated as string
    df[staff_id_col] = df[staff_id_col].astype(str)
    
    # Filter the DataFrame based on the provided condition
    filtered_df = df[df[issue_status_col] == filter_condition]
    
    # Extract Staff ID from the specified column
    filtered_df['Staff ID'] = filtered_df[staff_id_col].apply(extract_staff_id)
    
    # Add the 'Record Type' column with the specified value
    filtered_df['Record Type'] = record_type_value
    
    # Create the final DataFrame with the required columns
    result_df = filtered_df[[record_id_col, record_name_col, record_details_col]].copy()
    result_df.rename(columns={
        record_id_col: 'Record ID',
        record_name_col: 'Record Name',
        record_details_col: 'Record Details'
    }, inplace=True)
    
    # Add 'Staff ID' and 'Record Type' columns
    result_df['Staff ID'] = filtered_df['Staff ID']
    result_df['Record Type'] = filtered_df['Record Type']
    
    # Return the result DataFrame
    return result_df

def concatenate_files(file_params_list):
    """
    Process multiple files and concatenate the results into a single DataFrame.
    
    Parameters:
        file_params_list (list of dict): List of dictionaries with parameters for each file.
    
    Returns:
        pd.DataFrame: Concatenated DataFrame with results from all files.
    """
    all_dfs = []
    
    for params in file_params_list:
        df = process_single_file(
            file_path=params['file_path'],
            staff_id_col=params['staff_id_col'],
            issue_status_col=params['issue_status_col'],
            filter_condition=params['filter_condition'],
            record_id_col=params['record_id_col'],
            record_name_col=params['record_name_col'],
            record_details_col=params['record_details_col'],
            record_type_value=params['record_type_value']
        )
        all_dfs.append(df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(all_dfs, ignore_index=True)
    
    return final_df

# Example usage
file_params_list = [
    {
        'file_path': 'path_to_file1.xlsx',
        'staff_id_col': 'Action Owner',
        'issue_status_col': 'Issue Status',
        'filter_condition': 'open',
        'record_id_col': 'Issue ID',
        'record_name_col': 'Issue Title',
        'record_details_col': 'Issue Description',
        'record_type_value': 'Issue'
    },
    {
        'file_path': 'path_to_file2.xlsx',
        'staff_id_col': 'Owner',
        'issue_status_col': 'Status',
        'filter_condition': 'open',
        'record_id_col': 'ID',
        'record_name_col': 'Title',
        'record_details_col': 'Description',
        'record_type_value': 'Action Owner'
    },
    # Add parameters for other files here
]

# Process multiple files and get the final DataFrame
final_df = concatenate_files(file_params_list)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

def extract_staff_id(action_owner_col):
    """Extract Staff ID from the given column using regex."""
    if isinstance(action_owner_col, str):
        match = re.search(r'\((\d{8})\)', action_owner_col)
        return match.group(1) if match else None
    return None

def process_file(file_path, staff_id_col, issue_status_col, filter_condition,
                 record_id_col, record_name_col, record_details_col):
    """
    Process a single file to extract relevant data and transform it into a standardized format.
    
    Parameters:
        file_path (str): Path to the input Excel file.
        staff_id_col (str): Column name for the Staff ID extraction.
        issue_status_col (str): Column name for filtering records.
        filter_condition (str): Value to filter the records (e.g., 'open').
        record_id_col (str): Column name for 'Record ID'.
        record_name_col (str): Column name for 'Record Name'.
        record_details_col (str): Column name for 'Record Details'.
    
    Returns:
        pd.DataFrame: Processed DataFrame with standardized columns.
    """
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    
    # Ensure the staff ID column is treated as string
    df[staff_id_col] = df[staff_id_col].astype(str)
    
    # Filter the DataFrame based on the provided condition
    filtered_df = df[df[issue_status_col] == filter_condition]
    
    # Extract Staff ID from the specified column
    filtered_df['Staff ID'] = filtered_df[staff_id_col].apply(extract_staff_id)
    
    # Add the 'Record Type' column with the value 'Issue'
    filtered_df['Record Type'] = 'Issue'
    
    # Create the final DataFrame with the required columns
    result_df = filtered_df[[staff_id_col, record_id_col, record_name_col, record_details_col]].copy()
    result_df.rename(columns={
        staff_id_col: 'Staff ID',
        record_id_col: 'Record ID',
        record_name_col: 'Record Name',
        record_details_col: 'Record Details'
    }, inplace=True)
    
    # Add the 'Record Type' column
    result_df['Record Type'] = 'Issue'
    
    # Return the result DataFrame
    return result_df

def process_multiple_files(file_paths, column_mapping, filter_condition):
    """
    Process multiple files and concatenate the results into a single DataFrame.
    
    Parameters:
        file_paths (list of str): List of paths to input Excel files.
        column_mapping (list of dict): List of dictionaries with column mappings for each file.
        filter_condition (str): Value to filter records (e.g., 'open').
    
    Returns:
        pd.DataFrame: Concatenated DataFrame with results from all files.
    """
    all_dfs = []
    
    for file_path, columns in zip(file_paths, column_mapping):
        df = process_file(
            file_path,
            staff_id_col=columns['staff_id'],
            issue_status_col=columns['issue_status'],
            filter_condition=filter_condition,
            record_id_col=columns['record_id'],
            record_name_col=columns['record_name'],
            record_details_col=columns['record_details']
        )
        all_dfs.append(df)
    
    # Concatenate all DataFrames
    final_df = pd.concat(all_dfs, ignore_index=True)
    
    return final_df

# Example usage
file_paths = [
    'path_to_file1.xlsx',
    'path_to_file2.xlsx',
    'path_to_file3.xlsx',
    'path_to_file4.xlsx',
    'path_to_file5.xlsx'
]

column_mapping = [
    {
        'staff_id': 'Action Owner',  # Column name to extract Staff ID
        'issue_status': 'Issue Status',  # Column name for issue status
        'record_id': 'Issue ID',  # Column name for Record ID
        'record_name': 'Issue Title',  # Column name for Record Name
        'record_details': 'Issue Description'  # Column name for Record Details
    },
    # Add mappings for other files here
]

filter_condition = 'open'  # Condition to filter records

# Process multiple files and get the final DataFrame
final_df = process_multiple_files(file_paths, column_mapping, filter_condition)

# Display the final DataFrame
print(final_df)


In [None]:
import pandas as pd
import re

# Function to extract Staff ID using regular expression
def extract_staff_id(action_owner):
    if isinstance(action_owner, str):
        match = re.search(r'\((\d{8})\)', action_owner)
        return match.group(1) if match else None
    return None

# Load the Excel file into a DataFrame
df = pd.read_excel('path_to_your_file.xlsx')

# Ensure "Action Owner" column is treated as string
df['Action Owner'] = df['Action Owner'].astype(str)

# Filter the DataFrame for rows where 'Issue Status' is 'open'
open_issues_df = df[df['Issue Status'] == 'open']

# Extract Staff ID from "Action Owner" column for filtered rows
open_issues_df['Staff ID'] = open_issues_df['Action Owner'].apply(extract_staff_id)

# Add the 'Record Type' column with value 'Issue'
open_issues_df['Record Type'] = 'Issue'

# Create the final DataFrame with the required columns
result_df = open_issues_df[['Staff ID', 'Record Type', 'Issue ID', 'Issue Title', 'Issue Description']]
result_df.rename(columns={
    'Issue ID': 'Record ID',
    'Issue Title': 'Record Name',
    'Issue Description': 'Record Details'
}, inplace=True)

# Display the final DataFrame
print(result_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select relevant columns from vNew and vOld
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge vNew and vOld on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', suffixes=('_new', '_old'), how='left')

# Rename columns to match the required format
merged_df.rename(columns={
    'Position Number_new': 'new Position Number',
    'Level4_new': 'new Level4',
    'Position Number_old': 'old Position Number',
    'Level4_old': 'old Level4'
}, inplace=True)

# Create the "Position Changed" column
merged_df['Position Changed'] = merged_df.apply(
    lambda row: 'New' if pd.isna(row['old Position Number']) or pd.isna(row['old Level4']) else 'No Change', axis=1)

# Fetch records from vOld where Employee ID is not in vNew
not_in_vNew = vOld[~vOld['Employee ID'].isin(vNew['Employee ID'])]

# Rename columns to match the format of merged_df
not_in_vNew.rename(columns={
    'Position Number': 'old Position Number',
    'Level4': 'old Level4'
}, inplace=True)

# Add new columns with default values for records not in vNew
not_in_vNew['new Position Number'] = None
not_in_vNew['new Level4'] = None
not_in_vNew['Position Changed'] = 'Left'

# Reorder columns to match the merged_df structure
not_in_vNew = not_in_vNew[['Employee ID', 'new Position Number', 'new Level4', 'old Position Number', 'old Level4', 'Position Changed']]

# Concatenate the two DataFrames
final_df = pd.concat([merged_df, not_in_vNew], ignore_index=True)

# Display the final DataFrame
print(final_df)


# Display the final DataFrame
print(merged_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select relevant columns from vNew and vOld
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge vNew and vOld on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', suffixes=('_new', '_old'), how='left')

# Rename columns to match the required format
merged_df.rename(columns={'Position Number_new': 'new Position Number',
                          'Level4_new': 'new Level4',
                          'Position Number_old': 'old Position Number',
                          'Level4_old': 'old Level4'}, inplace=True)

# Display the final DataFrame
print(merged_df)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Ensure column names are properly stripped of leading/trailing spaces
vNew.columns = vNew.columns.str.strip()
vOld.columns = vOld.columns.str.strip()

# Initialize new columns in vOld with default value 'left'
vOld['Pos Check'] = 'left'
vOld['BF4 Check'] = 'left'
vOld['Country Check'] = 'left'

# Perform the lookup and update the values for matching Employee Ids
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Pos Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Position Number'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'BF4 Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['BF Level 4 Name'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Country Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Work Location Country/Territory Name'])

# Create the 'In Scope' column
vOld['In Scope'] = ((vOld['Pos Check'] != vOld['Position Number']) | 
                    (vOld['BF4 Check'] != vOld['BF Level 4 Name']) | 
                    (vOld['Country Check'] != vOld['Work Location Country/Territory Name'])).apply(lambda x: 'Movement' if x else 'No Movement')

# Create 'Position Changed', 'BF Changed', and 'Country Changed' columns
vOld['Position Changed'] = vOld.apply(lambda row: 'Left' if row['Pos Check'] == 'left' else ('Yes' if row['Pos Check'] != row['Position Number'] else 'No'), axis=1)
vOld['BF Changed'] = vOld.apply(lambda row: 'Left' if row['BF4 Check'] == 'left' else ('Yes' if row['BF4 Check'] != row['BF Level 4 Name'] else 'No'), axis=1)
vOld['Country Changed'] = vOld.apply(lambda row: 'Left' if row['Country Check'] == 'left' else ('Yes' if row['Country Check'] != row['Work Location Country/Territory Name'] else 'No'), axis=1)

# Display the updated vOld DataFrame
print(vOld)


In [None]:
import pandas as pd

# Read the Excel files into DataFrames
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Ensure column names are properly stripped of leading/trailing spaces
vNew.columns = vNew.columns.str.strip()
vOld.columns = vOld.columns.str.strip()

# Initialize new columns in vOld with default value 'left'
vOld['Pos Check'] = 'left'
vOld['BF4 Check'] = 'left'
vOld['Country Check'] = 'left'

# Perform the lookup and update the values for matching Employee Ids
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Pos Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Position Number'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'BF4 Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['BF Level 4 Name'])
vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Country Check'] = vOld.loc[vOld['Employee ID'].isin(vNew['Employee ID']), 'Employee ID'].map(vNew.set_index('Employee ID')['Work Location Country/Territory Name'])

# Display the updated vOld DataFrame
print(vOld)


In [None]:
import pandas as pd

# Read the new and old data from the Excel workbooks
vNew = pd.read_excel('path_to_vNew.xlsx')
vOld = pd.read_excel('path_to_vOld.xlsx')

# Select the required columns
vNew = vNew[['Employee ID', 'Position Number', 'Level4']]
vOld = vOld[['Employee ID', 'Position Number', 'Level4']]

# Merge the dataframes on Employee ID
merged_df = pd.merge(vNew, vOld, on='Employee ID', how='outer', suffixes=('_new', '_old'))

# Create the Status column
merged_df['Status'] = merged_df.apply(
    lambda row: 'new' if pd.isna(row['Position Number_old']) and not pd.isna(row['Position Number_new']) else 
                ('left' if pd.isna(row['Position Number_new']) and not pd.isna(row['Position Number_old']) else 
                'existing'), axis=1)

# Save the resulting dataframe to a new Excel file
merged_df.to_excel('path_to_output.xlsx', index=False)


In [None]:
import pandas as pd
from openpyxl import Workbook, load_workbook

# Read the Excel file
file_path = 'your_excel_file.xlsx'  # Replace with your file path
output_file_path = 'grouped_data_with_subtables.xlsx'  # Replace with your desired output file path
excel_data = pd.ExcelFile(file_path)

# Create a Pandas Excel writer object
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    for sheet_name in excel_data.sheet_names:
        # Read each sheet into a DataFrame
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Group the data by 'Mapped L3' and 'Role_temp'
        grouped = df.groupby(['Mapped L3', 'Role_temp'])

        # Count the occurrences of 'High' and 'Low' in the 'High/Low' column
        result = grouped['High/Low'].value_counts().unstack(fill_value=0).fillna(0)

        # Add totals for each row ('Mapped L3', 'Role_temp')
        result['Total'] = result.sum(axis=1)

        # Add a totals row
        total_row = result.sum(axis=0).to_frame().T
        total_row.index = pd.MultiIndex.from_tuples([('Total', '')])
        result = pd.concat([result, total_row])

        # Reset the index to get 'Mapped L3' and 'Role_temp' as columns
        result.reset_index(inplace=True)

        # Ensure 'High' and 'Low' are present, if not add them with default 0 values
        if 'High' not in result.columns:
            result['High'] = 0
        if 'Low' not in result.columns:
            result['Low'] = 0
        result = result[['Mapped L3', 'Role_temp', 'High', 'Low', 'Total']]

        # Write the grouped data to the sheet
        result.to_excel(writer, sheet_name=sheet_name, startrow=0, index=False)
        
        # Load the workbook and access the worksheet
        workbook = writer.book
        worksheet = workbook[sheet_name]

        # Initialize start row for writing subtables
        start_row = len(result) + 2  # Move to the next row after grouped data

        # Create a list to store all subtables
        subtables_list = []

        # Create subtables for each unique 'Mapped L3'
        unique_mapped_l3 = df['Mapped L3'].unique()
        for mapped_l3 in unique_mapped_l3:
            # Skip 'Total' rows if present
            if mapped_l3 == 'Total':
                continue

            # Filter the original DataFrame based on 'Mapped L3'
            mapped_l3_data = df[df['Mapped L3'] == mapped_l3]

            # Insert a heading for unique 'Mapped L3'
            mapped_l3_heading = f"Unique L3 Value: {mapped_l3}"
            subtables_list.append((mapped_l3_heading, None))  # Append heading to list

            # Iterate over each unique 'Role_temp' for the current 'Mapped L3'
            unique_role_temp = mapped_l3_data['Role_temp'].unique()
            for role_temp in unique_role_temp:
                # Filter the data for the current 'Mapped L3' and 'Role_temp'
                subtable_data = mapped_l3_data[mapped_l3_data['Role_temp'] == role_temp]

                # Insert a heading for 'Role_temp'
                role_temp_heading = f"Role Type: {role_temp}"
                subtables_list.append((role_temp_heading, None))  # Append heading to list

                # Append the subtable data to the list
                subtables_list.append((None, subtable_data))

        # Write subtables to Excel with proper gaps
        for item in subtables_list:
            if item[0]:  # If it's a heading
                worksheet.cell(row=start_row, column=1, value=item[0])
                start_row += 2  # 2-line gap before next heading or subtable
            elif item[1] is not None:  # If it's subtable data
                item[1].to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False, header=True)
                start_row += len(item[1]) + 1  # 1-line gap after subtable

print(f"Grouped data with subtables has been saved to {output_file_path}")


In [None]:
import pandas as pd

# Read the Excel file
file_path = 'your_excel_file.xlsx'  # Replace with your file path
output_file_path = 'grouped_data_with_subtables.xlsx'  # Replace with your desired output file path
excel_data = pd.ExcelFile(file_path)

# Create a Pandas Excel writer object
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    for sheet_name in excel_data.sheet_names:
        # Read each sheet into a DataFrame
        df = pd.read_excel(excel_data, sheet_name=sheet_name)

        # Group the data by 'Mapped L3' and 'Role_temp'
        grouped = df.groupby(['Mapped L3', 'Role_temp'])

        # Count the occurrences of 'High' and 'Low' in the 'High/Low' column
        result = grouped['High/Low'].value_counts().unstack(fill_value=0).fillna(0)

        # Add totals for each row ('Mapped L3', 'Role_temp')
        result['Total'] = result.sum(axis=1)

        # Add a totals row
        total_row = result.sum(axis=0).to_frame().T
        total_row.index = pd.MultiIndex.from_tuples([('Total', '')])
        result = pd.concat([result, total_row])

        # Reset the index to get 'Mapped L3' and 'Role_temp' as columns
        result.reset_index(inplace=True)

        # Ensure 'High' and 'Low' are present, if not add them with default 0 values
        if 'High' not in result.columns:
            result['High'] = 0
        if 'Low' not in result.columns:
            result['Low'] = 0
        result = result[['Mapped L3', 'Role_temp', 'Total', 'High', 'Low']]

        # Initialize start row for writing grouped data
        start_row = 0

        # Write the grouped data to the sheet
        result.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)
        start_row += len(result) + 2  # Move to the next row after grouped data

        # Create subtables for each unique 'Mapped L3'
        unique_mapped_l3 = df['Mapped L3'].unique()
        for mapped_l3 in unique_mapped_l3:
            # Skip 'Total' rows if present
            if mapped_l3 == 'Total':
                continue

            # Filter the original DataFrame based on 'Mapped L3'
            mapped_l3_data = df[df['Mapped L3'] == mapped_l3]

            # Insert a heading for unique 'Mapped L3'
            mapped_l3_heading = f"Unique L3 Value: {mapped_l3}"
            worksheet = writer.sheets[sheet_name]
            worksheet.cell(row=start_row, column=1, value=mapped_l3_heading)

            # Move to the next row after writing unique L3 heading
            start_row += 2

            # Iterate over each unique 'Role_temp' for the current 'Mapped L3'
            unique_role_temp = mapped_l3_data['Role_temp'].unique()
            for role_temp in unique_role_temp:
                # Filter the data for the current 'Mapped L3' and 'Role_temp'
                subtable_data = mapped_l3_data[mapped_l3_data['Role_temp'] == role_temp]

                # Insert a heading for 'Role_temp'
                role_temp_heading = f"Role Type: {role_temp}"
                worksheet.cell(row=start_row, column=1, value=role_temp_heading)

                # Move to the next row after writing role_temp heading
                start_row += 2

                # Insert an empty row to separate headings from subtable data
                worksheet.cell(row=start_row, column=1, value="")

                # Move to the next row after inserting empty row
                start_row += 1

                # Write the subtable data to the sheet
                subtable_data.to_excel(writer, sheet_name=sheet_name, startrow=start_row, index=False)

                # Move to the next row after writing subtable data
                start_row += len(subtable_data) + 2  # Add extra space after each subtable

print(f"Grouped data with subtables has been saved to {output_file_path}")


In [12]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4, 7],
    'colB': [10, 20, 30, 40, 50],
    'colS': [None, None, None, None, None]  # Initially None or some default value
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    'colPM': [7, 8, 9, 10, 11, 12]
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA, colR, and colPM are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)
df2['colPM'] = df2['colPM'].astype(str)

# Initial merge using colA and colR
df_merged_initial = pd.merge(df1, df2[['colR', 'colS']], left_on='colA', right_on='colR', how='left')

# Update df1's colS with the values from initial merge
df1['colS'] = df_merged_initial['colS_y']

# Identify rows where colS is still NaN
mask = df1['colS'].isna()

# Perform the second merge using colA and colPM for the NaN rows
df_merged_second = pd.merge(df1[mask], df2[['colPM', 'colS']], left_on='colA', right_on='colPM', how='left')

# Update colS in df1 for the NaN rows with values from the second merge
df1.loc[mask, 'colS'] = df_merged_second['colS_y']

print(df1)


  colA  colB colS
0    1    10  NaN
1    2    20  NaN
2    3    30    a
3    4    40    b
4    7    50    c


In [14]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4, 7],
    'colB': [10, 20, 30, 40, 50],
    'colS': [None, '', ' ', '   ', '']  # Different types of empty or blank values
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    'colPM': [7, 8, 9, 10, 11, 12]
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA, colR, and colPM are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)
df2['colPM'] = df2['colPM'].astype(str)

# Initial merge using colA and colR
df_merged_initial = pd.merge(df1, df2[['colR', 'colS']], left_on='colA', right_on='colR', how='left')

# Update df1's colS with the values from initial merge
df1['colS'] = df_merged_initial['colS_y']

# Identify rows where colS is empty or blank
mask = df1['colS'].apply(lambda x: x == '' or x.isspace() if isinstance(x, str) else False)

# Perform the second merge using colA and colPM for the empty or blank rows
df_merged_second = pd.merge(df1[mask], df2[['colPM', 'colS']], left_on='colA', right_on='colPM', how='left')

# Update colS in df1 for the empty or blank rows with values from the second merge
df1.loc[mask, 'colS'] = df_merged_second['colS_y']

print(df1)


  colA  colB colS
0    1    10  NaN
1    2    20  NaN
2    3    30    a
3    4    40    b
4    7    50    c


In [6]:
df_merged_pm

Unnamed: 0,colA,colB,colS_x,colPM,colS_y
0,1,10,,,
1,2,20,,,
2,7,50,,7.0,a


In [1]:
import pandas as pd

# Sample data for df1 with 30 columns (only a few shown here for simplicity)
data1 = {
    'colA': [1, 2, 3, 4],
    'colB': [10, 20, 30, 40],
    # Add other columns as needed
}
df1 = pd.DataFrame(data1)

# Sample data for df2 with 45 columns (only a few shown here for simplicity)
data2 = {
    'colR': [3, 3, 4, 5, 6, 6],
    'colS': ['a', 'b', 'c', 'd', 'e', 'f'],
    # Add other columns as needed
}
df2 = pd.DataFrame(data2)

# Ensure the data types of colA and colR are the same
df1['colA'] = df1['colA'].astype(str)
df2['colR'] = df2['colR'].astype(str)

# Select only the required columns from df2
df2_selected = df2[['colR', 'colS']].drop_duplicates(subset='colR')

# Merge df1 with df2_selected on colA and colR
df_merged = pd.merge(df1, df2_selected, left_on='colA', right_on='colR', how='left')

# Assign the values to the new column in df1 and drop any extra columns
df1['colS1'] = df_merged['colS']

# Drop the extra merge column if needed (not strictly necessary, but clean)
df1.drop(columns=['colR'], inplace=True, errors='ignore')

print(df1)


  colA  colB colS1
0    1    10   NaN
1    2    20   NaN
2    3    30     a
3    4    40     c


In [1]:
import pandas as pd

# Sample data for df1
data1 = {'colA': [1, 2, 3, 4]}
df1 = pd.DataFrame(data1)

# Sample data for df2 with non-unique colR values
data2 = {'colR': [3, 3, 4, 5, 6, 6], 'colS': ['a', 'b', 'c', 'd', 'e', 'f']}
df2 = pd.DataFrame(data2)

# Merge df1 with df2 on the condition that df1['colA'] matches df2['colR']
df_merged = pd.merge(df1, df2, left_on='colA', right_on='colR', how='left')

# Select only relevant columns and rename them
df1['colS1'] = df_merged['colS']

print(df1)


   colA colS1
0     1   NaN
1     2   NaN
2     3     a
3     4     b


In [2]:
import pandas as pd

# Sample DataFrames
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'colA': ['A1', 'A2', 'A3', 'A4', 'A5'],
    'colB': ['B1', 'B2', 'B3', 'B4', 'B5'],
    'colC': ['C1', 'C2', 'C3', 'C4', 'C5']
})

df2 = pd.DataFrame({
    'id': [6, 7, 8],
    'colX': ['X6', 'X7', 'X8'],
    'colY': ['Y6', 'Y7', 'Y8'],
    'colZ': ['Z6', 'Z7', 'Z8']
})

# Columns to copy from df1 and their corresponding columns in df2
columns_to_copy = {
    'id': 'id',
    'colB': 'colY',
    'colC': 'colZ'
}

# Create a new DataFrame with the selected columns from df1
new_rows = df1[list(columns_to_copy.keys())].copy()

# Rename the columns in the new DataFrame to match the column names in df2
new_rows.rename(columns=columns_to_copy, inplace=True)

# Append the new DataFrame to df2
df2 = df2.append(new_rows, ignore_index=True)

# Display the updated df2
print(df2)


   id colX colY colZ
0   6   X6   Y6   Z6
1   7   X7   Y7   Z7
2   8   X8   Y8   Z8
3   1  NaN   B1   C1
4   2  NaN   B2   C2
5   3  NaN   B3   C3
6   4  NaN   B4   C4
7   5  NaN   B5   C5
