# In [0]:
# Final code for removing duplicates - set 3

import os
import pandas as pd

def find_last_level_folders(main_folder):
    """
    Collect every leaf directory found beneath ``main_folder``.

    A directory is treated as a leaf when os.walk reports at least one file
    in it and no subdirectories. Directories without files are ignored.

    :param main_folder: Root directory path to walk.
    :return: List of leaf directory paths, in os.walk traversal order.
    """
    return [
        current_dir
        for current_dir, subdirs, filenames in os.walk(main_folder)
        if filenames and not subdirs  # has files, but nothing nested below it
    ]

def _drop_duplicate_rows(df):
    """
    Return ``df`` without duplicate rows, keeping the first occurrence.

    When the frame has more than one column, the last column is excluded from
    the comparison; with a single column, exact duplicates are removed.

    :param df: DataFrame holding one sheet's data.
    :return: De-duplicated DataFrame (the input is not modified).
    """
    if df.shape[1] > 1:
        # Compare on every column except the last one.
        return df.loc[~df.iloc[:, :-1].duplicated()]
    return df.drop_duplicates()


def remove_duplicates_from_last_level_folders(main_folder):
    """
    Processes Excel files only in the last-level subfolders, removes duplicate rows
    (excluding the last column from the comparison), and saves the unique data back to the same file.

    Empty sheets are reported and written back unchanged, so they are not lost
    when the workbook is rewritten. A workbook is only rewritten when at least
    one duplicate row was removed; otherwise the file is left untouched.
    Errors are reported per file and do not stop the remaining files from
    being processed.

    NOTE(review): '.xls' files match the filter but are written with the
    openpyxl engine, which targets '.xlsx' -- confirm whether '.xls' output is
    expected to work.

    :param main_folder: Main directory path to scan.
    """
    print("üîç Identifying last-level folders...\n")

    last_level_folders = find_last_level_folders(main_folder)

    if not last_level_folders:
        print("‚ö†Ô∏è No last-level folders found! Exiting...")
        return

    print(f"‚úÖ Found {len(last_level_folders)} last-level folders to process.\n")

    for folder in last_level_folders:
        print(f"üìÇ Processing folder: {folder}")

        excel_files = [file for file in os.listdir(folder) if file.endswith(('.xlsx', '.xls'))]

        for excel_file in excel_files:
            file_path = os.path.join(folder, excel_file)
            print(f"üìÑ Processing: {file_path}")

            try:
                cleaned_data = {}
                removed_total = 0

                # Read every sheet through the already-open workbook, and close
                # it before writing so the file is not overwritten while a
                # reader still holds it.
                with pd.ExcelFile(file_path) as excel_data:
                    for sheet_name in excel_data.sheet_names:
                        df = excel_data.parse(sheet_name)

                        if df.empty:
                            print(f"‚ö†Ô∏è Skipping empty sheet: {sheet_name}")
                            # Keep the sheet so it survives the rewrite.
                            cleaned_data[sheet_name] = df
                            continue

                        deduped = _drop_duplicate_rows(df)
                        removed = len(df) - len(deduped)

                        if removed > 0:
                            print(f"‚úÖ Removed {removed} duplicates from sheet: {sheet_name}")
                            removed_total += removed

                        cleaned_data[sheet_name] = deduped

                if removed_total == 0:
                    # Nothing changed: avoid a needless rewrite of the workbook.
                    print(f"No duplicates found, file left unchanged: {file_path}\n")
                    continue

                # Save cleaned data back to the same file
                with pd.ExcelWriter(file_path, engine="openpyxl", mode="w") as writer:
                    for sheet, data in cleaned_data.items():
                        data.to_excel(writer, index=False, sheet_name=sheet)

                print(f"‚úî Duplicates removed and saved: {file_path}\n")

            except Exception as e:
                # Best-effort batch job: report the failure and move on to the next file.
                print(f"‚ùå Error processing {file_path}: {e}")

    print("üéâ Duplicate removal process completed for all last-level folders.")

# Root directory whose last-level subfolders hold the Excel files to clean
main_folder_path = "C:/Automation/output/"  # Change this to your folder path

# Kick off duplicate removal across that directory tree
remove_duplicates_from_last_level_folders(main_folder=main_folder_path)