In [2]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font

def time_complexity(func):
    """Decorator that prints how long each call to ``func`` takes.

    Despite the name, this measures elapsed wall-clock time for a single
    call; it says nothing about algorithmic complexity.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time, and returns ``func``'s result.
        If ``func`` raises, the exception propagates and nothing is printed.
    """
    import functools  # local import so the decorator carries its own dependency

    # functools.wraps keeps __name__/__doc__ of the decorated function intact,
    # which matters for debugging and for any tooling that introspects it.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {elapsed_time:.2f} seconds to execute.")
        return result
    return wrapper

# Maps column names as they appear in the input files to the names used in
# the generated reports. Insertion order is the order of the report columns.
column_mapping = {
    # Columns that are renamed for the reports.
    **{
        "fulfillment_id": "orderId",
        "value": "Price",
        "optimized_cartons": "Carton_Count",
        "total_volume": "Carton_volume",
        "net_volume": "Order_volume",
    },
    # Columns that keep their original name.
    **{
        name: name
        for name in (
            "items_total",
            "items_leftover",
            "volume_utilization",
            "total_weight",
            "box_counts",
            "surface_area",
            "weight_utilization",
            "net_weight",
            "tare_weight",
            "dim_weight",
        )
    },
}

# Column layout of the consolidated report: every renamed input column, in
# report order, followed by the flag identifying the file a row came from.
consolidated_columns = [
    "orderId",
    "Price",
    "Carton_Count",
    "Carton_volume",
    "Order_volume",
    "items_total",
    "items_leftover",
    "volume_utilization",
    "total_weight",
    "box_counts",
    "surface_area",
    "weight_utilization",
    "net_weight",
    "tare_weight",
    "dim_weight",
    "source_flag",
]

# Column layout of each baseline-vs-scenario comparison sheet. For every
# metric the "<name>_baseline" column comes first, then the scenario column,
# then (where a difference is computed) the "<name>_Diff" column.
#
# The difference column names must match the ones produced by
# compute_differences exactly (case-sensitive); otherwise the sheet gets an
# empty placeholder column and the real difference is appended at the far end.
combined_columns = [
    "orderId_baseline", "orderId",
    "Price_baseline", "Price", "Price_Diff",
    "Carton_Count_baseline", "Carton_Count",
    "Carton_volume_baseline", "Carton_volume", "Carton_Volume_Diff",
    # Fixed casing: was "Order_volume_Diff", which compute_differences never fills.
    "Order_volume_baseline", "Order_volume", "Order_Volume_Diff",
    "items_total_baseline", "items_total",
    "items_leftover_baseline", "items_leftover", "items_leftover_Diff",
    "volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff",
    "total_weight_baseline", "total_weight", "total_weight_Diff",
    "box_counts_baseline", "box_counts", "box_counts_Diff",
    "surface_area_baseline", "surface_area",
    "weight_utilization_baseline", "weight_utilization",
    "net_weight_baseline", "net_weight", "tare_weight_baseline", "tare_weight",
    "dim_weight_baseline", "dim_weight",
]

# Input columns that are discarded right after a chunk is read; none of them
# appear in the generated reports.
columns_to_drop = [
    "all_packed",
    "dim_rated",
    "zone",
    "original_price",
    "optimized_price",
    "box_summary",
    "total_time",
    "item_sets",
    "pack_request",
    "pack_response",
]

# Suffix extraction: the part of the name that follows "pacsimulate_<digits>."
def refine_suffix(filename):
    """Return the scenario suffix embedded in a simulation file name.

    The name is searched for ``pacsimulate_<digits>.``; the suffix is the
    text that follows, up to (not including) the next ``.`` or the end of
    the string. For example ``"pacsimulate_2441.baseline.csv"`` yields
    ``"baseline"``.

    Returns:
        The suffix string (possibly empty when two dots are adjacent), or
        ``None`` when the name does not contain the pattern.
    """
    found = re.search(r"pacsimulate_(\d+)\.(.*?)(?:$|\.)", filename)
    return None if found is None else found.group(2)

@time_complexity
def load_and_process_file_in_chunks(file_path, chunk_size=100000):
    """Load one pipe-delimited file in chunks and normalise its columns.

    Each chunk has the columns in ``columns_to_drop`` removed, is renamed
    via ``column_mapping``, tagged with a ``source_flag`` taken from the file
    name (see ``refine_suffix``), and reindexed to ``consolidated_columns``
    so every returned frame has the same layout. Malformed lines are skipped
    by the parser.

    Args:
        file_path: Path of the file to read.
        chunk_size: Number of rows per chunk handed to ``pd.read_csv``.

    Returns:
        A single DataFrame with all chunks concatenated under a fresh
        0..n-1 index, or an empty DataFrame when the file yields no rows or
        cannot be read (the error is printed, not raised).
    """
    try:
        # The suffix depends only on the file name, so resolve it (and warn
        # about a missing one) once per file instead of once per chunk.
        suffix = refine_suffix(file_path)
        if not suffix:
            print(f"Warning: No valid suffix in file {file_path}.")

        chunks = []
        for chunk in pd.read_csv(
            file_path, delimiter='|', low_memory=False, memory_map=True,
            on_bad_lines='skip', chunksize=chunk_size
        ):
            # Drop unnecessary columns; names absent from this file are ignored.
            chunk = chunk.drop(columns=columns_to_drop, errors='ignore')

            # rename() silently ignores mapping keys that are not present,
            # so the mapping can be passed as-is.
            chunk = chunk.rename(columns=column_mapping)

            if suffix:
                chunk["source_flag"] = suffix

            # Reindex to ensure all required columns are present, in order.
            chunk = chunk.reindex(columns=consolidated_columns, fill_value=None)

            chunks.append(chunk)

        # pd.concat raises on an empty list; report the real situation instead
        # of a misleading "Failed to load" message.
        if not chunks:
            print(f"Warning: File {file_path} contains no rows.")
            return pd.DataFrame()

        return pd.concat(chunks, ignore_index=True)
    except Exception as e:
        # Best-effort loader: a bad file must not abort the whole batch.
        print(f"Failed to load file {file_path}: {e}")
        return pd.DataFrame()

def calculate_consolidated_fields(df):
    """Add the derived billing columns to ``df`` in place and return it.

    Columns added:
        * ``Dimmed``: ``'Yes'`` where ``dim_weight`` exceeds ``total_weight``,
          otherwise ``'No'``.
        * ``Billed_Weight``: the larger of the two weights, rounded up to a
          whole number. When ``dim_weight`` is not greater (including when
          either value is missing) ``total_weight`` is used.
        * ``Billed_over_Actual``: ``1`` where ``Billed_Weight`` exceeds
          ``total_weight``, otherwise ``0``.

    Args:
        df: DataFrame with numeric ``dim_weight`` and ``total_weight`` columns.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    dim_weight = df['dim_weight']
    total_weight = df['total_weight']
    is_dimmed = dim_weight > total_weight

    # Calculate Dimmed
    df['Dimmed'] = np.where(is_dimmed, 'Yes', 'No')

    # Calculate Billed Weight
    billed_weight = np.ceil(dim_weight.where(is_dimmed, total_weight))
    # Casting NaN/inf to int silently yields a meaningless huge integer, so
    # only convert when every value is finite; otherwise keep floats so the
    # missing weights stay visibly missing in the report.
    if np.isfinite(billed_weight).all():
        billed_weight = billed_weight.astype(int)
    df['Billed_Weight'] = billed_weight

    # Billed Over Actual
    df['Billed_over_Actual'] = np.where(billed_weight > total_weight, 1, 0)

    return df

@time_complexity
def compute_differences(df):
    """Fill the ``*_Diff`` columns of a combined baseline/scenario frame.

    For each (baseline, scenario, diff) triple, both source columns are
    coerced to numbers (unparseable or missing values become 0) and written
    back, then ``diff = scenario - baseline`` is stored. Triples whose source
    columns are absent are reported and skipped.

    Args:
        df: Combined DataFrame; modified in place.

    Returns:
        The same DataFrame. On an unexpected error the message is printed and
        the frame is returned in whatever state it had reached.
    """
    # (baseline column, scenario column, resulting difference column)
    diff_specs = (
        ("Price_baseline", "Price", "Price_Diff"),
        ("Order_volume_baseline", "Order_volume", "Order_Volume_Diff"),
        ("Carton_volume_baseline", "Carton_volume", "Carton_Volume_Diff"),
        ("volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff"),
        ("total_weight_baseline", "total_weight", "total_weight_Diff"),
        ("items_leftover_baseline", "items_leftover", "items_leftover_Diff"),
        ("box_counts_baseline", "box_counts", "box_counts_Diff"),
    )

    try:
        progress = tqdm(diff_specs, desc="Computing Differences", unit="file", colour="green")
        for baseline_name, scenario_name, diff_name in progress:
            sources_present = baseline_name in df.columns and scenario_name in df.columns
            if not sources_present:
                print(f"Skipping difference calculation for {diff_name}: Missing {baseline_name} or {scenario_name}")
                continue

            # Normalise both operands to numbers before subtracting.
            for source_name in (baseline_name, scenario_name):
                df[source_name] = pd.to_numeric(df[source_name], errors="coerce").fillna(0)
            df[diff_name] = df[scenario_name] - df[baseline_name]
    except Exception as e:
        print(f"Error in compute_differences: {e}")

    return df

def _format_sheet(sheet):
    """Make the header row non-bold and size every column from its header text."""
    # Header formatting
    for cell in sheet[1]:
        cell.font = Font(bold=False)
    # Column formatting: width follows the header only (not the data),
    # with a floor of 13.
    for col in sheet.columns:
        header_cell = col[0]
        header_length = len(str(header_cell.value)) if header_cell.value else 0
        sheet.column_dimensions[header_cell.column_letter].width = max(header_length + 0.5, 13)


def _append_weight_formulas(sheet, sheet_name):
    """Append 'Dimmed' and 'Billed_Weight' formula columns to a comparison sheet.

    Returns True when the columns were added, False (after printing a
    warning) when the sheet lacks the weight columns the formulas refer to.
    """
    headers = [str(cell.value).strip() for cell in sheet[1]]
    # The first header starting with each prefix is used. With the
    # combined_columns layout that is the "*_baseline" column.
    # NOTE(review): confirm the formulas are meant to read the baseline
    # weights rather than the scenario's own dim_weight/total_weight.
    dim_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("dim_weight_")), None)
    total_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("total_weight_")), None)

    if dim_weight_col_idx is None or total_weight_col_idx is None:
        print(f"Warning: Required columns for Dimmed and Billed Weight formulas not found in '{sheet_name}'. Headers: {headers}")
        return False

    # Column letters do not change per row; resolve them once.
    dim_ref = get_column_letter(dim_weight_col_idx)
    total_ref = get_column_letter(total_weight_col_idx)
    dimmed_col_letter = get_column_letter(len(headers) + 1)
    billed_weight_col_letter = get_column_letter(len(headers) + 2)

    sheet[f"{dimmed_col_letter}1"] = "Dimmed"
    sheet[f"{billed_weight_col_letter}1"] = "Billed_Weight"

    for row in range(2, sheet.max_row + 1):
        sheet[f"{dimmed_col_letter}{row}"] = (
            f"=IF({dim_ref}{row} > {total_ref}{row}, \"Yes\", \"No\")"
        )
        sheet[f"{billed_weight_col_letter}{row}"] = (
            f"=IF({dim_ref}{row} > {total_ref}{row}, "
            f"ROUNDUP({dim_ref}{row}, 0), ROUNDUP({total_ref}{row}, 0))"
        )
    return True


@time_complexity
def process_files(directory, output_directory):
    """Build the consolidated and the baseline-comparison Excel reports.

    Every regular file in ``directory`` is loaded with
    ``load_and_process_file_in_chunks``. Two workbooks are then written to
    ``output_directory``:

    * ``consolidated_output.xlsx``: all loaded rows stacked, plus the
      derived billing columns from ``calculate_consolidated_fields``.
    * ``Summary_Output_run_<run>.xlsx``: one sheet per non-baseline file
      (named by its ``refine_suffix``), holding baseline and scenario columns
      side by side, their differences, and 'Dimmed'/'Billed_Weight' formulas.

    Args:
        directory: Folder containing the input files; exactly one file is
            expected to have "baseline" in its name (the first match is used).
        output_directory: Existing folder the workbooks are written to.

    Raises:
        ValueError: If no baseline file exists, or if the baseline file
            could not be loaded / contained no data.
    """
    # Only regular files are candidates; sub-directories cannot be parsed.
    files = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]

    # Identify the baseline file (first name containing "baseline").
    baseline_file = next((name for name in files if "baseline" in name.lower()), None)
    if not baseline_file:
        raise ValueError("Baseline file not found. Ensure a file containing 'baseline' in its name exists.")

    # Load every file; frames are keyed by file name in load order.
    data_frames = {}
    with tqdm(total=len(files), desc="Loading files", unit="file", colour="blue") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            df = load_and_process_file_in_chunks(file_path, chunk_size=100000)
            if not df.empty:
                data_frames[filename] = df
            else:
                print(f"Warning: No valid data in file {filename}")
            pbar.update(1)

    # Consolidate all data into a single DataFrame and save it.
    if data_frames:
        consolidated_output = pd.concat(list(data_frames.values()), ignore_index=True)
        try:
            consolidated_output = calculate_consolidated_fields(consolidated_output)

            consolidated_output_path = os.path.join(output_directory, "consolidated_output.xlsx")
            with pd.ExcelWriter(consolidated_output_path, engine='openpyxl') as writer:
                consolidated_output.to_excel(writer, sheet_name="Consolidated_Output", index=False)
                _format_sheet(writer.sheets["Consolidated_Output"])

            print(f"Consolidated output saved at: {consolidated_output_path}")

        except Exception as e:
            # A failure here must not prevent the comparison workbook.
            print(f"Error during consolidated calculations or saving: {e}")
    else:
        print("No data to consolidate.")

    # Without baseline data there is nothing to compare against; fail with a
    # clear message instead of a bare KeyError from the dict lookup below.
    if baseline_file not in data_frames:
        raise ValueError(
            f"Baseline file '{baseline_file}' could not be loaded or contains no data."
        )

    # Extract the run # from the baseline file name.
    run_match = re.search(r"pacsimulate_(\d+)", baseline_file)
    if run_match:
        run_number = run_match.group(1)
    else:
        print(f"Warning: No run number found in baseline file name '{baseline_file}'; using 'unknown'.")
        run_number = "unknown"

    comparison_filename = f"Summary_Output_run_{run_number}.xlsx"
    comparison_path = os.path.join(output_directory, comparison_filename)

    # These do not depend on the scenario, so build them once.
    baseline_suffixed = data_frames.pop(baseline_file).add_suffix("_baseline")
    scenario_columns = [col for col in combined_columns if "_baseline" not in col]

    # Save comparison outputs to Excel, one sheet per scenario file.
    with pd.ExcelWriter(comparison_path, engine='openpyxl') as writer:
        for key, df in data_frames.items():
            sheet_name = refine_suffix(key)  # Using the refined filename as sheet name
            if not sheet_name:
                print(f"Skipping sheet for file {key} due to invalid suffix.")
                continue  # Skip empty suffix

            # Baseline and scenario rows are paired by row position (axis=1
            # concat on the frames' indexes), not matched on orderId.
            # NOTE(review): confirm both files always list orders in the
            # same sequence.
            combined_df = pd.concat(
                [baseline_suffixed, df.reindex(columns=scenario_columns)],
                axis=1
            ).reindex(columns=combined_columns)

            # Compute differences
            combined_df = compute_differences(combined_df)

            if combined_df.empty:
                print(f"Warning: Sheet {sheet_name} has no data. Skipping.")
                continue

            # Write the combined DataFrame to the corresponding sheet
            combined_df.to_excel(writer, sheet_name=sheet_name, index=False)
            sheet = writer.sheets[sheet_name]

            # Apply formulas for Dimmed and Billed Weight; a sheet without
            # the required columns is left unformatted, as before.
            if not _append_weight_formulas(sheet, sheet_name):
                continue

            _format_sheet(sheet)

        # openpyxl refuses to save a workbook that has no sheets; leave a
        # placeholder so the run still completes with an explanatory file.
        if not writer.sheets:
            print("Warning: No comparison sheets were produced.")
            pd.DataFrame().to_excel(writer, sheet_name="No_Data", index=False)

    print(f"Comparison Excel saved at {comparison_path}")

if __name__ == "__main__":
    # Placeholder locations: input folder to scan and folder for the reports.
    directory, output_directory = r"Upload Path", r"Output Path"

    # The report folder may not exist yet; creating it is harmless if it does.
    os.makedirs(output_directory, exist_ok=True)

    process_files(directory=directory, output_directory=output_directory)

Loading files:  17%|[34m███████████▏                                                       [0m| 1/6 [00:02<00:11,  2.36s/file][0m

Function 'load_and_process_file_in_chunks' took 2.36 seconds to execute.


Loading files:  33%|[34m██████████████████████▎                                            [0m| 2/6 [00:04<00:09,  2.30s/file][0m

Function 'load_and_process_file_in_chunks' took 2.26 seconds to execute.


Loading files:  50%|[34m█████████████████████████████████▌                                 [0m| 3/6 [00:06<00:06,  2.29s/file][0m

Function 'load_and_process_file_in_chunks' took 2.28 seconds to execute.


Loading files:  67%|[34m████████████████████████████████████████████▋                      [0m| 4/6 [00:09<00:04,  2.27s/file][0m

Function 'load_and_process_file_in_chunks' took 2.22 seconds to execute.


Loading files:  83%|[34m███████████████████████████████████████████████████████▊           [0m| 5/6 [00:11<00:02,  2.22s/file][0m

Function 'load_and_process_file_in_chunks' took 2.15 seconds to execute.


Loading files: 100%|[34m███████████████████████████████████████████████████████████████████[0m| 6/6 [00:13<00:00,  2.24s/file][0m

Function 'load_and_process_file_in_chunks' took 2.17 seconds to execute.





Consolidated output saved at: Output Path\consolidated_output.xlsx


Computing Differences: 100%|[32m███████████████████████████████████████████████████████████[0m| 7/7 [00:00<00:00, 24.43file/s][0m


Function 'compute_differences' took 0.29 seconds to execute.


Computing Differences: 100%|[32m███████████████████████████████████████████████████████████[0m| 7/7 [00:00<00:00, 46.82file/s][0m


Function 'compute_differences' took 0.15 seconds to execute.


Computing Differences: 100%|[32m███████████████████████████████████████████████████████████[0m| 7/7 [00:00<00:00, 40.81file/s][0m


Function 'compute_differences' took 0.17 seconds to execute.


Computing Differences: 100%|[32m███████████████████████████████████████████████████████████[0m| 7/7 [00:00<00:00, 38.88file/s][0m


Function 'compute_differences' took 0.18 seconds to execute.


Computing Differences: 100%|[32m███████████████████████████████████████████████████████████[0m| 7/7 [00:00<00:00, 41.17file/s][0m


Function 'compute_differences' took 0.17 seconds to execute.
Comparison Excel saved at Output Path\Summary_Output_run_2441.xlsx
Function 'process_files' took 1271.31 seconds to execute.
