In [None]:
!pip install pandas tqdm openpyxl

In [5]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font

# Decorator to measure execution time
def time_complexity(func):
    """Wrap ``func`` so that every successful call prints its elapsed time.

    Despite the name, this measures wall-clock duration of a call, not
    algorithmic complexity.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Function '<name>' took <seconds> seconds to execute.``
        and returns ``func``'s result. Nothing is printed if ``func`` raises.
    """
    # Local import so this block does not depend on a new file-level import.
    from functools import wraps

    @wraps(func)  # keep the wrapped function's __name__ / __doc__ for debugging
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot go negative or
        # jump if the system clock is adjusted during the call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function '{func.__name__}' took {end_time - start_time:.2f} seconds to execute.")
        return result
    return wrapper

# Column mappings
# Keys are column names as they appear in the input files; values are the
# names used in the output. load_and_process_file_in_chunks applies this via
# DataFrame.rename. Most entries map a name to itself; the ones that actually
# rename are fulfillment_id, ref_id, cost, total_volume and net_volume.
column_mapping = {
    "fulfillment_id": "orderId",
    "ref_id": "refId",
    "index": "index",
    "name": "name",
    "dimensions": "dimensions",
    "cost": "Price",
    "base_cost": "base_cost",
    "total_volume": "Carton_volume",
    "net_volume": "Order_volume",
    "volume_utilization": "volume_utilization",
    "surface_area": "surface_area",
    "total_weight": "total_weight",
    "net_weight": "net_weight",
    "tare_weight": "tare_weight",
    "weight_utilization": "weight_utilization",
    "item_count": "item_count",
    "dim_weight": "dim_weight",
}

# Columns for consolidated output
# Every loaded chunk is reindexed to exactly this list (in this order), so
# columns missing from a file are created empty and any other columns are
# discarded. "source_flag" is not read from the file; the loader adds it.
consolidated_columns = [
    "orderId", "refId", "index", "name", "dimensions", "Price", "base_cost",
    "Carton_volume", "Order_volume", "volume_utilization", "surface_area",
    "total_weight", "net_weight", "tare_weight", "weight_utilization",
    "dim_weight", "item_count", "source_flag"
]

# Combined sheet column order
# Layout of each comparison sheet: for every field the "<field>_baseline"
# column is followed by the comparison file's "<field>" column. The "*_Diff"
# names are placeholders at this point; compute_differences fills them in.
# Names without "_baseline" are also used to select the comparison columns.
combined_columns = [
    "orderId_baseline", "orderId", "refId_baseline", "refId",
    "index_baseline", "index", "name_baseline", "name",
    "dimensions_baseline", "dimensions", "Price_baseline", "Price", "Price_Diff",
    "base_cost_baseline", "base_cost", "Carton_volume_baseline", "Carton_volume", "Carton_Volume_Diff",
    "Order_volume_baseline", "Order_volume", "Order_Volume_Diff", "volume_utilization_baseline", "volume_utilization", 
    "volume_utilization_Diff", "surface_area_baseline", "surface_area", "total_weight_baseline", "total_weight", "total_weight_Diff",
    "net_weight_baseline", "net_weight", "tare_weight_baseline", "tare_weight",
    "weight_utilization_baseline", "weight_utilization", "dim_weight_baseline", "dim_weight",
    "item_count_baseline", "item_count", "Item_Diff"
]

# Columns to drop to save memory (only if truly necessary)
# Dropped from every chunk right after it is read; names that are not present
# in a file are ignored (the loader passes errors='ignore').
columns_to_drop = ["item_summary"]  # Ensure this is correct; remove if you need it

def refine_suffix(filename):
    """Derive a short label for a file from its name.

    The last extension is removed with ``os.path.splitext``. If what remains
    contains ``pacsimulate_``, everything after that prefix is returned
    unchanged (e.g. ``pacsimulate_2441.baseline.csv`` -> ``2441.baseline``).
    Otherwise the whole stem is returned with ``.`` and ``-`` replaced by
    ``_``.

    Args:
        filename: File name (not a full path is required, but one is accepted).

    Returns:
        The label string used as the data-frame key, ``source_flag`` value and
        (after ``clean_sheet_name``) sheet name.
    """
    # Remove the file extension (only the last dotted component is stripped)
    name_without_extension = os.path.splitext(filename)[0]

    # Match and extract only the part after 'pacsimulate_'.
    # The capture group must accept any characters: the previous pattern
    # `(_+)` could only capture a run of underscores, so the prefix was never
    # actually stripped.
    match = re.search(r"pacsimulate_(.+)", name_without_extension)
    if match:
        return match.group(1)  # Return the extracted portion

    # Fallback: no 'pacsimulate_' prefix, normalise separators instead
    return name_without_extension.replace(".", "_").replace("-", "_")

def clean_sheet_name(name):
    """Cleans up sheet names to ensure they are Excel-compatible.

    Spaces, dashes and dots are replaced with underscores (as before), and so
    are the characters Excel forbids in sheet titles: ``[ ] : * ? / \\``.
    The result is truncated to 31 characters, Excel's sheet-name limit.

    Args:
        name: Proposed sheet name.

    Returns:
        The sanitised name, at most 31 characters long. Note that two long
        names sharing the same first 31 characters will collide.
    """
    cleaned_name = re.sub(r"[ \-.\[\]:*?/\\]", "_", name)
    return cleaned_name[:31]

@time_complexity
def load_and_process_file_in_chunks(file_path, suffix, chunk_size=100000):
    """Loads a file in chunks, processes it, and returns a DataFrame.

    The file is read as pipe-delimited text, skipping malformed lines. Each
    chunk has ``columns_to_drop`` removed, is renamed via ``column_mapping``,
    gets a ``source_flag`` column set to ``suffix`` and is reindexed to
    ``consolidated_columns``.

    Args:
        file_path: Path of the file to read.
        suffix: Label stored in the ``source_flag`` column of every row.
        chunk_size: Number of rows read per chunk.

    Returns:
        One DataFrame with all chunks concatenated. An empty DataFrame is
        returned when the file yields no rows or when anything goes wrong
        while reading (the error is printed, not raised — this loader is
        deliberately best-effort so one bad file does not stop the run).
    """
    try:
        chunks = []
        # Using the reader as a context manager closes the underlying file
        # even if processing a chunk raises part-way through.
        with pd.read_csv(
            file_path, delimiter='|', low_memory=False, memory_map=True,
            on_bad_lines='skip', chunksize=chunk_size
        ) as reader:
            for chunk in reader:
                # Drop unnecessary columns
                if columns_to_drop:
                    chunk = chunk.drop(columns=columns_to_drop, errors='ignore')

                # Rename columns based on mapping (rename ignores keys that
                # are not present, so no pre-filtering is needed)
                chunk = chunk.rename(columns=column_mapping)

                # Add source_flag
                chunk["source_flag"] = suffix

                # Reindex to ensure all required columns are present
                chunk = chunk.reindex(columns=consolidated_columns, fill_value=None)

                chunks.append(chunk)

        # pd.concat([]) raises; a file with no data rows is not a load error.
        if not chunks:
            return pd.DataFrame()
        return pd.concat(chunks, ignore_index=True)
    except Exception as e:
        print(f"Failed to load file {file_path}: {e}")
        return pd.DataFrame()

def calculate_consolidated_fields(df):
    """Add derived billing and geometry columns to ``df`` and return it.

    ``df`` is modified in place. It must contain the columns ``dim_weight``,
    ``total_weight`` and ``dimensions``; the latter is split on commas into
    three values.

    Columns added:
        Dimmed: ``'Yes'`` where dim_weight > total_weight, else ``'No'``
            (rows with a missing weight compare as ``'No'``).
        Billed_Weight: the larger-by-that-rule weight rounded up to a whole
            number. Plain integers when every row has a value; otherwise a
            nullable ``Int64`` column with ``<NA>`` for rows lacking weights.
        L, W, H: numeric parts of ``dimensions``; NaN where a part is
            missing or not a number.
        SA: ``2*(L*W + L*H + W*H) + 2*W**2``.

    Args:
        df: DataFrame to extend.

    Returns:
        The same DataFrame object, with the columns above added.
    """
    # Coerce for the comparison only; the source columns are left untouched.
    dim_weight = pd.to_numeric(df['dim_weight'], errors='coerce')
    total_weight = pd.to_numeric(df['total_weight'], errors='coerce')
    is_dimmed = dim_weight > total_weight

    # Calculate Dimmed
    df['Dimmed'] = np.where(is_dimmed, 'Yes', 'No')

    # Calculate Billed Weight
    billed = pd.Series(
        np.where(is_dimmed, np.ceil(dim_weight), np.ceil(total_weight)),
        index=df.index,
    )
    if billed.notna().all():
        df['Billed_Weight'] = billed.astype(int)
    else:
        # Casting NaN to a plain int yields a meaningless number; the
        # nullable integer dtype keeps missing rows visibly missing.
        df['Billed_Weight'] = billed.astype('Int64')

    # Split dimensions into L, W, H (astype(str) so missing or non-string
    # entries cannot break the .str accessor)
    dimensions_split = df['dimensions'].astype(str).str.split(',', expand=True)

    # Validate that the split resulted in exactly three parts
    if dimensions_split.shape[1] != 3:
        print("Warning: 'dimensions' column does not split into exactly three parts (L,W,H). Filling with NaN.")
        dimensions_split = dimensions_split.reindex(columns=[0, 1, 2], fill_value=np.nan)

    # Assign to new columns. Columns created by the reindex above hold float
    # NaN, so each part is converted to text before stripping whitespace.
    for position, column in enumerate(('L', 'W', 'H')):
        part = dimensions_split[position].astype(str).str.strip()
        df[column] = pd.to_numeric(part, errors='coerce')

    # Calculate Surface Area (SA)
    # NOTE(review): the extra 2*W**2 on top of the plain box surface is kept
    # from the original formula — presumably an allowance for carton flaps;
    # confirm with the owner of the metric.
    df['SA'] = 2 * (df['L'] * df['W'] + df['L'] * df['H'] + df['W'] * df['H']) + 2 * (df['W'] ** 2)

    return df

@time_complexity
def compute_differences(df):
    """Fill the ``*_Diff`` columns of a combined baseline/comparison frame.

    For each (baseline column, comparison column, diff column) triple, both
    source columns are converted to numbers in place (unparseable or missing
    values become 0) and the diff column is set to comparison minus baseline.
    A triple is skipped with a message when either source column is absent.

    Args:
        df: Combined DataFrame; modified in place.

    Returns:
        ``df`` itself. If an error occurs it is printed and ``df`` is
        returned in whatever state it had reached.
    """
    # (baseline column, comparison column, resulting difference column)
    diff_specs = (
        ("Price_baseline", "Price", "Price_Diff"),
        ("Order_volume_baseline", "Order_volume", "Order_Volume_Diff"),
        ("item_count_baseline", "item_count", "Item_Diff"),
        ("Carton_volume_baseline", "Carton_volume", "Carton_Volume_Diff"),
        ("volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff"),
        ("total_weight_baseline", "total_weight", "total_weight_Diff"),
    )

    try:
        progress = tqdm(diff_specs, desc="Computing Differences", unit="file", colour="green")
        for baseline_name, current_name, diff_name in progress:
            both_present = baseline_name in df.columns and current_name in df.columns
            if not both_present:
                print(f"Skipping difference calculation for {diff_name}: Missing {baseline_name} or {current_name}")
                continue

            # Normalise both sides to numbers, treating blanks/garbage as 0.
            df[baseline_name] = pd.to_numeric(df[baseline_name], errors="coerce").fillna(0)
            df[current_name] = pd.to_numeric(df[current_name], errors="coerce").fillna(0)
            df[diff_name] = df[current_name] - df[baseline_name]
    except Exception as e:
        print(f"Error in compute_differences: {e}")

    return df

def _add_billing_formulas(sheet, sheet_name):
    """Append 'Dimmed' and 'Billed_Weight' Excel-formula columns to ``sheet``.

    The formulas reference the first header starting with ``dim_weight_`` and
    the first starting with ``total_weight_``. If either is missing a warning
    is printed and the sheet is left unchanged.
    """
    headers = [str(cell.value).strip() for cell in sheet[1]]
    # NOTE(review): with the combined_columns layout the first matches are
    # 'dim_weight_baseline' and 'total_weight_baseline', i.e. the formulas
    # describe the baseline rows — confirm that is the intent.
    dim_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("dim_weight_")), None)
    total_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("total_weight_")), None)

    if dim_weight_col_idx is None or total_weight_col_idx is None:
        print(f"Warning: Required columns for Dimmed and Billed Weight formulas not found in '{sheet_name}'. Headers: {headers}")
        return

    # Column letters do not change per row, so resolve them once.
    dim_col = get_column_letter(dim_weight_col_idx)
    total_col = get_column_letter(total_weight_col_idx)
    last_column = len(headers)
    dimmed_col_letter = get_column_letter(last_column + 1)
    billed_weight_col_letter = get_column_letter(last_column + 2)

    sheet[f"{dimmed_col_letter}1"] = "Dimmed"
    sheet[f"{billed_weight_col_letter}1"] = "Billed_Weight"

    for row in range(2, sheet.max_row + 1):
        dim_cell = f"{dim_col}{row}"
        total_cell = f"{total_col}{row}"
        sheet[f"{dimmed_col_letter}{row}"] = f'=IF({dim_cell} > {total_cell}, "Yes", "No")'
        sheet[f"{billed_weight_col_letter}{row}"] = (
            f"=IF({dim_cell} > {total_cell}, "
            f"ROUNDUP({dim_cell}, 0), ROUNDUP({total_cell}, 0))"
        )


def _format_sheet(sheet):
    """Un-bold the header row and size each column to its longest value."""
    # Non-bold headers
    for cell in sheet[1]:
        cell.font = Font(bold=False)

    for col in sheet.columns:
        # Empty/falsy cells are ignored when measuring, as before.
        max_length = max((len(str(cell.value)) for cell in col if cell.value), default=0)
        sheet.column_dimensions[col[0].column_letter].width = max_length + 0.5  # Adding padding


@time_complexity
def process_files(directory, output_directory):
    """Load every file in ``directory`` and write a baseline-comparison workbook.

    The first directory entry whose name contains "baseline" (case-insensitive)
    is the baseline. Every other file that loads with data gets its own sheet
    in ``Carton_Output_run_<run>.xlsx`` inside ``output_directory``, holding
    baseline and comparison columns side by side, the computed ``*_Diff``
    columns and two formula columns (Dimmed, Billed_Weight).

    Baseline and comparison rows are paired purely by row position.
    NOTE(review): this assumes every file lists the same orders in the same
    order — confirm, or join on an order key instead.

    Args:
        directory: Folder containing the pipe-delimited input files.
        output_directory: Existing folder where the workbook is written.

    Returns:
        None.

    Raises:
        ValueError: If no baseline file exists in ``directory``, or if the
            baseline file could not be loaded / contained no rows.
    """
    files = os.listdir(directory)

    # Identify the baseline file
    baseline_file = next((name for name in files if "baseline" in name.lower()), None)
    if not baseline_file:
        raise ValueError("Baseline file not found. Ensure a file containing 'baseline' in its name exists.")

    data_frames = {}
    consolidated_data = []

    # Process each file
    with tqdm(total=len(files), desc="Loading files", unit="file", colour="blue") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            file_suffix = refine_suffix(filename)
            df = load_and_process_file_in_chunks(file_path, file_suffix, chunk_size=100000)
            if not df.empty:
                data_frames[file_suffix] = df  # Store for later use
                consolidated_data.append(df)    # Append processed data
            else:
                print(f"Warning: No valid data in file {filename}")
            pbar.update(1)

    # Consolidate data into a single DataFrame
    # NOTE(review): the consolidated frame is computed but not written
    # anywhere; kept as-is so existing console output is unchanged — either
    # export it or remove this step.
    if consolidated_data:
        consolidated_output = pd.concat(consolidated_data, ignore_index=True)
    else:
        print("No data to consolidate.")
        consolidated_output = pd.DataFrame(columns=consolidated_columns)
    try:
        consolidated_output = calculate_consolidated_fields(consolidated_output)
    except Exception as e:
        print(f"Error during consolidated calculations: {e}")

    # Extract the run# from the baseline file name; fall back to a fixed
    # label rather than crashing when the name carries no run number.
    run_match = re.search(r"pacsimulate_(\d+)", baseline_file)
    run_number = run_match.group(1) if run_match else "unknown"

    # Define the new output file name
    comparison_filename = f"Carton_Output_run_{run_number}.xlsx"
    comparison_path = os.path.join(output_directory, comparison_filename)

    # The baseline is only in data_frames if it loaded with at least one row.
    baseline_key = refine_suffix(baseline_file)
    if baseline_key not in data_frames:
        raise ValueError(f"Baseline file '{baseline_file}' could not be loaded or contains no data.")
    # Suffix the baseline columns once instead of once per comparison file.
    baseline_df = data_frames.pop(baseline_key).add_suffix("_baseline")

    # A workbook cannot be saved without any sheet, so stop early.
    if not data_frames:
        print("No comparison files with data were found; no workbook written.")
        return

    # Non-baseline column names, in sheet order (includes the *_Diff slots)
    comparison_columns = [col for col in combined_columns if "_baseline" not in col]

    # Save comparison outputs to Excel
    with pd.ExcelWriter(comparison_path, engine='openpyxl') as writer:
        for key, df in data_frames.items():
            sheet_name = clean_sheet_name(key)

            # Align comparison DataFrame columns with combined_columns
            comparison_df = df.reindex(columns=comparison_columns)

            # Combine baseline and comparison data
            combined_df = pd.concat(
                [baseline_df, comparison_df],
                axis=1
            ).reindex(columns=combined_columns)

            # Compute differences
            combined_df = compute_differences(combined_df)

            # Write the combined DataFrame to the corresponding sheet
            combined_df.to_excel(writer, sheet_name=sheet_name, index=False)
            sheet = writer.sheets[sheet_name]

            # Apply formulas for Dimmed and Billed Weight, then format. The
            # formatting runs even if the formula columns could not be added.
            _add_billing_formulas(sheet, sheet_name)
            _format_sheet(sheet)

    print(f"Comparison Excel saved at {comparison_path}")

if __name__ == "__main__":
    # NOTE(review): these look like placeholder paths — point them at the
    # real input folder and output folder before running.
    directory = r"Upload Path"
    output_directory = r"Output Path"
    # Create the output folder up front; no error if it already exists.
    os.makedirs(output_directory, exist_ok=True)
    process_files(directory, output_directory)

Loading files:  17%|[34m███████████▏                                                       [0m| 1/6 [00:00<00:03,  1.63file/s][0m

Function 'load_and_process_file_in_chunks' took 0.61 seconds to execute.


Loading files:  33%|[34m██████████████████████▎                                            [0m| 2/6 [00:01<00:02,  1.58file/s][0m

Function 'load_and_process_file_in_chunks' took 0.65 seconds to execute.


Loading files:  50%|[34m█████████████████████████████████▌                                 [0m| 3/6 [00:01<00:01,  1.58file/s][0m

Function 'load_and_process_file_in_chunks' took 0.63 seconds to execute.


Loading files:  67%|[34m████████████████████████████████████████████▋                      [0m| 4/6 [00:02<00:01,  1.43file/s][0m

Function 'load_and_process_file_in_chunks' took 0.80 seconds to execute.


Loading files:  83%|[34m███████████████████████████████████████████████████████▊           [0m| 5/6 [00:03<00:00,  1.43file/s][0m

Function 'load_and_process_file_in_chunks' took 0.69 seconds to execute.


Loading files: 100%|[34m███████████████████████████████████████████████████████████████████[0m| 6/6 [00:04<00:00,  1.47file/s][0m

Function 'load_and_process_file_in_chunks' took 0.69 seconds to execute.



Computing Differences: 100%|[32m██████████████████████████████████████████████████████████[0m| 6/6 [00:00<00:00, 433.18file/s][0m


Function 'compute_differences' took 0.02 seconds to execute.


Computing Differences: 100%|[32m██████████████████████████████████████████████████████████[0m| 6/6 [00:00<00:00, 386.42file/s][0m


Function 'compute_differences' took 0.02 seconds to execute.


Computing Differences: 100%|[32m██████████████████████████████████████████████████████████[0m| 6/6 [00:00<00:00, 461.55file/s][0m


Function 'compute_differences' took 0.02 seconds to execute.


Computing Differences: 100%|[32m██████████████████████████████████████████████████████████[0m| 6/6 [00:00<00:00, 600.10file/s][0m


Function 'compute_differences' took 0.01 seconds to execute.


Computing Differences: 100%|[32m██████████████████████████████████████████████████████████[0m| 6/6 [00:00<00:00, 326.88file/s][0m


Function 'compute_differences' took 0.02 seconds to execute.
Comparison Excel saved at Output Path\Carton_Output_run_2441.xlsx
Function 'process_files' took 502.59 seconds to execute.
