In [None]:
!pip install pandas tqdm openpyxl

In [None]:
import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font
from openpyxl.styles import NamedStyle

def time_complexity(func):
    """Decorator that reports how long each call to ``func`` takes.

    Despite the name, this measures wall-clock execution time, not
    algorithmic complexity. After every successful call it prints
    ``Function '<name>' took <seconds> seconds to execute.`` and passes the
    wrapped function's return value through unchanged. If ``func`` raises,
    the exception propagates and nothing is printed.
    """
    # Imported locally so the decorator does not rely on a file-level import.
    from functools import wraps

    @wraps(func)  # keep func's __name__ / __doc__ visible on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement cannot go negative
        # or jump if the system clock is adjusted mid-call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' took {elapsed:.2f} seconds to execute.")
        return result

    return wrapper

# Source column name -> output column name, applied to every loaded file.
column_mapping = {
    # identifiers
    "fulfillment_id": "orderId",
    "ref_id": "refId",
    "index": "index",
    "name": "name",
    "dimensions": "dimensions",
    # cost
    "cost": "Price",
    "base_cost": "base_cost",
    # volume
    "total_volume": "Carton_volume",
    "net_volume": "Order_volume",
    "volume_utilization": "volume_utilization",
    "surface_area": "surface_area",
    # weight
    "total_weight": "total_weight",
    "net_weight": "net_weight",
    "tare_weight": "tare_weight",
    "weight_utilization": "weight_utilization",
    # counts / dimensional weight
    "item_count": "item_count",
    "dim_weight": "dim_weight",
}

# Column order of the consolidated output (one row per carton, all files).
consolidated_columns = [
    "orderId",
    "refId",
    "index",
    "name",
    "dimensions",
    "Price",
    "base_cost",
    "Carton_volume",
    "Order_volume",
    "volume_utilization",
    "surface_area",
    "total_weight",
    "net_weight",
    "tare_weight",
    "weight_utilization",
    "dim_weight",
    "item_count",
    "source_flag",
]

# Column order of each comparison sheet: every field appears as
# "<field>_baseline" followed by "<field>", with a "*_Diff" column after the
# fields that get a computed difference.
combined_columns = [
    "orderId_baseline", "orderId",
    "refId_baseline", "refId",
    "index_baseline", "index",
    "name_baseline", "name",
    "dimensions_baseline", "dimensions",
    "Price_baseline", "Price", "Price_Diff",
    "base_cost_baseline", "base_cost",
    "Carton_volume_baseline", "Carton_volume", "Carton_volume_Diff",
    "Order_volume_baseline", "Order_volume", "Order_volume_Diff",
    "volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff",
    "surface_area_baseline", "surface_area",
    "total_weight_baseline", "total_weight", "total_weight_Diff",
    "net_weight_baseline", "net_weight",
    "tare_weight_baseline", "tare_weight",
    "weight_utilization_baseline", "weight_utilization",
    "dim_weight_baseline", "dim_weight",
    "item_count_baseline", "item_count", "Item_Diff",
]

# Columns dropped right after reading to save memory.
columns_to_drop = ["item_summary"]

# Suffix extraction: the text that follows "pacsimulate_<digits>." up to the
# next period (or the end of the string).
def refine_suffix(filename):
    """Return the token after 'pacsimulate_<digits>.' in ``filename``.

    The token ends at the next '.' or at the end of the string and may be
    empty. Returns None when the 'pacsimulate_<digits>.' prefix is absent.
    """
    found = re.search(r"pacsimulate_(\d+)\.(.*?)(?:$|\.)", filename)
    return found.group(2) if found else None

@time_complexity
def load_and_process_file_in_chunks(file_path, chunk_size=100000):
    """Load a pipe-delimited file in chunks and normalise its columns.

    Each chunk has ``columns_to_drop`` removed, is renamed via
    ``column_mapping``, gets a ``source_flag`` column holding the suffix
    extracted from the file name, and is reindexed to
    ``consolidated_columns`` (missing columns are filled with NaN, extra
    columns are discarded). Malformed lines are skipped by the parser.

    Args:
        file_path: Path of the file to read.
        chunk_size: Number of rows per chunk handed to ``pd.read_csv``.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when the file
        cannot be read or yields no chunks (a message is printed).
    """
    # The suffix depends only on the file name, so compute it (and warn) once
    # instead of once per chunk. Only the base name is inspected so that a
    # directory component cannot accidentally match the pattern.
    suffix = refine_suffix(os.path.basename(file_path))
    if not suffix:
        print(f"Warning: No valid suffix in file {file_path}.")

    try:
        chunks = []
        for chunk in pd.read_csv(
            file_path, delimiter='|', low_memory=False, memory_map=True,
            on_bad_lines='skip', chunksize=chunk_size
        ):
            # Drop unnecessary columns
            if columns_to_drop:
                chunk = chunk.drop(columns=columns_to_drop, errors='ignore')

            # rename() ignores mapping keys that are not present in the chunk
            chunk = chunk.rename(columns=column_mapping)

            if suffix:
                chunk["source_flag"] = suffix

            # Reindex to ensure all required columns are present
            chunk = chunk.reindex(columns=consolidated_columns, fill_value=None)

            chunks.append(chunk)

        if not chunks:
            # pd.concat([]) would raise; report the real cause instead.
            print(f"Failed to load file {file_path}: no data rows found")
            return pd.DataFrame()

        return pd.concat(chunks, ignore_index=True)
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable file must not stop
        # the remaining files from loading.
        print(f"Failed to load file {file_path}: {e}")
        return pd.DataFrame()

def _split_part_to_number(part):
    """Convert one split 'dimensions' column to numbers (NaN where invalid)."""
    try:
        part = part.str.strip()
    except AttributeError:
        # .str is unavailable on non-string columns (e.g. entirely NaN).
        pass
    return pd.to_numeric(part, errors='coerce')


def calculate_consolidated_fields(df):
    """Add billing and dimension columns to ``df`` in place and return it.

    Columns written:
        Dimmed: 'Yes' where dim_weight > total_weight, otherwise 'No'.
        Billed_Weight: ceiling of the larger weight as selected by the same
            comparison. Integer dtype when every row has a value; left as
            float with NaN when a row has no usable weight, because NaN
            cannot be represented as an integer.
        total_weight: overwritten with its ceiling.
        Billed_over_Actual: Billed_Weight minus the rounded total_weight,
            floored at 0.
        L, W, H: the first three comma-separated parts of 'dimensions' as
            numbers; NaN where a part is missing or not numeric.

    Args:
        df: DataFrame with 'dim_weight', 'total_weight' and 'dimensions'
            columns.

    Returns:
        The same DataFrame object, with the columns above added/updated.
    """
    # Coerce first so stray non-numeric values become NaN instead of raising.
    dim_weight = pd.to_numeric(df['dim_weight'], errors='coerce')
    total_weight = pd.to_numeric(df['total_weight'], errors='coerce')
    is_dimmed = (dim_weight > total_weight).to_numpy()

    df['Dimmed'] = np.where(is_dimmed, 'Yes', 'No')

    billed = np.ceil(np.where(is_dimmed, dim_weight, total_weight))
    if not np.isnan(billed).any():
        # Casting NaN to int yields meaningless values, so only cast when safe.
        billed = billed.astype(int)
    df['Billed_Weight'] = billed

    # Actual weight is rounded up before comparing it with the billed weight.
    df['total_weight'] = np.ceil(total_weight)
    overage = df['Billed_Weight'] - df['total_weight']
    df['Billed_over_Actual'] = np.where(overage > 0, overage, 0)

    # Split dimensions into L, W, H
    try:
        parts = df['dimensions'].str.split(',', expand=True)
    except AttributeError:
        # Column holds no strings at all (e.g. entirely NaN): nothing to split.
        parts = pd.DataFrame(index=df.index)

    if parts.shape[1] != 3:
        print("Warning: 'dimensions' column does not split into exactly three parts (L,W,H). Filling with NaN.")

    for position, label in enumerate(('L', 'W', 'H')):
        if position in parts.columns:
            df[label] = _split_part_to_number(parts[position])
        else:
            df[label] = np.nan

    return df

@time_complexity
def compute_differences(df):
    """Fill the '*_Diff' columns of ``df`` as comparison minus baseline.

    For each (baseline, comparison, diff) triple whose two input columns
    exist, both inputs are coerced to numbers (invalid or missing values
    become 0) and written back, then the difference is stored. Triples with
    a missing input are reported and skipped. Any exception is printed and
    ``df`` is returned in whatever state it reached.
    """
    # (baseline column, comparison column, difference column)
    diff_specs = [
        ("Price_baseline", "Price", "Price_Diff"),
        ("Order_volume_baseline", "Order_volume", "Order_volume_Diff"),
        ("item_count_baseline", "item_count", "Item_Diff"),
        ("Carton_volume_baseline", "Carton_volume", "Carton_volume_Diff"),
        ("volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff"),
        ("total_weight_baseline", "total_weight", "total_weight_Diff"),
    ]

    try:
        progress = tqdm(
            diff_specs, desc="Computing Differences", unit="file", colour="green"
        )
        for baseline_name, current_name, diff_name in progress:
            if baseline_name not in df.columns or current_name not in df.columns:
                print(f"Skipping difference calculation for {diff_name}: Missing {baseline_name} or {current_name}")
                continue

            df[baseline_name] = pd.to_numeric(df[baseline_name], errors="coerce").fillna(0)
            df[current_name] = pd.to_numeric(df[current_name], errors="coerce").fillna(0)
            df[diff_name] = df[current_name] - df[baseline_name]
    except Exception as e:
        print(f"Error in compute_differences: {e}")

    return df

def apply_dynamic_column_formats(sheet):
    """Set Excel number formats on ``sheet`` columns based on header text.

    The first cell of each column is treated as its header (whitespace
    stripped). Columns whose header is one of the price / volume /
    surface-area / dim-weight names get the '0.00' format; utilization
    columns get '0.00%'. The format is applied to every cell of the column,
    header cell included. Other columns are left untouched.

    Args:
        sheet: Worksheet-like object exposing ``columns`` (an iterable of
            cell sequences whose cells have ``value`` and ``number_format``).
    """
    two_decimal_headers = {
        "Price_baseline", "Price", "Price_Diff",
        "Carton_volume_baseline", "Carton_volume", "Carton_volume_Diff",
        "Order_volume_baseline", "Order_volume", "Order_volume_Diff",
        "surface_area_baseline", "surface_area",
        "dim_weight_baseline", "dim_weight",
    }
    percentage_headers = {
        "volume_utilization_baseline", "volume_utilization", "volume_utilization_Diff",
        "weight_utilization_baseline", "weight_utilization",
    }

    for col in sheet.columns:
        header = col[0].value
        column_name = str(header).strip() if header else ""

        if column_name in two_decimal_headers:
            number_format = '0.00'
        elif column_name in percentage_headers:
            number_format = '0.00%'
        else:
            continue  # not a column this function formats

        for cell in col:
            cell.number_format = number_format

def _unbold_header_and_set_widths(sheet):
    """Clear bold from the header row and size each column from its header (minimum width 13)."""
    for cell in sheet[1]:
        cell.font = Font(bold=False)
    for col in sheet.columns:
        header = col[0].value
        header_length = len(str(header)) if header else 0
        sheet.column_dimensions[col[0].column_letter].width = max(header_length + 0.5, 13)


def _append_dimmed_and_billed_formulas(sheet, sheet_name):
    """Append 'Dimmed' and 'Billed_Weight' formula columns after the last header of ``sheet``."""
    headers = [str(cell.value).strip() for cell in sheet[1]]
    # NOTE(review): the first headers with these prefixes in combined_columns
    # are 'dim_weight_baseline' and 'total_weight_baseline', so the formulas
    # reference the baseline columns on every sheet — confirm this is intended.
    dim_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("dim_weight_")), None)
    total_weight_col_idx = next((i + 1 for i, h in enumerate(headers) if h.startswith("total_weight_")), None)

    if dim_weight_col_idx is None or total_weight_col_idx is None:
        print(f"Warning: Required columns for Dimmed and Billed Weight formulas not found in '{sheet_name}'. Headers: {headers}")
        return

    dim_letter = get_column_letter(dim_weight_col_idx)
    total_letter = get_column_letter(total_weight_col_idx)
    dimmed_col_letter = get_column_letter(len(headers) + 1)
    billed_weight_col_letter = get_column_letter(len(headers) + 2)

    sheet[f"{dimmed_col_letter}1"] = "Dimmed"
    sheet[f"{billed_weight_col_letter}1"] = "Billed_Weight"

    for row in range(2, sheet.max_row + 1):
        dim_ref = f"{dim_letter}{row}"
        total_ref = f"{total_letter}{row}"
        sheet[f"{dimmed_col_letter}{row}"] = f"=IF({dim_ref} > {total_ref}, \"Yes\", \"No\")"
        sheet[f"{billed_weight_col_letter}{row}"] = (
            f"=IF({dim_ref} > {total_ref}, "
            f"ROUNDUP({dim_ref}, 0), ROUNDUP({total_ref}, 0))"
        )


@time_complexity
def process_files(directory, output_directory):
    """Load every file in ``directory`` and write two Excel workbooks.

    1. ``consolidated_output.xlsx``: all loaded files stacked together with
       the extra fields from ``calculate_consolidated_fields``. Errors while
       computing or saving it are printed and do not stop the run.
    2. ``combined_output.xlsx``: one sheet per non-baseline file (named by
       its ``refine_suffix`` result), placing baseline and comparison
       columns side by side by row position, with difference columns,
       number formats and Dimmed / Billed_Weight formulas. It is not
       written when no file yields a valid sheet name.

    Args:
        directory: Folder containing the input files; the first file (in
            sorted order) whose name contains 'baseline' is the baseline.
        output_directory: Existing folder that receives both workbooks.

    Raises:
        ValueError: If no baseline file exists, or if the baseline file
            could not be loaded / contained no data.
    """
    # Sorted so baseline choice and sheet order do not depend on the OS
    # listing order; sub-directories are skipped since they cannot be parsed.
    files = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )

    # Identify the baseline file
    baseline_file = next((name for name in files if "baseline" in name.lower()), None)
    if not baseline_file:
        raise ValueError("Baseline file not found. Ensure a file containing 'baseline' in its name exists.")

    # Load files (insertion order of the dict is the load order)
    data_frames = {}
    with tqdm(total=len(files), desc="Loading files", unit="file", colour="blue") as pbar:
        for filename in files:
            file_path = os.path.join(directory, filename)
            df = load_and_process_file_in_chunks(file_path, chunk_size=100000)
            if not df.empty:
                data_frames[filename] = df
            else:
                print(f"Warning: No valid data in file {filename}")
            pbar.update(n=1)

    # Consolidate all data into a single DataFrame
    if data_frames:
        consolidated_output = pd.concat(list(data_frames.values()), ignore_index=True)
        try:
            consolidated_output = calculate_consolidated_fields(consolidated_output)

            consolidated_output_path = os.path.join(output_directory, "consolidated_output.xlsx")
            with pd.ExcelWriter(consolidated_output_path, engine='openpyxl') as writer:
                consolidated_output.to_excel(writer, sheet_name="Consolidated_Output", index=False)
                _unbold_header_and_set_widths(writer.sheets["Consolidated_Output"])

            print(f"Consolidated output saved at: {consolidated_output_path}")
        except Exception as e:
            # Best-effort: the comparison workbook is still attempted below.
            print(f"Error during consolidated calculations or saving: {e}")
    else:
        print("No data to consolidate.")

    # Fail with a clear message instead of an opaque KeyError from pop().
    if baseline_file not in data_frames:
        raise ValueError(
            f"Baseline file '{baseline_file}' could not be loaded or contains no data."
        )

    # Both values are the same for every sheet, so build them once.
    baseline_df = data_frames.pop(baseline_file).add_suffix("_baseline")
    comparison_columns = [col for col in combined_columns if "_baseline" not in col]

    # Resolve sheet names first so an empty workbook is never opened
    # (saving a workbook without sheets raises).
    sheets = []
    for key, df in data_frames.items():
        sheet_name = refine_suffix(key)  # Using the refined filename as sheet name
        if not sheet_name:
            print(f"Skipping sheet for file {key} due to invalid suffix.")
            continue
        sheets.append((sheet_name, df))

    comparison_path = os.path.join(output_directory, "combined_output.xlsx")
    if not sheets:
        print("No comparison files with a valid suffix; combined output not written.")
        return

    # Save comparison outputs to Excel
    with pd.ExcelWriter(comparison_path, engine='openpyxl') as writer:
        for sheet_name, df in sheets:
            # Baseline and comparison rows are paired by index position.
            combined_df = pd.concat(
                [baseline_df, df.reindex(columns=comparison_columns)],
                axis=1
            ).reindex(columns=combined_columns)

            combined_df = compute_differences(combined_df)

            if combined_df.empty:
                print(f"Warning: Sheet {sheet_name} has no data. Skipping.")
                continue

            combined_df.to_excel(writer, sheet_name=sheet_name, index=False)
            sheet = writer.sheets[sheet_name]

            apply_dynamic_column_formats(sheet)
            _unbold_header_and_set_widths(sheet)
            _append_dimmed_and_billed_formulas(sheet, sheet_name)

    print(f"Comparison Excel saved at {comparison_path}")

# Script entry point: make sure the output folder exists, then run the pipeline.
if __name__ == "__main__":
    directory, output_directory = "Upload Path", "Output Path"
    os.makedirs(output_directory, exist_ok=True)
    process_files(directory, output_directory)

In [None]:
import re

def refine_suffix(filename):
    """Return the text after 'pacsimulate_<digits>.' up to the next '.' or end of string.

    Returns None when ``filename`` does not contain that prefix.
    """
    hit = re.search(r"pacsimulate_(\d+)\.(.*?)(?:$|\.)", filename)
    if hit is None:
        return None
    return hit.group(2)

# Test: print the suffix extracted from each sample file name
for sample_name in (
    "pacsimulate_2441.baseline.output_cartons",
    "pacsimulate_2441.opc_top-14.output_cartons",
    "pacsimulate_2441.pacapi.output_cartons",
):
    print(refine_suffix(sample_name))