In [50]:
import pandas as pd
import numpy as np

# Cell values that mean "no value" once surrounding whitespace is stripped.
_BLANK_TOKENS = frozenset({"", "nan", "NaN", "None"})

# Columns the reversal-matching and grouping steps cannot work without.
_REQUIRED_COLUMNS = ("Type", "Details", "Code", "Amount")

# Transaction types that undo an earlier transaction (matched case-insensitively).
_REVERSAL_PATTERN = "Unpaid Item Reversal|Payment Reversal|Failed Payment"

# Codes that do not identify a counterparty; such rows are grouped by
# (Type, Details) instead of by Code.
_SKIP_CODES = frozenset({"Billing", "Transfer"})

# Final column layout; columns absent from the input are simply skipped.
_OUTPUT_ORDER = (
    "Transaction Date", "Processed Date", "Code", "Type",
    "To/From Account Number", "Details", "Particulars", "Amount", "Balance",
)


def clean_transactions(file_path):
    """
    Cleans and processes a bank/financial transaction Excel file.

    Steps, in order: normalise text and date columns, drop the foreign
    currency columns, remove November transactions, cancel out reversals
    against the transaction they reverse, aggregate the remaining rows
    (by "Code" where the code is meaningful, otherwise by "Type" and
    "Details"), and finally label each aggregated row as Credit/Debit/Zero.

    Args:
        file_path: Path (or anything ``pandas.read_excel`` accepts) of the
            workbook to load. An already-loaded ``DataFrame`` is accepted as
            well; it is copied, never modified.

    Returns:
        A new DataFrame with one row per group. The "Particulars" column is
        overwritten with "Credit", "Debit" or "Zero" according to the sign
        of the summed "Amount".

    Raises:
        KeyError: If any of "Type", "Details", "Code" or "Amount" is missing.
    """
    # Load data
    if isinstance(file_path, pd.DataFrame):
        df = file_path.copy()
    else:
        df = pd.read_excel(file_path)

    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {', '.join(missing)}")

    df = _normalize_frame(df)
    df = _remove_matched_reversals(df)
    df = _group_transactions(df)

    # Reorder columns; copy so the assignment below targets an independent
    # frame rather than a slice of the grouped result.
    df = df[[col for col in _OUTPUT_ORDER if col in df.columns]].copy()

    # Label Credit/Debit (NaN amounts compare False both ways -> "Zero").
    amount = df["Amount"].to_numpy()
    df["Particulars"] = np.select(
        [amount > 0, amount < 0], ["Credit", "Debit"], default="Zero"
    )

    return df


def _clean_cell(value):
    """Strip a string cell and map blank placeholders to NaN; pass other values through."""
    if isinstance(value, str):
        value = value.strip()
        if value in _BLANK_TOKENS:
            return np.nan
    return value


def _join_unique(values):
    """Return the distinct non-missing values as a sorted, comma-separated string."""
    return ", ".join(sorted(set(values.dropna().astype(str))))


def _normalize_frame(df):
    """Tidy text/date columns, drop unused columns and November rows, reset the index."""
    # Only string cells are stripped. Using the ``.str`` accessor here would
    # turn every non-string cell of a mixed column (numeric codes, date
    # objects) into NaN.
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].map(_clean_cell)

    # Normalize date columns
    for col in ("Transaction Date", "Processed Date"):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce").dt.date

    # Drop unused columns
    df = df.drop(
        columns=["Conversion Charge", "Foreign Currency Amount"], errors="ignore"
    )

    # Remove November transactions if Transaction Date exists
    if "Transaction Date" in df.columns:
        month = pd.to_datetime(df["Transaction Date"], errors="coerce").dt.month
        df = df[~month.eq(11)]

    # Move 'To/From Account Number' directly after 'Processed Date'. The
    # column is removed first so the insertion index is computed on the
    # shortened list; computing it before the pop is off by one whenever the
    # account column starts to the left of 'Processed Date'.
    if {"To/From Account Number", "Processed Date"}.issubset(df.columns):
        cols = df.columns.tolist()
        cols.remove("To/From Account Number")
        cols.insert(cols.index("Processed Date") + 1, "To/From Account Number")
        df = df[cols]

    # Unique 0..n-1 labels let the later steps address rows by label safely.
    return df.reset_index(drop=True)


def _remove_matched_reversals(df):
    """Drop each reversal row together with the first ordinary row it cancels."""
    is_reversal = df["Type"].astype(str).str.contains(
        _REVERSAL_PATTERN, case=False, na=False
    )

    # Loop-invariant comparison keys, computed once.
    amount = df["Amount"]
    details = df["Details"].astype(str).str.strip()
    code = df["Code"].astype(str).str.strip()

    # Ordinary rows that have not been paired with a reversal yet.
    available = ~is_reversal

    drop_labels = []
    unmatched_reversals = []
    matched_pairs = 0

    for idx in df.index[is_reversal.to_numpy()]:
        candidates = (
            available
            & (amount == -amount.at[idx])
            & (details == details.at[idx])
            & (code == code.at[idx])
        )
        if candidates.any():
            partner = candidates.idxmax()  # label of the first matching row
            available.at[partner] = False  # each row can cancel only one reversal
            drop_labels.extend([idx, partner])
            matched_pairs += 1
        else:
            unmatched_reversals.append(df.loc[idx])

    print(f"⚠️ {matched_pairs} matched reversal pairs removed.")
    if unmatched_reversals:
        print(f"⚠️ {len(unmatched_reversals)} unmatched reversal(s) remain:")
        for row in unmatched_reversals:
            # 'Transaction Date' is optional everywhere else, so use .get here too.
            print(f"- {row.get('Transaction Date')} | {row['Details']} | {row['Amount']}")

    return df.drop(index=drop_labels).reset_index(drop=True)


def _aggregate(frame, keys, agg_spec):
    """Group *frame* by *keys*, keeping groups whose key is missing."""
    # The key columns come back from the groupby itself (as_index=False), so
    # they are not aggregated a second time.
    spec = {col: how for col, how in agg_spec.items() if col not in keys}
    # dropna=False: by default pandas discards rows with a NaN key, which
    # would silently lose transactions that have no Details/Type.
    return frame.groupby(keys, as_index=False, dropna=False).agg(spec)


def _group_transactions(df):
    """Aggregate rows by Code where meaningful, otherwise by (Type, Details)."""
    code_text = df["Code"].astype(str).str.strip()
    is_numeric_code = code_text.str.fullmatch(r"\d{3,}").astype(bool)
    has_usable_code = (
        df["Code"].notna()
        & code_text.ne("")
        & ~code_text.isin(_SKIP_CODES)
        & ~is_numeric_code
    )

    wanted = {
        "Transaction Date": "first",
        "Processed Date": "first",
        "To/From Account Number": "first",
        "Particulars": "first",
        "Balance": "first",
        "Code": "first",
        "Type": _join_unique,
        "Amount": "sum",
        "Details": "first",
    }
    # Optional columns may be absent; aggregate only what is there.
    agg_spec = {col: how for col, how in wanted.items() if col in df.columns}

    pieces = []
    by_code = df[has_usable_code]
    if not by_code.empty:
        pieces.append(_aggregate(by_code, ["Code"], agg_spec))
    by_type_details = df[~has_usable_code]
    if not by_type_details.empty:
        pieces.append(_aggregate(by_type_details, ["Type", "Details"], agg_spec))

    if not pieces:
        # Nothing left to group: return an empty frame with the same columns
        # (pd.concat raises on an empty list).
        return df.iloc[0:0].copy()
    return pd.concat(pieces, ignore_index=True)

# --- Process multiple files ---
# Every workbook listed below is run through clean_transactions() and the
# result is written to a sibling file whose name has ".xlsx" replaced by
# "_cleaned.xlsx".
files = [
    "Cap_Res_10-25.xlsx",
    "Pay_10-25.xlsx",
]
for file in files:
    df_clean = clean_transactions(file)
    out_file = file.replace(".xlsx", "_cleaned.xlsx")
    # index=False keeps the positional row index out of the spreadsheet.
    df_clean.to_excel(excel_writer=out_file, index=False)
    print(f"Saved cleaned file: {out_file}")


⚠️ 0 matched reversal pairs removed.
Saved cleaned file: Cap_Res_10-25_cleaned.xlsx
⚠️ 2 matched reversal pairs removed.
Saved cleaned file: Pay_10-25_cleaned.xlsx
