In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import os

# 2. Config
# Destination directory for the cleaned workbook; created up front so the
# export step can write into it (no error if it already exists).
output_folder = "cleaned_transac"
os.makedirs(output_folder, exist_ok=True)

# Excel workbooks to load and clean, in order.
files_to_process = ["uncleaned.xlsx"]

# 3. Process + Clean Files
# NOTE: `df` is rebound on every pass, so any code placed after this loop
# only sees the frame built from the last entry in `files_to_process`.
for file in files_to_process:
    print(f"\nProcessing file: {file}")

    # Load the workbook and print its schema summary.
    df = pd.read_excel(file)
    df.info()

    # Normalize string columns: cast to str, trim whitespace, then turn empty
    # cells and the textual placeholders "nan"/"NaN"/"None" back into real NaN.
    string_cols = df.select_dtypes(include="object").columns
    for col in string_cols:
        df[col] = df[col].astype(str).str.strip()
        df[col] = df[col].replace(r"^(nan|NaN|None)?$", np.nan, regex=True)

    # Normalize date columns to plain `date` objects.
    for col in ["Transaction Date", "Processed Date"]:
        if col in df.columns:
            # errors="coerce" turns unparseable values into NaT instead of raising
            df[col] = pd.to_datetime(df[col], errors="coerce").dt.date

    # Drop unnecessary columns (silently skipped when absent)
    df.drop(columns=["Conversion Amount", "Foreign Currency Amount"], errors="ignore", inplace=True)

    # Remove November rows (transactions whose Transaction Date falls in month 11)
    if "Transaction Date" in df.columns:
        txn_month = pd.to_datetime(df["Transaction Date"], errors="coerce").dt.month
        df = df[~txn_month.eq(11)]

    # # Reorder columns (optional)
    # if {"To/From Account Number", "Processed Date"}.issubset(df.columns):
    #     cols = df.columns.tolist()
    #     cols.insert(cols.index("Processed Date")+1, cols.pop(cols.index("To/From Account Number")))
    #     df = df[cols]

    # 4. Remove reversed transactions/balance adjustments
    # Rows whose Type mentions one of these phrases (case-insensitive) are
    # treated as reversals; rows with a missing Type are not.
    mask_reversal = df["Type"].str.contains(
        "Unpaid Item Reversal|Payment Reversal|Failed Payment",
        case=False, na=False
    )
    reversal_df = df[mask_reversal].copy()
    indices_to_drop = set()
    unmatched_reversals = []

    # These comparison operands do not change between reversals, so build
    # them once instead of on every iteration of the loop below.
    amounts = df["Amount"]
    details_norm = df["Details"].astype(str).str.strip()
    code_norm = df["Code"].astype(str).str.strip()
    not_reversal = ~mask_reversal

    for idx, rev_row in reversal_df.iterrows():
        rev_amount = rev_row["Amount"]
        rev_details = str(rev_row["Details"]).strip()
        rev_code = str(rev_row["Code"]).strip()

        # Find the original transaction for this reversal: equal but opposite
        # amount, matching details & code, not itself a reversal, and not
        # already paired with an earlier reversal.
        match_mask = (
            (amounts == -rev_amount)
            & (details_norm == rev_details)
            & (code_norm == rev_code)
            & not_reversal
            & ~df.index.isin(indices_to_drop)
        )

        matched_labels = df.index[match_mask.to_numpy()]
        if len(matched_labels) > 0:
            # Pair the reversal with the first candidate and drop both rows.
            indices_to_drop.update([idx, matched_labels[0]])
        else:
            # No counterpart found: keep the reversal row and record it for review.
            unmatched_reversals.append(rev_row)

    df = df.drop(index=list(indices_to_drop)).reset_index(drop=True)
    # Each reversal removal involves two transactions, we divide by 2 to report the number of pairs
    print(f"⚠️ {len(indices_to_drop)//2} reversal pairs removed from {file}")

    if unmatched_reversals:
        print(f"⚠️ {len(unmatched_reversals)} unmatched reversal(s) remain in {file}")

# 5. Group by code
# NOTE: this section sits outside the per-file loop, so it operates on the
# `df` left behind by the last file processed.
code_clean = df["Code"].astype(str).str.strip()
skip_codes = {"Billing", "Transfer", "Deposit"}  # Added Deposit
is_numeric_code = code_clean.str.fullmatch(r"\d{3,}")

# A code is "valid" (gets grouped by Code) when it is present, non-empty,
# not one of the skip codes, and not made up solely of three or more digits.
mask_valid = (
    code_clean.ne("") &
    df["Code"].notna() &
    ~code_clean.isin(skip_codes) &
    ~is_numeric_code
)

df_valid = df[mask_valid].copy()
df_conditional = df[~mask_valid].copy()


def _join_types(values):
    """Return the distinct non-missing values, sorted and comma-joined."""
    # Missing Types are skipped so a NaN inside a group cannot break the join.
    return ", ".join(sorted(set(values.dropna().astype(str))))


# One output row per group: first value for descriptive columns, the distinct
# Types joined into one string, and Amount summed.
agg_common = {
        "Transaction Date": "first",
        "Processed Date": "first",
        "To/From Account Number": "first",
        "Particulars": "first",
        "Balance": "first",
        "Code": "first",
        "Type": _join_types,
        "Amount": "sum"
    }
agg_spec = {**agg_common, "Details": "first"}

# Valid codes grouped by Code
df_valid = df_valid.groupby("Code", as_index=False).agg(agg_spec)

# Conditional codes (Billing, Transfer, Deposit, numeric or missing codes)
# grouped by Type + Account. dropna=False keeps rows whose Type or account
# number is missing; the default would silently drop them from the output.
if not df_conditional.empty:
    df_conditional = (
        df_conditional.groupby(
            ["Type", "To/From Account Number"], as_index=False, dropna=False
        )
        .agg(agg_spec)
    )

df = pd.concat([df_valid, df_conditional], ignore_index=True)

# 6. Final Touches
def _credit_or_debit(amount):
    """Label a positive amount "Credit", a negative one "Debit", anything else ""."""
    if amount > 0:
        return "Credit"
    if amount < 0:
        return "Debit"
    return ""


# Preferred left-to-right column layout for the exported sheet.
desired_order = [
    "Transaction Date",
    "Processed Date",
    "Code",
    "Type",
    "To/From Account Number",
    "Details",
    "Particulars",
    "Amount",
    "Balance",
]

# Keep only the desired columns that actually exist, in the desired order.
present_columns = [name for name in desired_order if name in df.columns]
df = df[present_columns]

# Overwrite Particulars with a label derived from the sign of Amount.
df["Particulars"] = df["Amount"].apply(_credit_or_debit)

# 7. Export
# Write the final frame into the output folder, omitting the index column,
# then report where it was saved.
output_name = "cleaned.xlsx"
cleaned_file_path = os.path.join(output_folder, output_name)
df.to_excel(cleaned_file_path, index=False)
print(f"✅ Cleaned file saved: {cleaned_file_path}")




Processing file: uncleaned.xlsx
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Transaction Date        31 non-null     object 
 1   Processed Date          31 non-null     object 
 2   Code                    31 non-null     object 
 3   Type                    31 non-null     object 
 4   To/From Account Number  31 non-null     object 
 5   Details                 31 non-null     object 
 6   Particulars             31 non-null     object 
 7   Amount                  31 non-null     float64
 8   Balance                 31 non-null     float64
dtypes: float64(2), object(7)
memory usage: 2.3+ KB
⚠️ 1 reversal pairs removed from uncleaned.xlsx
✅ Cleaned file saved: cleaned_transac\cleaned.xlsx
