In [None]:
# clean_all_retailers_to_output.py
#
# Cleans 4 files (Coles, Woolies, Foodland, IGA) in the current folder
# and writes results into an "output/" folder.
# For each retailer:
#   - always save {retailer}_no_multibuy.csv
#   - save {retailer}_multibuy_only.csv only if multibuy rows exist

# Link to the 4 raw CSVs: https://drive.google.com/drive/folders/1Ws9qCe5kDvnz0Mcu2zAk1up7ck0F5RG8?usp=sharing

import pandas as pd
import re
import os

# Pulls the last number from a price-like string.
# Works for "$1.54", "was $5.55", "5.10 per L", etc.
def parse_money(x):
    """Extract the last number found in a price-like value as a float.

    Handles text such as "$1.54", "was $5.55" or "5.10 per L". Commas are
    removed before matching so "1,234.50" is read as one number, and a
    decimal written without a leading zero ("$.99") is read as 0.99.

    Args:
        x: A price given as text or as a plain number; None is accepted.

    Returns:
        The parsed amount as a float, or None when x is None, is NaN, or
        contains no number.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        value = float(x)
        # NaN is the only float that differs from itself; treat it as "no price".
        return None if value != value else value
    s = str(x).replace(",", "")
    # Second alternative: a decimal with no leading digit (".99"). The
    # lookbehind restricts it to a dot that starts a token, so inputs such as
    # "1.2.3" or "No.5" keep being read exactly as the digit-first form reads them.
    m = re.findall(r"-?(?:\d+(?:\.\d+)?|(?<![\w.])\.\d+)", s)
    return float(m[-1]) if m else None

# Turns unit-price text into one comparable number:
# returns price per 100g / 100ml, or price per each
def parse_unit_price(s: str):
    """Convert unit-price text into a single comparable number.

    Recognises "<price> per <qty><unit>" and "<price> / <qty><unit>", where
    the quantity is optional (defaults to 1) and the unit is one of g, kg,
    ml, l, each, ea or count. Matching is case-insensitive.

    Args:
        s: Unit-price text, e.g. "$1.54 per 100g" or "$5.10 per 1L".

    Returns:
        Price per 100 g / 100 ml for weight and volume units, or price per
        single item for each/ea/count. None when s is not a string, no
        unit price is found, or the stated quantity is zero.
    """
    if not isinstance(s, str):
        return None
    # Drop thousands separators (as parse_money does) so "$1,250.00 per 1kg"
    # is read as 1250.00 rather than 250.00.
    txt = s.lower().strip().replace(",", "")
    m = re.search(r"(\d*\.?\d+)\s*(?:per|/)\s*(\d*\.?\d+)?\s*(g|kg|ml|l|each|ea|count)", txt)
    if not m:
        return None
    price = float(m.group(1))
    qty = float(m.group(2)) if m.group(2) else 1.0
    unit = m.group(3)
    if qty == 0:
        # "per 0g" carries no usable rate; dividing would raise ZeroDivisionError.
        return None
    # Normalise kg -> g and l -> ml so a single scaling rule applies below.
    if unit == "kg":
        qty, unit = qty * 1000.0, "g"
    if unit == "l":
        qty, unit = qty * 1000.0, "ml"
    if unit in ("g", "ml"):
        return price * (100.0 / qty)   # per 100g/ml
    return price / qty                 # per each

# Parses "2 for $5" style promos
def parse_multibuy(s: str):
    """Pull the quantity and prices out of an "N for $X" promotion string.

    Args:
        s: Promo text such as "2 for $5"; matching is case-insensitive and
            the dollar sign is optional.

    Returns:
        A (qty, total, each) tuple: qty as int, total as float, and each as
        total / qty (None when qty is 0). All three are None when s is not
        a string or holds no "N for X" pattern.
    """
    nothing = (None, None, None)
    if not isinstance(s, str):
        return nothing

    found = re.search(r"(\d+)\s+for\s+\$?(\d*\.?\d+)", s.lower())
    if found is None:
        return nothing

    count_text, total_text = found.groups()
    qty = int(count_text)
    total = float(total_text)
    if qty == 0:
        # A zero quantity has no per-item price.
        return qty, total, None
    return qty, total, total / qty

def _first_multibuy(texts):
    """Return the first successful parse_multibuy result among texts, else three Nones."""
    for text in texts:
        parsed = parse_multibuy(text)
        if parsed[0] is not None:
            return parsed
    return None, None, None


def clean_and_split_one(input_file: str, retailer_tag: str, out_dir: str = "output"):
    """Clean one retailer CSV and write the split results into out_dir.

    Steps: normalise column names, derive numeric price columns, derive a
    standardised unit price, extract "N for $X" multibuy details from the
    promo text columns, keep only the columns used downstream, then split
    the rows into multibuy and non-multibuy sets.

    Files written:
        {out_dir}/{retailer_tag}_no_multibuy.csv    always
        {out_dir}/{retailer_tag}_multibuy_only.csv  only when multibuy rows exist

    Args:
        input_file: Path of the raw CSV to read.
        retailer_tag: Prefix used in the output file names.
        out_dir: Folder for the output files; created if it does not exist.

    Returns:
        None. When input_file does not exist, a message is printed and
        nothing is written.
    """
    if not os.path.exists(input_file):
        print(f"skip (not found): {input_file}")
        return
    os.makedirs(out_dir, exist_ok=True)

    print(f"\ncleaning {retailer_tag} from {input_file} ...")

    # keep_default_na=False so text like "N/A" stays as text (some feeds use this)
    df = pd.read_csv(input_file, encoding="utf-8", low_memory=False, keep_default_na=False)

    # simple column names: lowercase_with_underscores
    df.columns = [re.sub(r"\W+", "_", c.strip().lower()).strip("_") for c in df.columns]

    # numeric prices (robust to "was $X", "$Y", etc.)
    for raw_col in ("item_price", "best_price"):
        if raw_col in df.columns:
            df[f"{raw_col}_num"] = df[raw_col].apply(parse_money)

    # standardised unit price from text; best_unit_price only fills the gaps
    # left by unit_price
    df["unit_price_std"] = None
    if "unit_price" in df.columns:
        df["unit_price_std"] = df["unit_price"].apply(parse_unit_price)
    if "best_unit_price" in df.columns:
        df["unit_price_std"] = df["unit_price_std"].fillna(
            df["best_unit_price"].apply(parse_unit_price)
        )

    # pull multibuy info out of promo text (check whichever of these exist);
    # for each row the first column that parses wins
    promo_cols = [c for c in ["special_text", "promo_text", "complex_promo_text"] if c in df.columns]
    if promo_cols:
        # Iterate the column values directly rather than df.at[i, c] over
        # range(len(df)): no per-cell label lookup, and no reliance on the
        # frame having a default 0..n-1 index.
        row_texts = zip(*(df[c].tolist() for c in promo_cols))
        parsed = [_first_multibuy(texts) for texts in row_texts]
        df["multibuy_qty"] = [p[0] for p in parsed]
        df["multibuy_total_price"] = [p[1] for p in parsed]
        df["multibuy_each_price"] = [p[2] for p in parsed]

    # keep only the columns we actually use in visuals
    keep = [
        "category", "item_name",
        "item_price_num", "best_price_num", "unit_price_std",
        "multibuy_qty", "multibuy_total_price", "multibuy_each_price",
    ]
    df = df[[c for c in keep if c in df.columns]]

    # always create a "no multibuy" file; "multibuy only" file only if it exists
    if "multibuy_qty" in df.columns:
        is_multibuy = df["multibuy_qty"].notna()
        df_mb = df[is_multibuy].copy()
        df_nmb = df[~is_multibuy].copy()
    else:
        df_mb = pd.DataFrame(columns=df.columns)   # empty
        df_nmb = df.copy()

    # drop multibuy columns from the non-multibuy file so it stays tidy
    df_nmb = df_nmb.drop(
        columns=["multibuy_qty", "multibuy_total_price", "multibuy_each_price"],
        errors="ignore",
    )

    # save to output/
    nmb_path = os.path.join(out_dir, f"{retailer_tag}_no_multibuy.csv")
    df_nmb.to_csv(nmb_path, index=False)
    print(f"saved → {nmb_path} (rows={len(df_nmb)})")

    if len(df_mb) > 0:
        mb_path = os.path.join(out_dir, f"{retailer_tag}_multibuy_only.csv")
        df_mb.to_csv(mb_path, index=False)
        print(f"saved → {mb_path} (rows={len(df_mb)})")
    else:
        print("no multibuy rows found → only no_multibuy file saved")

if __name__ == "__main__":
    # Raw CSVs are looked up in the working directory; cleaned files go to ./output.
    # Maps each raw file name to the retailer tag used in its output file names.
    files = {
        "coles_raw.csv": "coles",
        "woolies_raw.csv": "woolies",
        "Foodland_raw.csv": "foodland",
        "IGA_raw.csv": "iga",
    }
    for raw_name, retailer in files.items():
        clean_and_split_one(input_file=raw_name, retailer_tag=retailer, out_dir="output")
    print("\nall done. check the 'output/' folder for up to 8 files (2 per retailer if multibuy exists).")



cleaning coles from coles_raw.csv ...
saved → output\coles_no_multibuy.csv (rows=26878)
saved → output\coles_multibuy_only.csv (rows=1921)

cleaning woolies from woolies_raw.csv ...
saved → output\woolies_no_multibuy.csv (rows=43224)
saved → output\woolies_multibuy_only.csv (rows=1508)

cleaning foodland from Foodland_raw.csv ...
saved → output\foodland_no_multibuy.csv (rows=480)
no multibuy rows found → only no_multibuy file saved

cleaning iga from IGA_raw.csv ...
saved → output\iga_no_multibuy.csv (rows=5373)
no multibuy rows found → only no_multibuy file saved

all done. check the 'output/' folder for up to 8 files (2 per retailer if multibuy exists).
