### Goal
- Create an alert that will display remaining for the discount using Linear Regression
- Predict discount percentage

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, math, json, warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Tuple
from pickle import TRUE

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [3]:
# Reading file paths
File_Path = [
    r"C://Users//pmayr//Downloads//Coles_02_04.csv",
    r"C://Users//pmayr//Downloads//Coles_03_04.csv",
    r"C://Users//pmayr//Downloads//Coles_08_04.csv",
    r"C://Users//pmayr//Downloads//Coles_09_04.csv",
    r"C://Users//pmayr//Downloads//Coles_10_04.csv",
    r"C://Users//pmayr//Downloads//Coles_17_04.csv",
    r"C://Users//pmayr//Downloads//ColesAll_17_04.csv",
    r"C://Users//pmayr//Downloads//ColesSpecial_17_04.csv",
    r"C://Users//pmayr//Downloads//Coles_02_05.csv",
    r"C://Users//pmayr//Downloads//Coles_08_05.csv",
    r"C://Users//pmayr//Downloads//Coles_15_05.csv",
    r"C://Users//pmayr//Downloads//Coles_Perth.csv"]

#Setting random state number so all outputs is consistent
random_state = 42
output_dir = Path(r"C:/Users/pmayr/Downloads/Output")
output_dir.mkdir(exist_ok=True, parents = True)

#Ensuring path exist
missing = [p for p in File_Path if not Path(p).exists()]
if missing:
    print(f"Missing files: {missing}")
    for m in missing: print("-",m)

# Creating a dictionry to ensure all the column names are the same across all sheets
# Reducing noise by choosing selected variables
col_names = {
    "sku" : ["product_code"],
    "name" : ["item_name"],
    "category": ["category", ],
    "b_price" : ["best_price"],
    "b_unit_price" : ["best_unit_price"],
    "item_price" : ["item_price"],
    "item_unit_price" : ["unit_price"],
    "original_price" : ["price_was"],
    "discount" : ["special_text"],
    "promotion" : ["promo_text"]
}

# Sorting columns to ensure that it is all set in place
KEEP_ORDER = ["sku", "name", "category", "b_price", "b_unit_price","item_price",
              "item_unit_price", "original_price", "discount", "promotion"]
          
# Checkpoints to save updates 
save_staged = True
save_dedup = True
save_final = True

In [4]:
#Creating functions to normalize the dataset ensuring everthing is consisten
def normalize_colnames(df: pd.DataFrame) -> pd.DataFrame:
  out = df.copy()
  out.columns = [re.sub(r"[^a-z0-9]+","_", c.strip().lower()) for c in out.columns]
  return out

# 2. Function used to read csv files and normalize the column names
def read_csv(file_path: str) -> pd.DataFrame:
  df = pd.read_csv(file_path)
  return normalize_colnames(df)

# 3. Function to return the column from dataset to new column
def get_colname(df: pd.DataFrame, candidates) -> str|None:
  for c in candidates:
    c_norm = re.sub(r"[^a-z0-9]+","_", c.strip().lower())
    if c in df.columns:
      return df[c]
  return None

#4. Detecting the columsn by mapping the logical fields to the dataframe columns
def detect_cols(df: pd.DataFrame, name_map: dict) -> dict:
  return {logical: get_colname(df, cand_list) for logical, cand_list in name_map.items()}

#5. Ensuring that there is no empty cells by using the comoon schema to fill
def project_to_common_schema(df: pd.DataFrame, cmap: dict) -> pd.DataFrame:
  out = {}
  n = len(df)
  for k in KEEP_ORDER:
    src = cmap.get(k, None)
    if isinstance(src, str) and src in df.columns:
        out[k] = df[src]
    elif isinstance(src, pd.Series):
            # Fallback if an earlier detect returned a Series by mistake
        out[k] = src.reset_index(drop=True)
    else:
        out[k] = pd.Series([np.nan]*n)
  return pd.DataFrame(out)

#6. Converting all prices that are in string form to float for better visualizations
def convert_price_to_float(s: pd.Series) -> pd.Series:
  return (
      s.astype(str)
      .str.replace(",", "", regex=True)
      .str.extract(r"(\d+\.\d+)")[0]
      .astype(float)
  )

def date_from_filename(fname: str, default_year=2025):
    """Parse dd_mm from filename like Coles_02_04.csv -> 2025-04-02."""
    m = re.search(r"(\d{2})_(\d{2})", str(fname))
    if not m: return None
    day, month = int(m.group(1)), int(m.group(2))
    try: return datetime(default_year, month, day)
    except: return None

def norm_name(s: pd.Series) -> pd.Series:
    """Normalize product name for dedup: lowercase, strip units/packs."""
    x = s.astype(str).str.lower()
    x = x.str.replace(r"[^a-z0-9 ]+"," ", regex=True)
    x = x.str.replace(r"\b(\d+(\.\d+)?)(g|kg|ml|l)\b"," ", regex=True)
    x = x.str.replace(r"\bx\s*\d+\b"," ", regex=True)
    x = x.str.replace(r"\s+"," ", regex=True).str.strip()
    return x.replace({"nan": np.nan})

In [6]:
# Loading all the fiels and detecting maps and projects
loaded, detected_maps, projected_frames = {}, {}, []

missing  = [p for p in File_Path if not Path(p).exists()]
if missing:
    print("Missing files:")
    for m in missing: print(" -", m)

for fp in File_Path:
  p = Path(fp)
  if not p.exists(): 
     continue
  raw = read_csv(fp)
  loaded[fp] = raw

  cmap = detect_cols(raw, col_names)
  detected_maps[fp] = cmap

  proj = project_to_common_schema(raw, cmap)
  proj["__source_file__"] = p.name
  projected_frames.append(proj)

staged = pd.concat(projected_frames, ignore_index=True)
print("Staged shape:", staged.shape)

if save_staged:
    staged.to_csv(output_dir/"staged_merged_clean.csv", index=False)
    print("[SAVED]", output_dir/"staged_merged_clean.csv")

Staged shape: (231340, 11)
[SAVED] C:\Users\pmayr\Downloads\Output\staged_merged_clean.csv


In [7]:
# Adding scrape date, parsing prices and applying fallbacks
for c in ["b_price", "b_unit_price", "item_price", "item_unit_price", "original_p"]:
  if c in staged.columns:
    try: staged[c] = staged[c].astype(float)
    except:pass

#Applying fallbacks 
m = staged["b_unit_price"].isna() & staged["item_unit_price"].notna()
staged.loc[m, "b_unit_price"] = staged.loc[m, "item_unit_price"]

m = staged["original_price"].isna() & staged["item_price"].notna()
staged.loc[m, "original_price"] = staged.loc[m, "item_price"]

m = staged["original_price"].isna() & staged["item_price"].isna() & staged["b_price"].notna()
staged.loc[m, "original_price"] = staged.loc[m, "b_price"]

# Adding scapre data from file name
staged["scrape_date"] = staged["__source_file__"].apply(date_from_filename)
staged["scrape_date_str"] = pd.to_datetime(staged["scrape_date"], errors="coerce").dt.strftime("%Y-%m-%d")

In [8]:
# Removinf duplicate rows
staged["name_norm"] = norm_name(staged["name"])
staged["cat_norm"] = staged["category"].astype(str).str.strip().str.lower()

if staged["sku"].notna().any():
  d1 = staged.drop_duplicates(subset=["sku"], keep ="first")
  d_rest = staged[staged["sku"].isna()]
else:
  d1 = staged.copy()
  d_rest = staged.iloc[0:0]

subset_cols = ["name_norm", "cat_norm","b_price","item_price", "original_price"]
d2 = d1.drop_duplicates(subset=subset_cols, keep ="first")

dedup = pd.concat([d1, d2], ignore_index=True)
print("Deduped from: Deduped from {len(staged)} -> {len(dedup)} rows")

if save_dedup:
  dedup.to_csv(output_dir/"staged_dedup.csv", index=False)
  print("[SAVED]", output_dir/"staged_dedup.csv")

Deduped from: Deduped from {len(staged)} -> {len(dedup)} rows
[SAVED] C:\Users\pmayr\Downloads\Output\staged_dedup.csv


In [11]:
def to_price_series(s: pd.Series) -> pd.Series:
    if s is None:
        return pd.Series(dtype =float, index=dedup.index)
    s = s.astype(str).str.replace(",","", regex = False)
    num = s.str.extract(r"([-+]?\d*\.?\d+)")[0]
    return pd.to_numeric(num, errors="coerce")

for col in ["b_price", "b_unit_price", "item_price","item_unit_price","original_price"]:
    if col in dedup.columns:
        dedup[col] = to_price_series(dedup[col])


In [35]:
# Feature Engineering
fe = dedup.copy()

# discount % (safe division; clip to [0,100])
fe["disc_pct_best"] = np.where(
    fe["original_price"].notna() & (fe["original_price"] > 0) & fe["b_price"].notna(),
    (fe["original_price"] - fe["b_price"]) / fe["original_price"] * 100, np.nan
)
fe["disc_pct_item"] = np.where(
    fe["original_price"].notna() & (fe["original_price"] > 0) & fe["item_price"].notna(),
    (fe["original_price"] - fe["item_price"]) / fe["original_price"] * 100, np.nan
)
for c in ["disc_pct_best","disc_pct_item"]:
    fe.loc[~np.isfinite(fe[c]), c] = np.nan
    fe.loc[(fe[c] < -5) | (fe[c] > 100), c] = np.nan

fe["discount_percentage"] = fe[["disc_pct_best","disc_pct_item"]].max(axis=1, skipna=True).clip(0,100)

# Flags for promotions
fe["has_discount_text"] = fe.get("discount", pd.Series(index=fe.index)).notna().astype(int)
fe["has_promo_text"]    = fe.get("promotion", pd.Series(index=fe.index)).notna().astype(int)
fe["is_on_promo"] = (
    (fe["b_price"].notna() & fe["item_price"].notna() & (fe["b_price"] < fe["item_price"])) |
    (fe["has_discount_text"] == 1) | (fe["has_promo_text"] == 1)
).astype(int)

has_prices = fe["original_price"].notna() & (fe["original_price"] >= 0)
has_prices &= (fe["b_price"].notna() | fe["item_price"].notna())

mask_not_promo = has_prices & (fe["is_on_promo"] == 0)
fe.loc[mask_not_promo, "discount_percentage"] = fe.loc[mask_not_promo, "discount_percentage"].fillna(0)

fe["discount_pct_filled"] = fe["discount_percentage"].fillna(0)

print("[discount] null rate (orig):", fe["discount_percentage"].isna().mean())
print("[discount] null rate (filled):", fe["discount_pct_filled"].isna().mean())

# price gaps
fe["price_gap"]      = fe["item_price"] - fe["b_price"]
fe["unit_price_gap"] = fe["item_unit_price"] - fe["b_unit_price"]
for c in ["price_gap","unit_price_gap"]:
    fe.loc[~np.isfinite(fe[c]), c] = np.nan

# promo flags
fe["has_discount_text"] = fe.get("discount", pd.Series(index=fe.index)).notna().astype(int)
fe["has_promo_text"]    = fe.get("promotion", pd.Series(index=fe.index)).notna().astype(int)
fe["is_on_promo"] = (
    (fe["b_price"].notna() & fe["item_price"].notna() & (fe["b_price"] < fe["item_price"])) |
    (fe["has_discount_text"] == 1) | (fe["has_promo_text"] == 1)
).astype(int)

# logs for linear models
for c in ["b_price","item_price","original_price","b_unit_price","item_unit_price"]:
    fe[f"log_{c}"] = np.log1p(fe[c])

# quality flag
fe["flag_orig_lt_best"] = (
    fe["original_price"].notna() & fe["b_price"].notna() & (fe["original_price"] < fe["b_price"])
).astype(int)

[discount] null rate (orig): 0.007105189002053058
[discount] null rate (filled): 0.0


In [36]:
#Defining possible events, discount and seasons
def season_au(m):
    return {12:"summer",1:"summer",2:"summer",3:"autumn",4:"autumn",5:"autumn",
            6:"winter",7:"winter",8:"winter",9:"spring",10:"spring",11:"spring"}.get(m, "unknown")

if "scrape_date" not in fe.columns and "scrape_date_str" in fe.columns:
    fe["scrape_date"] = pd.to_datetime(fe["scrape_date_str"], errors="coerce")

fe["season"] = fe["scrape_date"].dt.month.apply(season_au)

EVENT_KEYWORDS = [
    "easter","chocolate","egg","holiday","bbq","footy","school",
    "mother","father","christmas","xmas","ramadan","eid",
    "summer","winter","spring","autumn",
    "half price","2 for","buy one get one","bogo","special","clearance"
]

def keyword_hits_from_row(row):
    blob = " ".join([
        str(row.get("promotion","")).lower(),
        str(row.get("discount","")).lower(),
        str(row.get("category","")).lower(),
        str(row.get("name","")).lower()
    ])
    return sorted({kw for kw in EVENT_KEYWORDS if kw in blob})

fe["event_tags"]    = fe.apply(keyword_hits_from_row, axis=1)
fe["has_event_tag"] = fe["event_tags"].apply(lambda lst: 1 if len(lst)>0 else 0)

easter_ref = datetime(2025, 3, 31)
fe["is_easter_window"] = np.where(
    fe["scrape_date"].notna() & (fe["scrape_date"].sub(easter_ref).abs().dt.days <= 7), 1, 0
)

In [37]:
from collections import Counter

def learn_prefix_brands(series, min_count=15):
    tokens = (
        series.astype(str).str.lower()
              .str.replace(r"[^a-z0-9 ]+"," ", regex=True)
              .str.strip().str.split().str[0]
    )
    freq = Counter(tokens.dropna())
    return {w for w,c in freq.items() if c >= min_count and len(w) > 2}

STORE_BRANDS     = {"coles","coles bakery","coles finest","coles kitchen"}
MULTIWORD_BRANDS = {"golden circle","uncle tobys","four n twenty","san remo","san pellegrino"}
SINGLEWORD_SEED  = {"maybelline","sensodyne","mcvities","arnotts","pampers","nivea","oreo","vaseline","panadol","dettol","heinz","lindt"}

fallback_single = learn_prefix_brands(fe["name"], min_count=15)

def extract_brand(name: str):
    if not isinstance(name, str) or not name.strip(): return (None, "none")
    s = re.sub(r"\s+"," ", name.strip().lower()); tokens = s.split()
    if "coles" in s: return ("coles","store_brand")
    if len(tokens)>=2 and " ".join(tokens[:2]) in MULTIWORD_BRANDS: return (" ".join(tokens[:2]), "multiword_exact")
    first = tokens[0]
    if first in SINGLEWORD_SEED:  return (first, "singleword_seed")
    if first in fallback_single:  return (first, "singleword_freq")
    return (None, "none")

b = fe["name"].apply(extract_brand)
fe["brand_clean"]      = b.apply(lambda t: t[0])
fe["brand_confidence"] = b.apply(lambda t: t[1])
fe["brand_tier"]       = np.where(
    fe["brand_clean"].isna(), "unbranded",
    np.where(fe["brand_clean"].isin(STORE_BRANDS), "store", "branded")
)

def parse_size_advanced(name: str):
    if not isinstance(name, str) or not name.strip():
        return (None, None)
    s = name.lower().replace("×", "x")

    # --- 1) multi-pack with explicit unit per item: "3 x 200g", "2x250 ml"
    m = re.search(r"(\d+)\s*x\s*(\d+(\.\d+)?)\s*(kg|g|l|ml)\b", s)
    if m:
        pack_count = float(m.group(1))
        per_qty = float(m.group(2))
        unit = m.group(4)
        # normalize to base units
        if unit == "kg": per_qty *= 1000; unit = "g"
        if unit == "l":  per_qty *= 1000; unit = "ml"
        # total size across pack; also allow combinability by using per-unit below if you prefer
        total_qty = pack_count * per_qty
        return (total_qty, unit)

    # --- 2) single sized item: "750ml", "2 L", "500 g"
    m = re.search(r"(\d+(\.\d+)?)\s*(kg|g|l|ml)\b", s)
    if m:
        qty = float(m.group(1)); unit = m.group(3)
        if unit == "kg": qty *= 1000; unit = "g"
        if unit == "l":  qty *= 1000; unit = "ml"
        return (qty, unit)

    # --- 3) pack/each without a size: "6 pack", "12pk", "x6"
    m = re.search(r"\b(\d+)\s*(pack|pk)\b", s)
    if m:
        return (float(m.group(1)), "each")
    m = re.search(r"\bx\s*(\d+)\b", s)
    if m:
        return (float(m.group(1)), "each")
    if "each" in s:
        return (1.0, "each")

    return (None, None)

# apply
sz2 = fe["name"].apply(parse_size_advanced)
fe["size_value"] = sz2.apply(lambda t: t[0])
fe["size_unit"]  = sz2.apply(lambda t: t[1])

# consistent bands
def size_band(row):
    v,u = row["size_value"], row["size_unit"]
    if pd.isna(v) or pd.isna(u): return "mixed"
    if u in ("g","ml"):
        if v < 300: return "small"
        if v <= 1200: return "medium"
        return "large"
    if u == "each":
        if v <= 2: return "small"
        if v <= 6: return "medium"
        return "large"
    return "mixed"

fe["size_band"] = fe.apply(size_band, axis=1)

# combinable size (use *per-unit* for multi-pack if you prefer)
# current choice: use total weight/volume; 'each' uses count
fe["combinable_size"] = np.where(
    fe["size_unit"].isin(["g","ml"]), fe["size_value"],
    np.where(fe["size_unit"].eq("each"), fe["size_value"], np.nan)
)

print("[size] coverage now:", fe["size_unit"].value_counts(dropna=False).to_dict())

[size] coverage now: {None: 41899, 'ml': 3623, 'each': 2560, 'g': 1600}


In [38]:
def parse_size(name: str):
    if not isinstance(name, str): return (None, None)
    s = name.lower()
    m = re.search(r"(\d+(\.\d+)?)\s*(kg|g|l|ml)\b", s)
    if m:
        qty = float(m.group(1)); unit = m.group(3)
        return (qty*1000, "g") if unit=="kg" else (qty, "g") if unit=="g" else \
               (qty*1000, "ml") if unit=="l" else (qty, "ml")
    m = re.search(r"\bx\s*(\d+)\b", s)  # packs like "x2"
    if m: return (float(m.group(1)), "each")
    if "each" in s: return (1.0, "each")
    return (None, None)

sz = fe["name"].apply(parse_size)
fe["size_value"] = sz.apply(lambda t: t[0])
fe["size_unit"]  = sz.apply(lambda t: t[1])

def size_band(row):
    v,u = row["size_value"], row["size_unit"]
    if pd.isna(v) or pd.isna(u): return "mixed"
    if u in ("g","ml"):
        if v < 300: return "small"
        if v <= 1200: return "medium"
        return "large"
    if u=="each":
        if v <= 2: return "small"
        if v <= 6: return "medium"
        return "large"
    return "mixed"

fe["size_band"] = fe.apply(size_band, axis=1)
fe["combinable_size"] = np.where(
    fe["size_unit"].isin(["g","ml"]), fe["size_value"],
    np.where(fe["size_unit"].eq("each"), fe["size_value"], np.nan)
)


In [47]:
#Exporting
export_cols = [
    # identity / traceability
    "sku","name","category","__source_file__","scrape_date_str",
    # prices
    "b_price","item_price","original_price","b_unit_price","item_unit_price",
    # engineered
    "disc_pct_best","disc_pct_item","discount_percentage","price_gap","unit_price_gap","is_on_promo",
    "discount","promotion",
    "log_b_price","log_item_price","log_original_price","log_b_unit_price","log_item_unit_price",
    "flag_orig_lt_best",
    # events
    "season","event_tags","has_event_tag","is_easter_window",
    # brands
    "brand_clean","brand_confidence","brand_tier",
    # sizes
    "size_value","size_unit","size_band","combinable_size"
]
export_cols = [c for c in export_cols if c in fe.columns]
final_df = fe[export_cols].copy()

if save_final:
    final_df.to_csv(output_dir/"staged_features_events_brands_size.csv", index=False)
    final_df.to_parquet(output_dir/"staged_features_events_brands_size.parquet", index=False)
    final_df.sample(min(1000, len(final_df)), random_state=42).to_csv(output_dir/"features_sample_1k.csv", index=False)
    print("[SAVED]", output_dir/"staged_features_events_brands_size.csv")
    print("[SAVED]", output_dir/"staged_features_events_brands_size.parquet")
    print("[SAVED]", output_dir/"features_sample_1k.csv")

# quick sanity printouts for reviewers
print("[fe] shape:", fe.shape)
if "brand_tier" in fe:
    print("[brand_tier %]", fe["brand_tier"].value_counts(normalize=True, dropna=False).round(3).to_dict())
if "has_event_tag" in fe:
    print("[has_event_tag=1]", int(fe["has_event_tag"].sum()))
if "size_band" in fe:
    print("[size_band %]", fe["size_band"].value_counts(normalize=True, dropna=False).round(3).to_dict())

[SAVED] C:\Users\pmayr\Downloads\Output\staged_features_events_brands_size.csv
[SAVED] C:\Users\pmayr\Downloads\Output\staged_features_events_brands_size.parquet
[SAVED] C:\Users\pmayr\Downloads\Output\features_sample_1k.csv
[fe] shape: (49682, 41)
[brand_tier %] {'branded': 0.671, 'unbranded': 0.326, 'store': 0.003}
[has_event_tag=1] 3713
[size_band %] {'mixed': 0.89, 'medium': 0.063, 'small': 0.038, 'large': 0.009}


# Testing the dataset

In [48]:
output_dir = Path(r"C:/Users/pmayr/Downloads/Output") 
final_csv = output_dir / "staged_features_events_brands_size.csv"
df = pd.read_csv(final_csv, low_memory=False)
print("[loaded]", final_csv, "| shape:", df.shape)
df.head()

[loaded] C:\Users\pmayr\Downloads\Output\staged_features_events_brands_size.csv | shape: (49682, 35)


Unnamed: 0,sku,name,category,__source_file__,scrape_date_str,b_price,item_price,original_price,b_unit_price,item_unit_price,disc_pct_best,disc_pct_item,discount_percentage,price_gap,unit_price_gap,is_on_promo,discount,promotion,log_b_price,log_item_price,log_original_price,log_b_unit_price,log_item_unit_price,flag_orig_lt_best,season,event_tags,has_event_tag,is_easter_window,brand_clean,brand_confidence,brand_tier,size_value,size_unit,size_band,combinable_size
0,8371390,Coles Hot Cross Buns Traditional Fruit | 6 Pack,Easter,Coles_02_04.csv,2025-04-02,3.0,4.4,4.4,0.73,0.73,31.818182,0.0,31.818182,1.4,0.0,1,SPECIAL,2 for $6,1.386294,1.686399,1.686399,0.548121,0.548121,0,autumn,"['2 for', 'easter', 'special']",1,1,coles,store_brand,store,,,mixed,
1,7473849,Coles Hot Cross Buns Choc Chip | 6 Pack,Easter,Coles_02_04.csv,2025-04-02,3.0,4.4,4.4,0.73,0.73,31.818182,0.0,31.818182,1.4,0.0,1,SPECIAL,2 for $6,1.386294,1.686399,1.686399,0.548121,0.548121,0,autumn,"['2 for', 'easter', 'special']",1,1,coles,store_brand,store,,,mixed,
2,5726070,Coles Hot Cross Buns Traditional Fruit Mini | ...,Easter,Coles_02_04.csv,2025-04-02,3.0,4.4,4.4,0.49,0.49,31.818182,0.0,31.818182,1.4,0.0,1,SPECIAL,2 for $6,1.386294,1.686399,1.686399,0.398776,0.398776,0,autumn,"['2 for', 'easter', 'special']",1,1,coles,store_brand,store,,,mixed,
3,4885191,Cadbury Dairy Milk Easter Chocolate Eggs Bag |...,Easter,Coles_02_04.csv,2025-04-02,6.7,6.7,6.7,5.88,5.88,0.0,0.0,0.0,0.0,0.0,0,,,2.04122,2.04122,2.04122,1.928619,1.928619,0,autumn,"['chocolate', 'easter', 'egg']",1,1,cadbury,singleword_freq,branded,114.0,g,small,114.0
4,3378370,Coles Hot Cross Buns Apple & Cinnamon | 6 Pack,Easter,Coles_02_04.csv,2025-04-02,3.0,4.4,4.4,0.73,0.73,31.818182,0.0,31.818182,1.4,0.0,1,SPECIAL,2 for $6,1.386294,1.686399,1.686399,0.548121,0.548121,0,autumn,"['2 for', 'easter', 'special']",1,1,coles,store_brand,store,,,mixed,


In [49]:
print("\n[dtypes]")
print(df.dtypes.sort_index())

required_cols = [
    "sku","name","category",
    "b_price","item_price","original_price",
    "b_unit_price","item_unit_price",
    "discount_percentage","is_on_promo",
    "brand_tier","size_band","season",
]
missing_cols = [c for c in required_cols if c not in df.columns]
print("\n[required cols present?]", "OK" if not missing_cols else f"Missing: {missing_cols}")



[dtypes]
__source_file__         object
b_price                float64
b_unit_price           float64
brand_clean             object
brand_confidence        object
brand_tier              object
category                object
combinable_size        float64
disc_pct_best          float64
disc_pct_item          float64
discount                object
discount_percentage    float64
event_tags              object
flag_orig_lt_best        int64
has_event_tag            int64
is_easter_window         int64
is_on_promo              int64
item_price             float64
item_unit_price        float64
log_b_price            float64
log_b_unit_price       float64
log_item_price         float64
log_item_unit_price    float64
log_original_price     float64
name                    object
original_price         float64
price_gap              float64
promotion               object
scrape_date_str         object
season                  object
size_band               object
size_unit               objec

In [50]:
null_rate = df.isna().mean().sort_values(ascending=False)
print("\n[top null rates]")
print((null_rate.head(15)*100).round(1).astype(str) + "%")



[top null rates]
discount               90.4%
combinable_size        89.0%
size_unit              89.0%
size_value             89.0%
promotion              77.0%
disc_pct_best          72.4%
disc_pct_item          72.4%
brand_clean            32.7%
unit_price_gap          4.6%
log_item_unit_price     4.6%
b_unit_price            4.6%
item_unit_price         4.6%
log_b_unit_price        4.6%
scrape_date_str         4.4%
discount_percentage     0.7%
dtype: object


In [51]:
# discount % in [0,100]
print("\n[discount % range]", float(df["discount_percentage"].min()), "→", float(df["discount_percentage"].max()))

# Flag price inversions (orig < best)
bad = df[(df["original_price"].notna()) & (df["b_price"].notna()) & (df["original_price"] < df["b_price"])]
print("[orig < best] rows:", len(bad))

# unit price sanity (optional, looser)
u_bad = df[(df["b_unit_price"].notna()) & (df["item_unit_price"].notna()) & (df["b_unit_price"] > df["item_unit_price"]*2)]
print("[b_unit_price > 2× item_unit_price] rows:", len(u_bad))


[discount % range] 0.0 → 84.7457627118644
[orig < best] rows: 35961
[b_unit_price > 2× item_unit_price] rows: 0


In [52]:
tot = len(df)
dup_sku = df["sku"].duplicated(keep=False).sum() if "sku" in df else 0
dup_combo = df.duplicated(subset=["name","category"], keep=False).sum() if set(["name","category"]).issubset(df.columns) else 0
print(f"\n[dupe check] of {tot} rows -> dup_sku={dup_sku}, dup_name+category={dup_combo}")



[dupe check] of 49682 rows -> dup_sku=49570, dup_name+category=49673


In [53]:
def pct_counts(s, top=8):
    vc = s.value_counts(dropna=False)
    pct = (vc / vc.sum() * 100).round(1).astype(str) + "%"
    return pd.DataFrame({"count": vc.head(top), "%": pct.head(top)})

print("\n[category]")
display(pct_counts(df["category"]))
print("\n[season]")
display(pct_counts(df["season"]))
print("\n[brand_tier]")
display(pct_counts(df["brand_tier"]))
print("\n[size_band]")
display(pct_counts(df["size_band"]))



[category]


Unnamed: 0_level_0,count,%
category,Unnamed: 1_level_1,Unnamed: 2_level_1
HAIR CARE,2101,4.2%
SKIN CARE,1276,2.6%
MEDICINAL PRODUCTS,1226,2.5%
PET FOOD,1206,2.4%
ASIAN FOODS,1163,2.3%
WINE,1103,2.2%
VITAMINS,1045,2.1%
HEALTH FOODS,993,2.0%



[season]


Unnamed: 0_level_0,count,%
season,Unnamed: 1_level_1,Unnamed: 2_level_1
autumn,47503,95.6%
unknown,2179,4.4%



[brand_tier]


Unnamed: 0_level_0,count,%
brand_tier,Unnamed: 1_level_1,Unnamed: 2_level_1
branded,33320,67.1%
unbranded,16204,32.6%
store,158,0.3%



[size_band]


Unnamed: 0_level_0,count,%
size_band,Unnamed: 1_level_1,Unnamed: 2_level_1
mixed,44227,89.0%
medium,3126,6.3%
small,1899,3.8%
large,430,0.9%


In [54]:
print("\n[event tag coverage]")
print(df["has_event_tag"].value_counts(dropna=False).to_frame("count"))

if "event_tags" in df.columns:
    # peek at top co-occurring tags
    from collections import Counter
    tags = df["event_tags"].dropna().astype(str).tolist()
    tags = [t.strip("[]").replace("'", "").split(", ") for t in tags]
    flat = [w for lst in tags for w in lst if w]
    print("top tags:", Counter(flat).most_common(10))



[event tag coverage]
               count
has_event_tag       
0              45969
1               3713
top tags: [('chocolate', 2000), ('egg', 517), ('bbq', 392), ('special', 358), ('2 for', 309), ('easter', 298), ('spring', 168), ('school', 96), ('summer', 75), ('footy', 66)]


In [55]:
# median discount% by category (top 10 by count)
top_cats = df["category"].value_counts().head(10).index
cat_disc = df[df["category"].isin(top_cats)].groupby("category")["discount_percentage"].median().sort_values(ascending=False)
print("\n[median discount by top categories]")
print(cat_disc.round(2))

# median discount by brand tier
if "brand_tier" in df.columns:
    print("\n[median discount by brand_tier]")
    print(df.groupby("brand_tier")["discount_percentage"].median().round(2))



[median discount by top categories]
category
ASIAN FOODS           0.0
BISCUITS & COOKIES    0.0
CONFECTIONERY         0.0
HAIR CARE             0.0
HEALTH FOODS          0.0
MEDICINAL PRODUCTS    0.0
PET FOOD              0.0
SKIN CARE             0.0
VITAMINS              0.0
WINE                  0.0
Name: discount_percentage, dtype: float64

[median discount by brand_tier]
brand_tier
branded      0.0
store        0.0
unbranded    0.0
Name: discount_percentage, dtype: float64


In [56]:
# Does every row have a date string?
if "scrape_date_str" in df.columns:
    print("\n[dates] unique scrape_date_str:", df["scrape_date_str"].nunique())
else:
    print("\n[dates] scrape_date_str not found (optional)")

# Traceability: make sure file source is present
print("[trace] __source_file__ present?", "__source_file__" in df.columns)



[dates] unique scrape_date_str: 9
[trace] __source_file__ present? True


In [57]:
summary = {
    "rows": len(df),
    "cols": df.shape[1],
    "null_rate_discount%": float(df["discount_percentage"].isna().mean()) if "discount_percentage" in df else None,
    "null_rate_prices": float(df[["b_price","item_price","original_price"]].isna().mean().mean()),
    "dup_sku": int(df["sku"].duplicated().sum()) if "sku" in df else None,
    "brand_tier_dist": df["brand_tier"].value_counts(normalize=True, dropna=False).round(3).to_dict() if "brand_tier" in df else {},
    "size_band_dist": df["size_band"].value_counts(normalize=True, dropna=False).round(3).to_dict() if "size_band" in df else {},
    "event_tag_rate": float(df["has_event_tag"].mean()) if "has_event_tag" in df else None,
}
qa_path = output_dir / "QA_summary.json"
import json; json.dump(summary, open(qa_path, "w"), indent=2)
print("[SAVED QA]", qa_path, "\n", json.dumps(summary, indent=2))


[SAVED QA] C:\Users\pmayr\Downloads\Output\QA_summary.json 
 {
  "rows": 49682,
  "cols": 35,
  "null_rate_discount%": 0.007105189002053058,
  "null_rate_prices": 0.0,
  "dup_sku": 24785,
  "brand_tier_dist": {
    "branded": 0.671,
    "unbranded": 0.326,
    "store": 0.003
  },
  "size_band_dist": {
    "mixed": 0.89,
    "medium": 0.063,
    "small": 0.038,
    "large": 0.009
  },
  "event_tag_rate": 0.0747353166136629
}
