### Pre-Merging Dataset
- Cleaning the column names
- Ensuring every dataset fits the same agreed set of columns

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re, math, json, warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

In [5]:
# Input locations. Every snapshot lives in the same folder, so the folder is
# named once here and the individual paths are derived from it; point
# DATA_DIR somewhere else to run the notebook on another machine.
DATA_DIR = Path(r"C:/Users/pmayr/Downloads")
FILE_NAMES = [
    "Coles_02_04.csv",
    "Coles_03_04.csv",
    "Coles_08_04.csv",
    "Coles_09_04.csv",
    "Coles_10_04.csv",
    "Coles_17_04.csv",
    "ColesAll_17_04.csv",
    "ColesSpecial_17_04.csv",
    "Coles_02_05.csv",
    "Coles_08_05.csv",
    "Coles_15_05.csv",
    "Coles_Perth.csv",
]
# Kept as plain strings so the list can be used anywhere a path string is expected.
file_path = [str(DATA_DIR / name) for name in FILE_NAMES]

# Fixed seed value so that outputs stay consistent between runs
random_state = 42
output_dir = DATA_DIR / "Output"
output_dir.mkdir(exist_ok=True, parents=True)

# Report (but do not fail on) input files that are not on disk
missing = [p for p in file_path if not Path(p).exists()]
if missing:
    print(f"Missing files ({len(missing)}):")
    for m in missing:
        print("-", m)

# Maps each standard column name (key) to the header names it may appear
# under in the source files (value). Only these fields are carried forward,
# which keeps the merged dataset free of unneeded columns.
col_names = {
    "sku": ["product_code"],
    "name": ["item_name"],
    "category": ["category"],
    "b_price": ["best_price"],
    "b_unit_price": ["best_unit_price"],
    "item_price": ["item_price"],
    "item_unit_price": ["unit_price"],
    "original_price": ["price_was"],
    "discount": ["special_text"],
    "promotion": ["promo_text"],
}

In [6]:
# 1. Function used to ensure that all column names are standardized
def normalize_colnames(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column labels.

    Each label is converted to text, stripped of surrounding whitespace and
    lower-cased; every run of characters outside ``a-z0-9`` is then replaced
    by a single underscore (``" Item Name"`` -> ``"item_name"``).

    Labels that are not strings (for example integer headers) are converted
    with ``str`` first so they do not raise. The input frame is left untouched.
    """
    out = df.copy()
    out.columns = [
        re.sub(r"[^a-z0-9]+", "_", str(label).strip().lower())
        for label in out.columns
    ]
    return out


# 2. Load one CSV file and standardise its header
def read_csv(file_path: str) -> pd.DataFrame:
    """Read the CSV at ``file_path`` and return it with normalised column names.

    The file is parsed with ``low_memory=False`` and the result is passed
    through ``normalize_colnames``.
    """
    return pd.read_csv(file_path, low_memory=False).pipe(normalize_colnames)

# 3. Fetch the data of the first candidate column that exists
def get_col(df: pd.DataFrame, candidates: List[str]) -> Optional[pd.Series]:
    """Return the column of ``df`` named by the first matching candidate.

    ``candidates`` is searched in order; ``None`` is returned when none of
    the names is a column of ``df``.
    """
    present = [name for name in candidates if name in df.columns]
    return df[present[0]] if present else None

# 4. Find the name of the first candidate column that exists
def get_colname(df: pd.DataFrame, candidates: List[str]) -> Optional[str]:
    """Return the first name in ``candidates`` that is a column of ``df``.

    Candidates are checked in order; ``None`` is returned when none match.
    """
    return next((name for name in candidates if name in df.columns), None)


In [7]:
# Resolve, for one frame, which actual column backs each standard field
def detect_column(df: pd.DataFrame, name_map: Dict[str, List[str]]) -> Dict[str, Optional[str]]:
    """Map every key of ``name_map`` to the matching column name in ``df``.

    For each key, its candidate list is looked up with ``get_colname``; the
    value is the first candidate found in ``df`` or ``None`` if none is found.
    """
    resolved: Dict[str, Optional[str]] = {}
    for logical, cand_list in name_map.items():
        resolved[logical] = get_colname(df, cand_list)
    return resolved

# Read every input file that exists on disk and record, per file, which of
# its columns correspond to the standard fields. Files that are not found
# are skipped.
loaded = {}
detected_maps = {}

existing_paths = [path for path in file_path if Path(path).exists()]
for fp in existing_paths:
    loaded[fp] = read_csv(fp)
    detected_maps[fp] = detect_column(loaded[fp], col_names)

print(f"Loaded files : {len(loaded)} files.")
# One row per file showing the detected source column for each field
pd.DataFrame(
    [{"file": Path(src).name, **mapping} for src, mapping in detected_maps.items()]
)

Loaded files : 12 files.


Unnamed: 0,file,sku,name,category,b_price,b_unit_price,item_price,item_unit_price,original_price,discount,promotion
0,Coles_02_04.csv,product_code,item_name,category,best_price,best_unit_price,item_price,unit_price,price_was,special_text,promo_text
1,Coles_03_04.csv,product_code,item_name,category,best_price,best_unit_price,item_price,unit_price,price_was,special_text,promo_text
2,Coles_08_04.csv,product_code,item_name,category,best_price,best_unit_price,item_price,unit_price,price_was,special_text,promo_text
3,Coles_09_04.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
4,Coles_10_04.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
5,Coles_17_04.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
6,ColesAll_17_04.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
7,ColesSpecial_17_04.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
8,Coles_02_05.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text
9,Coles_08_05.csv,product_code,item_name,category,best_price,,item_price,unit_price,,special_text,promo_text


### Merging the dataset

In [8]:
# Standard fields, in the column order used for the merged dataset
KEEP_ORDER = ["sku", "name" , "category", "b_price","b_unit_price", "item_price","item_unit_price","original_price","discount","promotion"]

def project_to_schema(df: pd.DataFrame, cmap: Dict[str, Optional[str]]) -> pd.DataFrame:
    """Return ``df`` reduced and renamed to the ``KEEP_ORDER`` columns.

    Parameters
    ----------
    df : frame holding the source columns.
    cmap : maps each standard field to the name of its source column in
        ``df``; a missing or falsy entry means the field is unavailable.

    Returns
    -------
    A frame with exactly the ``KEEP_ORDER`` columns and the same index as
    ``df``. Unavailable fields are present but entirely missing.
    """
    columns = {}
    for field in KEEP_ORDER:
        source = cmap.get(field)
        columns[field] = df[source] if source else None
    # An explicit index lets pandas broadcast the None placeholders even when
    # no field at all was detected (all-scalar input would otherwise raise).
    return pd.DataFrame(columns, index=df.index)

# Project every loaded file onto the standard schema, tag each row with the
# name of the file it came from, then stack everything into one frame and
# report its shape.
project_frames = []
for source_path, source_df in loaded.items():
    projected = project_to_schema(source_df, detected_maps[source_path])
    project_frames.append(projected.assign(__source_file__=Path(source_path).name))

staged = pd.concat(project_frames, ignore_index=True)
print("Staged shape:" , staged.shape)

Staged shape: (231340, 11)


In [9]:
# Columns that hold monetary values and must end up numeric
price_cols = ["b_price","b_unit_price","item_price","item_unit_price","original_price"]

def to_price(s: pd.Series) -> pd.Series:
    """Convert price text such as ``"$1,234.50"`` to floats.

    Values are converted to text, thousands separators (``,``) are removed,
    and the first signed decimal number found in each value is parsed.
    Values that contain no number (including missing values) become NaN.
    """
    return (
        s.astype(str)
        .str.replace(",", "", regex=False)
        # \d* (not a literal "d") so the digits before the decimal point are
        # captured together with the fraction: "12.50" -> 12.5, not 12.
        .str.extract(r"([-+]?\d*\.?\d+)")[0]
        .astype(float)
    )

# Convert each price column that is present and has at least one value;
# a failure on one column is reported and the remaining columns still run.
for col in price_cols:
    if col not in staged.columns or not staged[col].notna().any():
        continue
    try:
        staged[col] = to_price(staged[col])
    except Exception as e:
        print(f"Unable to convert values {col} : {e}")

In [10]:
# Best unit price: where it is missing, fall back to the current unit price
unit_gap = staged["b_unit_price"].isna() & staged["item_unit_price"].notna()
staged.loc[unit_gap, "b_unit_price"] = staged.loc[unit_gap, "item_unit_price"]

# Original price, first fallback: where it is missing, use the item price
orig_from_item = staged["original_price"].isna() & staged["item_price"].notna()
staged.loc[orig_from_item, "original_price"] = staged.loc[orig_from_item, "item_price"]

# Original price, second fallback: where both it and the item price are
# missing, use the best price (no discount context is available)
orig_from_best = (
    staged["original_price"].isna()
    & staged["item_price"].isna()
    & staged["b_price"].notna()
)
staged.loc[orig_from_best, "original_price"] = staged.loc[orig_from_best, "b_price"]

In [11]:
# Row count plus the number of non-missing values in the key columns
coverage = {"rows": len(staged)}
coverage.update(
    {
        f"{col}_nonnull": int(staged[col].notna().sum())
        for col in ("sku", "b_unit_price", "original_price")
    }
)
print("[COVERAGE]", coverage)

# Category sanity check: the 15 most frequent categories (missing -> UNKNOWN)
category_labels = staged["category"].fillna("UNKNOWN").astype(str).str.strip()
print(category_labels.value_counts().head(15))

# Write the merged, cleaned dataset to the output folder
output_dir.mkdir(exist_ok=True, parents=True)
staged_csv = output_dir / "staged_merged_clean.csv"
staged.to_csv(staged_csv, index=False)
print("[SAVED]", staged_csv)


[COVERAGE] {'rows': 231340, 'sku_nonnull': 231340, 'b_unit_price_nonnull': 224622, 'original_price_nonnull': 231340}
category
HAIR CARE               8096
HEALTH FOODS            6031
COFFEE                  5971
MEDICINAL PRODUCTS      5688
SNACKS                  5436
ASIAN FOODS             5333
CHILLED DESSERTS        5035
BISCUITS & COOKIES      4936
SKIN CARE               4911
VITAMINS                4775
CEREAL                  4535
TEA                     4028
ICE CREAM               3690
COSMETICS/TOILETRIES    3643
SPICES/HERBS            3602
Name: count, dtype: int64
[SAVED] C:\Users\pmayr\Downloads\Output\staged_merged_clean.csv


In [12]:
# Product names are normalised so that the same product can be recognised
# across files. To keep names standardised, size/quantity tokens (500g, 1L,
# x10) and packaging words are removed.
def norm_name(s: pd.Series) -> pd.Series:
    """Return a normalised version of the product names in ``s``.

    Steps, in order: lower-case; drop size tokens such as ``500g`` / ``1.25l``;
    replace anything outside ``a-z0-9`` and space with a space; drop
    multipack tokens (``x2``, ``x 10``); drop packaging words
    (pack, pk, btl, bottle, jar, bag, box); collapse whitespace.

    Missing input values stay missing in the result.
    """
    missing = s.isna()
    x = s.astype(str).str.lower()
    # Sizes are removed before punctuation so that a decimal size such as
    # "1.25l" is matched as one token instead of leaving a stray "1" behind.
    x = x.str.replace(r"\b(\d+(\.\d+)?)(g|kg|ml|l)\b", " ", regex=True)
    x = x.str.replace(r"[^a-z0-9 ]+", " ", regex=True)
    x = x.str.replace(r"\bx\s*\d+\b", " ", regex=True)  # x2, x10
    x = x.str.replace(r"\b(pack|pk|btl|bottle|jar|bag|box)\b", " ", regex=True)
    x = x.str.replace(r"\s+", " ", regex=True).str.strip()
    # Restore missing values (astype(str) turned them into "nan"/"none" text)
    x = x.where(~missing, np.nan)
    return x

#1. Creating normalized keys used for matching products
staged["name_norm"]   = norm_name(staged["name"])
staged["cat_norm"]    = staged["category"].astype(str).str.strip().str.lower()
staged["subcat_norm"] = np.nan

# Rows are split by whether they carry a SKU. The split must happen BEFORE
# dropping duplicates: drop_duplicates treats missing SKUs as equal to each
# other, so deduplicating the whole frame on "sku" would keep one SKU-less
# row that is then counted again in the name-based pass below.
has_sku = staged["sku"].notna()

# Rows with a SKU: keep the first occurrence of every SKU.
# NOTE(review): staged stacks several source files, so this keeps only the
# first row per SKU across all of them — confirm that dropping the later
# snapshots of the same product is intended.
d1 = staged[has_sku].drop_duplicates(subset=["sku"], keep="first")
d_rest = staged[~has_sku]

#2. For rows without SKU number, evaluate by checking name, category and prices
subset_cols = ["name_norm", "cat_norm", "b_price", "item_price", "original_price"]
d2 = d_rest.drop_duplicates(subset=subset_cols, keep="first")

dedup = pd.concat([d1, d2], ignore_index=True)

print(f"[INFO] Deduped from {len(staged)} -> {len(dedup)} rows "
      f"(SKU-based kept {len(d1)}, name+category kept {len(d2)})")

# Saving the dataset without duplicates
dedup.to_csv(output_dir / "staged_dedup.csv", index=False)
print("[SAVED]", output_dir / "staged_dedup.csv")


[INFO] Deduped from 231340 -> 24897 rows (SKU-based kept 24897, name+category kept 0)
[SAVED] C:\Users\pmayr\Downloads\Output\staged_dedup.csv


### Feature Engineering