In [1]:
# Data Ingestion & Cleaning
import pandas as pd
import numpy as np
import glob
import os
import re

## Data Ingestion & Cleaning
<b> Main Functions: </b>
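- merge_product_files(data_folder) → Reads every CSV in data_folder, concatenates them, tags each row with its source file, and keeps the first row per product_code.
- clean_products(df) → Drops unused columns and invalid prices, extracts the unit price and unit of measure from the unit_price text, standardizes the price to a per-100g/100ml basis, and computes the discount as a fraction of item_price.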

In [2]:
def merge_product_files(data_folder='data', exclude=()):
    """
    Merges all CSV files in the specified folder into a single DataFrame

    Files are read in sorted path order. glob returns paths in a
    filesystem-dependent order, and because duplicates are resolved with
    keep='first', an unsorted read would let the surviving row for a
    product_code change from run to run.

    Args:
        data_folder (str): Path to folder containing CSV files
        exclude (str or iterable of str): File names (basenames, e.g.
            'products.csv') to skip. Defaults to skipping nothing. Useful
            when derived outputs are saved into the same folder as the raw
            files and must not be merged back in.

    Returns:
        pd.DataFrame: Combined product data with a 'source_file' column
        naming the CSV each row came from, reduced to one row per
        product_code (the first occurrence in sorted file order). The index
        is the concatenated row position and is not reset after
        de-duplication.

    Raises:
        ValueError: If the folder contains no CSV files to merge.
        KeyError: If the merged data has no 'product_code' column.
    """
    # Accept a single file name as well as a collection of names
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    # Find all CSV files in the folder, in a deterministic order
    all_files = sorted(
        path
        for path in glob.glob(os.path.join(data_folder, "*.csv"))
        if os.path.basename(path) not in skipped
    )
    if not all_files:
        # pd.concat([]) would raise a ValueError that does not mention the folder
        raise ValueError(f"No CSV files to merge in {data_folder!r}")

    # Read each file and remember where its rows came from
    dfs = []
    for file in all_files:
        df = pd.read_csv(file)
        df['source_file'] = os.path.basename(file)  # Track origin
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)

    # Keep one row per product_code (not only byte-identical rows):
    # the first occurrence in sorted file order wins.
    return combined_df.drop_duplicates(subset=['product_code'], keep='first')

In [3]:
def clean_products(df):
    """
    Cleans merged product data and derives unit-price and discount columns.

    Steps, in order:
      1. Drop the 'special_text', 'promo_text' and 'link' columns if present.
      2. Keep the first row per product_code.
      3. Coerce 'best_price' and 'item_price' to numbers and keep only rows
         with a positive item_price.
      4. Drop rows with no 'unit_price' text, then split that text
         (e.g. "$0.70 per 100g") into a numeric price and a lower-cased unit.
      5. Derive 'base_unit_price' (price per 100g / 100ml) and
         'discount_percentage'.

    The input frame is not modified; a new frame is returned.

    Args:
        df (pd.DataFrame): Product data containing at least 'product_code',
            'best_price', 'item_price' and 'unit_price'.

    Returns:
        pd.DataFrame: Cleaned data without the 'unit_price' column and with
        'unit_price_value', 'unit_of_measure', 'base_unit_price' and
        'discount_percentage' added. Despite its name, discount_percentage is
        stored as a fraction of item_price (0.2 means 20%).

    Raises:
        KeyError: If a required column is missing.
    """
    # Drop unnecessary columns (returns a new frame, so the caller's df is untouched)
    df = df.drop(columns=['special_text', 'promo_text', 'link'], errors='ignore')

    # Remove duplicates
    df = df.drop_duplicates(subset=['product_code'])

    # Convert and filter prices; unparseable values become NaN and are filtered out
    df = df.assign(**{
        col: pd.to_numeric(df[col], errors='coerce')
        for col in ['best_price', 'item_price']
    })
    df = df[df['item_price'].notna() & (df['item_price'] > 0)]

    # Drop null unit_price rows before extraction. The explicit copy detaches
    # the result from the filtered slice so the column assignments below do
    # not act on a view (SettingWithCopyWarning).
    df = df.dropna(subset=['unit_price']).copy()

    def extract_price_unit(text):
        """Return (price, unit) parsed from text like '$0.70 per 100g'; NaN for a missing part."""
        if pd.isna(text):
            return (np.nan, np.nan)  # Return NaN instead of None
        text = str(text)
        # Digits with optional thousands separators and at most one decimal
        # point, so "$1,234.50" parses fully and a bare "$." never reaches float()
        price = re.search(r'\$\s*([\d,]*\.?\d+)', text)
        unit = re.search(r'per\s*(.*)', text, re.IGNORECASE)
        return (
            float(price.group(1).replace(',', '')) if price else np.nan,
            unit.group(1).strip().lower() if unit else np.nan
        )

    # Explicit dtypes keep the columns well-typed even when no rows survive
    extracted = [extract_price_unit(text) for text in df['unit_price']]
    df['unit_price_value'] = pd.Series(
        [price for price, _ in extracted], index=df.index, dtype='float64'
    )
    df['unit_of_measure'] = pd.Series(
        [unit for _, unit in extracted], index=df.index, dtype='object'
    )

    def base_price(unit, price):
        """Convert a unit price to a per-100g / per-100ml basis."""
        if pd.isna(price):
            return np.nan
        u = str(unit).lower() if pd.notna(unit) else ''
        if "kg" in u:
            return price / 10  # per kg -> per 100g
        if "g" in u:
            return price
        # "ml" must be tested before the bare "l": every "ml" unit also
        # contains "l" and would otherwise be divided by 10 as if it were litres.
        if "ml" in u:
            return price
        if "l" in u:
            return price / 10  # per litre -> per 100ml
        # Each / unknown units: price is left as-is.
        # NOTE(review): matching is by substring and ignores the quantity in
        # the unit text (e.g. "10g" is treated like "100g") - confirm the
        # scraped units only use 100g / 1kg / 100ml / 1l / 1ea.
        return price

    df['base_unit_price'] = pd.Series(
        [
            base_price(unit, price)
            for unit, price in zip(df['unit_of_measure'], df['unit_price_value'])
        ],
        index=df.index,
        dtype='float64',
    )

    # Fraction saved relative to item_price; 0 when best_price is not lower
    # (including when best_price is missing).
    df['discount_percentage'] = np.where(
        df['item_price'] > df['best_price'],
        (df['item_price'] - df['best_price']) / df['item_price'],
        0.0
    )

    return df.drop(columns=['unit_price'])

In [14]:
# Folder whose *.csv files are merged into one frame (one row per product_code).
# NOTE(review): a later cell saves products.csv into this same folder, so a
# re-run of this cell also reads that earlier output back in.
folder_path = 'data/scrapped_data'
products_df = merge_product_files(folder_path)   

In [15]:
# Inspect the merged schema before cleaning.
products_df.columns

Index(['Unnamed: 0', '_id', 'product_code', 'category', 'item_name',
       'best_price', 'item_price', 'source_file', 'unit_price_value',
       'unit_of_measure', 'base_unit_price', 'discount_percentage',
       'unit_price', 'special_text', 'promo_text', 'link'],
      dtype='object')

In [16]:
# Replace the merged frame with its cleaned version.
# NOTE: not re-runnable as written - clean_products removes 'unit_price', so a
# second run on the already-cleaned frame raises KeyError; re-run the merge cell first.
products_df = clean_products(products_df)

In [20]:
# Remove the stray 'Unnamed: 0' column (presumably a saved index read back from
# an earlier CSV export - verify). errors='ignore' keeps this cell re-runnable
# and safe when none of the merged files carried that column.
products_df = products_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [21]:
# Count missing values per column to confirm cleaning left no nulls.
products_df.isna().sum()

_id                    0
product_code           0
category               0
item_name              0
best_price             0
item_price             0
source_file            0
unit_price_value       0
unit_of_measure        0
base_unit_price        0
discount_percentage    0
dtype: int64

In [22]:
# Save the cleaned products to CSV.
# NOTE(review): the file is written into folder_path, the same folder
# merge_product_files globs for "*.csv", so the next run merges this output back
# in alongside the raw files. The DataFrame index is also written (no
# index=False), which typically reappears as an 'Unnamed: 0' column on re-read.
# Consider a separate output folder and index=False once downstream readers
# are confirmed.
products_df.to_csv(folder_path + '/products.csv')

In [23]:
# Preview the first rows of the cleaned product table.
products_df.head()

Unnamed: 0,_id,product_code,category,item_name,best_price,item_price,source_file,unit_price_value,unit_of_measure,base_unit_price,discount_percentage
90756,6825a3c58951f0bfbd089c70,2488612,PET FOOD,Beef & Rice Wet Dog Food Can Adult,2.8,3.45,ScrappedData.2025_05_15_182013_Coles_All.csv,0.7,100g,0.7,0.188406
90772,6825a3c58951f0bfbd089c80,6132059,PET FOOD,Shredded Meals Chicken Dog Food 6x100g,8.4,10.5,ScrappedData.2025_05_15_182013_Coles_All.csv,1.4,100g,1.4,0.2
107041,6825a3c58951f0bfbd08dc0d,1057036,WATER,Sparkling Water Strawberry 10x375mL,9.5,19.0,ScrappedData.2025_05_15_182013_Coles_All.csv,2.53,1l,0.253,0.5
107042,6825a3c58951f0bfbd08dc0e,1057069,WATER,Sparkling Natural Water 375ml,9.5,19.0,ScrappedData.2025_05_15_182013_Coles_All.csv,2.53,1l,0.253,0.5
107533,6825a3c58951f0bfbd08ddf9,1051776,WATER,Sparkling Water Cans Pineapple 250mL,6.0,10.0,ScrappedData.2025_05_15_182013_Coles_All.csv,4.0,1l,0.4,0.4
