# In [15]:
import pandas as pd
import numpy as np
import re

# In [17]:
# Load the raw Woolworths export; every later step works on this DataFrame.
woolworths = pd.read_csv(filepath_or_buffer="Woolworths.csv")

# In [19]:
# Discard the columns that the rest of the cleaning script never reads.
woolworths = woolworths.drop(columns=["Special Text", "Link"])

def extract_price(text):
    """Return the first dollar amount found in ``text`` as a float.

    The value is converted with ``str`` first, so non-string cells (NaN,
    None, numbers) are accepted; anything without a ``$`` amount yields None.
    An optional single whitespace character is allowed between ``$`` and the
    digits, and comma thousands separators ("$1,299.00") are understood.

    Args:
        text: Raw cell value, e.g. "$3.50 / 1EA".

    Returns:
        The amount as a float, or None when no dollar amount is present.
    """
    # First alternative: comma-grouped amount such as "1,299.00". The
    # lookahead stops "1,2345" from being read as 1234.
    # Second alternative: a plain amount such as "12" or "12.50".
    match = re.search(
        r'\$\s?(\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?)',
        str(text),
    )
    if match is None:
        return None
    # The pattern guarantees a float-parsable string once commas are removed,
    # so no exception handling is needed here.
    return float(match.group(1).replace(",", ""))

# Convert each raw price column to a numeric value (None where no "$" amount
# is present), one column at a time and in the original order.
for price_column in (
    "Best Price",
    "Best Unit Price",
    "Item Price",
    "Unit Price",
    "Price Was",
):
    woolworths[price_column] = woolworths[price_column].apply(extract_price)

def calculate_total(row):
    """Return the undiscounted total for the quantity a multi-buy promo covers.

    When "Complex Promo Text" starts with a quantity followed by "FOR"
    (e.g. "2 FOR", "3 FOR", "5 FOR"), the result is that quantity times
    "Item Price". Otherwise it is "Item Price" itself. The result is rounded
    to two decimals.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with an "Item Price"
            entry and, optionally, a "Complex Promo Text" entry.

    Returns:
        The rounded total as a float, or NaN when "Item Price" is missing or
        cannot be converted to a number.
    """
    raw_price = row["Item Price"]
    if pd.isna(raw_price):
        return np.nan
    try:
        item_price = float(raw_price)
    except (TypeError, ValueError):
        # A non-numeric price cannot be multiplied; report it as missing
        # rather than letting the conversion error escape.
        return np.nan

    try:
        promo_text = str(row["Complex Promo Text"])
    except KeyError:
        # No promo column on this row: treat it as "no promotion".
        promo_text = ""

    # re.match anchors at the start, so only text that begins with "<n> FOR"
    # counts as a multi-buy offer.
    match = re.match(r"(\d+)\s+FOR", promo_text)
    if match:
        quantity = int(match.group(1))
        return round(quantity * item_price, 2)
    return round(item_price, 2)

# Row-wise pass: derive the pre-discount total from the promo text and item price.
woolworths["Original Price"] = woolworths.apply(calculate_total, axis="columns")

def extract_discount_price(text):
    """Return the multi-buy total that follows "FOR" in a promo string.

    Matches text such as "2 FOR $14.00" and returns 14.0. The value is
    converted with ``str`` first, so NaN/None cells are accepted. As in
    ``extract_price``-style parsing, one optional whitespace character may
    follow the ``$`` and comma thousands separators are understood.

    Args:
        text: Raw promo cell value.

    Returns:
        The amount after "FOR $" as a float, or None when the pattern is
        absent.
    """
    # Same amount grammar as a plain price, but it must be introduced by "FOR".
    match = re.search(
        r'FOR\s+\$\s?(\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?)',
        str(text),
    )
    if match is None:
        return None
    # The captured group is always float-parsable once commas are removed.
    return float(match.group(1).replace(",", ""))

# Pull the multi-buy total (the amount after "FOR $") out of the raw promo text.
woolworths["Discount Price"] = woolworths["Complex Promo Text"].apply(extract_discount_price)
# Only now overwrite the promo text with its first dollar amount. The statement
# above needs the original wording, so these two lines must stay in this order.
woolworths["Complex Promo Text"] = woolworths["Complex Promo Text"].apply(extract_price)

def update_original_price(row):
    """Choose the final "Original Price" for a row.

    When the row has no "Discount Price", the first non-missing value among
    "Complex Promo Text" and "Price Was" (checked in that order) is returned.
    In every other case the existing "Original Price" is returned unchanged.
    """
    was, promo, discount = (
        row["Price Was"],
        row["Complex Promo Text"],
        row["Discount Price"],
    )

    if pd.isna(discount):
        # The promo amount takes precedence over the "was" price.
        for candidate in (promo, was):
            if pd.notna(candidate):
                return candidate

    # A discount price exists, or neither fallback is available.
    return row["Original Price"]

def update_discount_price(row):
    """Return the row's "Discount Price", or its "Item Price" when that is missing."""
    promo_price, shelf_price = row["Discount Price"], row["Item Price"]
    return shelf_price if pd.isna(promo_price) else promo_price

# Resolve the final price columns row by row. "Original Price" goes first because
# update_original_price checks whether "Discount Price" is still missing, and the
# second statement fills those gaps in.
woolworths["Original Price"] = woolworths.apply(update_original_price, axis="columns")
woolworths["Discount Price"] = woolworths.apply(update_discount_price, axis="columns")

# The helper columns are no longer needed once the prices are resolved.
woolworths = woolworths.drop(columns=["Price Was", "Complex Promo Text"])

# Write the cleaned table to disk without the DataFrame index.
woolworths.to_csv(path_or_buf="woolworths_cleaned.csv", index=False)