In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
import pandas as pd

# Cluster products on price, brand, sub-category, weight and quality score,
# then persist the dataset with a new 'cluster' column.

# Load the dataset with the quality score
file_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\updated_with_quality_score.csv"
df = pd.read_csv(file_path)

# Clustering configuration, kept in one place so it is easy to tune.
FEATURE_COLUMNS = ['unit_price', 'brand_encoded', 'subcat_encoded', 'weight_grams', 'quality_score']
N_CLUSTERS = 10
RANDOM_STATE = 42
# Pinned explicitly: scikit-learn changed the default of `n_init` from 10 to
# 'auto' in version 1.4, so leaving it unset makes the resulting clusters
# depend on the installed version (and emits a FutureWarning on 1.2-1.3).
N_INIT = 10

# Encode brand and sub-category as integer codes.
# NOTE(review): integer label codes impose an arbitrary ordering/distance on
# nominal categories when used with Euclidean KMeans - consider one-hot
# encoding if cluster quality matters; left unchanged here.
df['brand_encoded'] = LabelEncoder().fit_transform(df['brand_name'])
df['subcat_encoded'] = LabelEncoder().fit_transform(df['subcat'])

# Define features for clustering
features = df[FEATURE_COLUMNS]

# KMeans does not accept NaN input; fail early with a message that names the
# offending columns instead of an opaque error from inside the estimator.
missing_counts = features.isna().sum()
if missing_counts.any():
    raise ValueError(
        "Cannot cluster: missing values in feature columns "
        f"{missing_counts[missing_counts > 0].to_dict()}"
    )

# Scale the features so no single column dominates the distance metric
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=N_INIT, random_state=RANDOM_STATE)
df['cluster'] = kmeans.fit_predict(scaled_features)

# Save the dataset with clustering
output_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\clustered_with_quality.csv"
df.to_csv(output_path, index=False)

print(f"✅ Clustering complete and dataset saved at: {output_path}")


In [None]:
import pandas as pd

# Build a table of cheaper substitutes for every product: same cluster,
# similar weight, strictly lower price and at least the same quality score.

# Load the clustered dataset
file_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\clustered_with_quality.csv"
df = pd.read_csv(file_path)

# Define weight tolerance (10% range)
WEIGHT_TOLERANCE = 0.1

# Maximum number of substitutes suggested per product
MAX_SUGGESTIONS = 3

# Output schema. Passing it explicitly to the DataFrame constructor guarantees
# the saved CSV has a header row even when no substitution is found; without
# it the file would have no columns and downstream readers would fail.
SUBSTITUTION_COLUMNS = [
    'Original Item',
    'Original Price',
    'Original Weight',
    'Original Quality',
    'Suggested Item',
    'Suggested Price',
    'Suggested Weight',
    'Suggested Quality',
    'Price Difference',
]

# Split the products by cluster once, so each product is only compared with
# members of its own cluster instead of rescanning the whole dataset.
# Row order inside each group follows the original dataset order.
cluster_pools = {cluster_id: members for cluster_id, members in df.groupby('cluster')}

# Create a list to store substitution suggestions
substitution_rows = []

# Loop through each product to find potential substitutes
for row in df.itertuples(index=False):
    pool = cluster_pools.get(row.cluster)
    if pool is None:
        # Rows without a cluster label can never match anything.
        continue

    # Define acceptable weight range
    weight_min = row.weight_grams * (1 - WEIGHT_TOLERANCE)
    weight_max = row.weight_grams * (1 + WEIGHT_TOLERANCE)

    # Identify potential substitutes (bounds of `between` are inclusive)
    is_substitute = (
        (pool['item_name'] != row.item_name)
        & (pool['unit_price'] < row.unit_price)
        & pool['weight_grams'].between(weight_min, weight_max)
        & (pool['quality_score'] >= row.quality_score)
    )

    # Select the cheapest substitutes; the stable sort resolves price ties by
    # dataset order so the selection is well defined.
    candidates = (
        pool[is_substitute]
        .sort_values(by='unit_price', kind='stable')
        .head(MAX_SUGGESTIONS)
    )

    # Add suggestions to the list
    for candidate in candidates.itertuples(index=False):
        substitution_rows.append({
            'Original Item': row.item_name,
            'Original Price': row.unit_price,
            'Original Weight': row.weight_grams,
            'Original Quality': row.quality_score,
            'Suggested Item': candidate.item_name,
            'Suggested Price': candidate.unit_price,
            'Suggested Weight': candidate.weight_grams,
            'Suggested Quality': candidate.quality_score,
            'Price Difference': round(row.unit_price - candidate.unit_price, 2)
        })

# Create the substitution DataFrame
substitution_df = pd.DataFrame(substitution_rows, columns=SUBSTITUTION_COLUMNS)

# Save the substitution dataset
output_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\smart_substitutions_with_quality.csv"
substitution_df.to_csv(output_path, index=False)

print(f"✅ Substitution dataset saved at: {output_path}")


In [None]:
import pandas as pd
import random
import uuid

# Simulate shopping carts by drawing random products from the clustered
# dataset, and save one row per (cart, item).

# Load the clustered dataset
# NOTE(review): this reads "clustered_with_quality_v2.csv", whereas the
# clustering cell in this notebook writes "clustered_with_quality.csv" -
# confirm the _v2 file is the intended input.
file_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\clustered_with_quality_v2.csv"
df = pd.read_csv(file_path)

# Define parameters
NUM_CARTS = 100  # Number of simulated carts
ITEMS_PER_CART = 5  # Items per cart
SEED = 42  # Base seed so the simulated carts are reproducible across runs

# Source column -> output column, in output order (after "Cart_ID")
CART_COLUMN_NAMES = {
    "item_name": "Item_ID",
    "unit_price": "Price",
    "weight_grams": "Weight (grams)",
    "quality_score": "Quality Score",
    "subcat": "Sub-category",
    "cluster": "Cluster",
}
CART_COLUMNS = ["Cart_ID", *CART_COLUMN_NAMES.values()]

# Sampling is without replacement, so a cart cannot hold more items than the
# dataset has rows; clamp instead of letting `sample` raise.
items_per_cart = min(ITEMS_PER_CART, len(df))

# Create a list to store the simulated carts (one dict per cart item)
simulated_carts = []

# Generate random carts
for cart_number in range(1, NUM_CARTS + 1):
    # Sequential IDs are unique by construction. A truncated random UUID can
    # collide, which would silently merge two carts for any consumer that
    # groups rows by Cart_ID.
    cart_id = f"CART-{cart_number:06d}"

    # A distinct but deterministic seed per cart: carts differ from each
    # other, yet the whole simulation is repeatable.
    cart_items = (
        df.sample(n=items_per_cart, random_state=SEED + cart_number)
        [list(CART_COLUMN_NAMES)]
        .rename(columns=CART_COLUMN_NAMES)
    )
    cart_items.insert(0, "Cart_ID", cart_id)
    simulated_carts.extend(cart_items.to_dict("records"))

# Convert to DataFrame (explicit columns keep the header even if no rows exist)
carts_df = pd.DataFrame(simulated_carts, columns=CART_COLUMNS)

# Save the simulated carts dataset
output_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\simulated_carts.csv"
carts_df.to_csv(output_path, index=False)

print(f"✅ Simulated carts dataset saved at: {output_path}")


In [None]:
import pandas as pd

# Summarise, per simulated cart, how much could be saved by swapping each item
# for its best available substitute.

# Input files: the simulated carts and the substitution suggestions
carts_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\simulated_carts.csv"
subs_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\smart_substitutions_with_quality.csv"

carts_df = pd.read_csv(carts_path)
subs_df = pd.read_csv(subs_path)


def _best_saving(product_name):
    """Return the largest 'Price Difference' listed for a product.

    Looks up rows of ``subs_df`` whose 'Original Item' equals
    ``product_name``; returns ``None`` when there are none.
    """
    options = subs_df[subs_df['Original Item'] == product_name]
    if options.empty:
        return None
    top_option = options.loc[options['Price Difference'].idxmax()]
    return top_option['Price Difference']


def _cart_summary(cart_id):
    """Build the summary record (original, final, savings) for one cart."""
    contents = carts_df[carts_df['Cart_ID'] == cart_id]
    original_total = contents['Price'].sum()

    # Accumulate left to right, skipping items that have no substitute.
    found_savings = (_best_saving(name) for name in contents['Item_ID'])
    savings_total = sum(saving for saving in found_savings if saving is not None)

    # Cart total once every available substitution has been applied
    discounted_total = original_total - savings_total

    return {
        "Cart_ID": cart_id,
        "Original Total": round(original_total, 2),
        "Final Total": round(discounted_total, 2),
        "Total Savings": round(savings_total, 2)
    }


# One summary record per distinct cart, in order of first appearance
savings_data = [_cart_summary(cart_id) for cart_id in carts_df['Cart_ID'].unique()]

# Create the savings summary DataFrame
savings_df = pd.DataFrame(savings_data)

# Save the savings summary dataset
savings_output_path = r"C:\Users\Mahsa\OneDrive\Documents\T1-2025\capstone project\sampledatafromgithub\coles-data\cart_savings_summary.csv"
savings_df.to_csv(savings_output_path, index=False)

print(f"✅ Cart savings summary saved at: {savings_output_path}")
