In [13]:
import pandas as pd
import random

# Load the source table: one row per L1 item, with an "L1" column plus rating columns that are
# looked up by tag/flavour name further down.
# NOTE(review): absolute, machine-specific path - this cell only runs where this drive is mounted.
df = pd.read_csv(r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\5 - Processed Data Files\4) Gathered Data\refined_characteristics - refined_characteristics.csv")

In [14]:
def weighted_random_selection(options, probabilities, max_select):
    """
    Pick options with independent coin tosses, then trim the result to `max_select`.

    NOTE: a later definition with the same name in this notebook replaces this one
    once both cells have run.

    Parameters:
        options (list): Candidate items.
        probabilities (list): Success probability of each candidate, aligned with `options`.
        max_select (int): Maximum number of items to return.

    Returns:
        list: The surviving options, in their original relative order.
    """
    # One toss per candidate; keep the winners together with their probability.
    survivors = [
        (candidate, chance)
        for candidate, chance in zip(options, probabilities)
        if random.random() < chance
    ]

    # Too many winners: repeatedly drop one, weighted by (1 - p), so unlikely
    # candidates are the most likely to be removed.
    while len(survivors) > max_select:
        drop_weights = [1 - chance for _, chance in survivors]
        threshold = random.random() * sum(drop_weights)
        running = 0
        for position, weight in enumerate(drop_weights):
            running += weight
            if running >= threshold:
                del survivors[position]
                break

    return [candidate for candidate, _ in survivors]

def weighted_random_selection(options, probabilities, max_select, current, opposites):
    """
    Select options by probability while keeping mutually opposite items apart.

    The returned list never contains the same option twice and never contains two
    options that are opposites of each other (in either direction of the mapping).

    Parameters:
        options (list): Candidate tags/flavours.
        probabilities (list): Corresponding probabilities for each candidate.
        max_select (int): Maximum number of items to select.
        current (set): Already-selected items (used to avoid conflicts). Not modified.
        opposites (dict): Mapping of each tag/flavour to a list of its opposites.

    Returns:
        list: The selected options (at most `max_select` items when `max_select` >= 0).
    """
    def conflicts(a, b):
        # Treat the relation as symmetric: the mapping may list only one direction.
        return b in opposites.get(a, ()) or a in opposites.get(b, ())

    # -------------------------------
    # 1. Drop candidates that clash with anything already selected.
    #    A repeated option keeps the probability of its first occurrence.
    prob_of = {}
    candidates = []
    for opt, p in zip(options, probabilities):
        if opt in prob_of:
            continue
        if any(conflicts(opt, chosen) for chosen in current):
            continue
        prob_of[opt] = p
        candidates.append(opt)

    # -------------------------------
    # 2. Partition the candidates into disjoint groups: each group is a seed option plus
    #    every not-yet-grouped candidate that is an opposite of that seed.
    #    Lists (not sets) keep the order deterministic for a given random seed.
    groups = []
    grouped = set()
    for opt in candidates:
        if opt in grouped:
            continue
        group = [opt]
        grouped.add(opt)
        for other in candidates:
            if other not in grouped and conflicts(opt, other):
                group.append(other)
                grouped.add(other)
        groups.append(group)

    # -------------------------------
    # 3. Pick at most one candidate per group.
    #    - Members that clash with something picked from an earlier group are skipped.
    #    - If the remaining probabilities sum to more than 1, one member is forced,
    #      chosen proportionally to its probability.
    #    - Otherwise members are tried in random order with one coin toss each and the
    #      first success wins.
    selected = []
    for group in groups:
        eligible = [
            opt for opt in group
            if not any(conflicts(opt, picked) for picked in selected)
        ]
        if not eligible:
            continue

        total = sum(prob_of[opt] for opt in eligible)
        if total > 1:
            r = random.random() * total
            cumulative = 0
            for opt in eligible:
                cumulative += prob_of[opt]
                if cumulative >= r:
                    selected.append(opt)
                    break
            else:
                # Guard against floating-point shortfall: a forced pick must happen.
                selected.append(eligible[-1])
        else:
            order = eligible[:]
            random.shuffle(order)
            for opt in order:
                if random.random() < prob_of[opt]:
                    selected.append(opt)
                    break  # Stop at the first successful candidate.

    # -------------------------------
    # 4. Trim to max_select with weighted removal: the lower an option's probability
    #    (the higher 1 - p), the more likely it is to be dropped.
    #    `selected` is also checked so a negative max_select cannot loop forever.
    while selected and len(selected) > max_select:
        drop_weights = [1 - prob_of[opt] for opt in selected]
        r = random.random() * sum(drop_weights)
        cumulative = 0
        drop_at = len(selected) - 1
        for i, weight in enumerate(drop_weights):
            cumulative += weight
            if cumulative >= r:
                drop_at = i
                break
        selected.pop(drop_at)

    return selected



In [15]:
# Define category tags that will be assigned to each L1 item.
# Each name is also used as a column name when reading an item's rating from the input row.
category_tags = [
    "Fresh", "Frozen food", "Health food or drink", "Unhealthy option",
    "Low calorie alternative", "Just cook food", "Grab & go prepared food or drink",
    "Alcohol free alternative", "Decaf alternative", "Contains meat",
    "Contains seafood", "Animal produce", "Vegetarian or vegan alternative",
    "Organic or additive free", "Halal", "Kosher", "Special event or occasion",
    "British cuisine", "American cuisine", "French cuisine", "Italian cuisine",
    "Mediterranean cuisine", "Middle Eastern cuisine", "Germanic and Slavic cuisine",
    "Chinese cuisine", "Japanese cuisine", "Korean cuisine", "Southeast Asian cuisine",
    "South Asian cuisine", "Mexican cuisine", "Caribbean cuisine",
    "Sub-Saharan African cuisine", "Northern African cuisine"
]

# Define flavour tags that will be assigned to each L1 item
# (looked up on the input row by name, in the same way as the category tags).
flavour_tags = [
    "Bitter", "Neutral", "Non food", "Salty", "Savoury", "Sour", "Spicy", "Sweet", "Umami"
]

# Default rating distribution used for assigning probability-based tags and flavours.
# Maps a rating label to the probability of the tag appearing; labels missing from the
# mapping (for example "Half") are given probability 0 by the lookup in the generation loop.
default_rating_distribution = {
    "Never": 0.0,
    "Almost never": 0.05, 
    "Rarely": 0.35,
    "Usually": 0.65,
    "Consistently": 0.95,
    "Always": 1.0
}

# Custom probability distributions for specific tags/flavours (if applicable).
# Looked up by exact tag/flavour name; names without an entry use the default distribution.
bespoke_distributions = {
        # NOTE(review): this key is spelled "Special evfent or occasion", which matches no entry
        # in category_tags, so this distribution is never picked up. Confirm whether that is
        # intentional before correcting the spelling: these values map "Never" to 1 and
        # "Rarely" to 0, which would change the generated data substantially.
        "Special evfent or occasion": {
        "Never": 1,
        "Almost never": 0.95,
        "Rarely": 0,
        "Usually": 1,
        "Consistently": 1,
        "Always": 1.0
    }
}

# Define opposite tag rules - tags that cannot coexist in the same row.
# Keys cover both category tags and flavours. The mapping is not fully symmetric:
# "Non food" is listed as an opposite of many entries but has no entry of its own,
# so consumers need to check both directions.
tag_opposites = {  
    "Fresh": ["Frozen food", "Non food"],
    "Frozen food": ["Fresh", "Non food"],
    "Health food or drink": ["Unhealthy option", "Non food"],
    "Unhealthy option": ["Health food or drink", "Organic or additive free", "Non food"],
    "Organic or additive free": ["Non food", "Unhealthy option"],
    "Low calorie alternative": ["Non food"],
    "Alcohol free alternative": ["Non food"],
    "Decaf alternative": ["Non food"],
    "Just cook food": ["Grab & go prepared food or drink", "Non food"],
    "Grab & go prepared food or drink": ["Just cook food", "Non food"],
    "Contains meat": ["Vegetarian or vegan alternative", "Non food", "Animal produce"],
    "Contains seafood": ["Vegetarian or vegan alternative", "Non food", "Animal produce"],
    "Animal produce": ["Vegetarian or vegan alternative", "Non food", "Contains seafood", "Contains meat"],
    "Vegetarian or vegan alternative": ["Contains meat", "Contains seafood", "Animal produce", "Non food"],
    "Halal": ["Non food"],
    "Kosher": ["Non food"],
    # NOTE(review): "British cuisine" is the only cuisine tag with an entry here - confirm
    # whether the other cuisines should also list "Non food".
    "British cuisine": ["Non food"],
    "Bitter": ["Neutral", "Non food"],
    "Neutral": ["Bitter", "Salty", "Savoury", "Sour", "Spicy", "Sweet", "Umami", "Non food"],
    "Salty": ["Neutral", "Non food"],
    "Savoury": ["Neutral", "Non food"],
    "Sour": ["Neutral", "Non food"],
    "Spicy": ["Neutral", "Non food"],
    "Sweet": ["Neutral", "Non food"],
    "Umami": ["Neutral", "Non food"]
}

In [16]:
def _conflicts_with_any(candidate, others, opposites):
    """Return True if `candidate` is an opposite of any item in `others` (either direction)."""
    candidate_opposites = opposites.get(candidate, [])
    return any(
        other in candidate_opposites or candidate in opposites.get(other, [])
        for other in others
    )


# Store all generated rows
output_rows = []

# Iterate over each L1 item in the dataset
for index, row in df.iterrows():
    # Number of synthetic rows for this L1 item: 10 if "Other" appears in its name, else 30.
    num_rows = 10 if "Other" in str(row["L1"]) else 30

    # Per-item state. Probability 1.0 means "in every row"; anything strictly between
    # 0 and 1 is additionally guaranteed one appearance via the must_once sets.
    tag_probs = {}
    flavour_probs = {}
    must_every_row_tags = set()
    must_once_tags = set()
    must_every_row_flavours = set()
    must_once_flavours = set()
    half_tags = {}
    half_flavours = {}

    # Process category tags for this L1 item (a missing column counts as "Never").
    for tag in category_tags:
        rating = row.get(tag, "Never")
        distribution = bespoke_distributions.get(tag, default_rating_distribution)
        probability = distribution.get(rating, 0)
        tag_probs[tag] = probability
        if probability == 1.0:
            must_every_row_tags.add(tag)
        elif probability > 0:
            must_once_tags.add(tag)

    # Process flavour tags for this L1 item
    for flavour in flavour_tags:
        rating = row.get(flavour, "Never")
        distribution = bespoke_distributions.get(flavour, default_rating_distribution)
        probability = distribution.get(rating, 0)
        flavour_probs[flavour] = probability
        if probability == 1.0:
            must_every_row_flavours.add(flavour)
        elif probability > 0:
            must_once_flavours.add(flavour)

    # Identify "Half" tags and split the rows evenly between them. `row.get` is used so a
    # missing column is simply "not Half", matching the guarded lookups above.
    half_tag_list = [t for t in category_tags if row.get(t) == "Half"]
    if half_tag_list:
        half_tag_distribution = num_rows // len(half_tag_list)
        for tag in half_tag_list:
            half_tags[tag] = half_tag_distribution

    # Identify and evenly distribute "Half" flavours
    half_flavour_list = [f for f in flavour_tags if row.get(f) == "Half"]
    if half_flavour_list:
        half_flavour_distribution = num_rows // len(half_flavour_list)
        for flavour in half_flavour_list:
            half_flavours[flavour] = half_flavour_distribution

    # Generate rows for this L1 item
    for _ in range(num_rows):
        row_tags = list(must_every_row_tags)
        row_flavours = list(must_every_row_flavours)

        # Assign a "Half" tag only if it doesn't conflict with any Always tag.
        if half_tags:
            valid_half_tags = [
                tag for tag in half_tags
                if not _conflicts_with_any(tag, must_every_row_tags, tag_opposites)
            ]
            if valid_half_tags:
                selected_half_tag = random.choice(valid_half_tags)
                row_tags.append(selected_half_tag)
                # Each Half tag has a fixed budget of rows; retire it once used up.
                half_tags[selected_half_tag] -= 1
                if half_tags[selected_half_tag] == 0:
                    del half_tags[selected_half_tag]

        # Assign a "Half" flavour only if it doesn't conflict with any Always flavour.
        if half_flavours:
            valid_half_flavours = [
                flavour for flavour in half_flavours
                if not _conflicts_with_any(flavour, must_every_row_flavours, tag_opposites)
            ]
            if valid_half_flavours:
                selected_half_flavour = random.choice(valid_half_flavours)
                row_flavours.append(selected_half_flavour)
                half_flavours[selected_half_flavour] -= 1
                if half_flavours[selected_half_flavour] == 0:
                    del half_flavours[selected_half_flavour]

        # Randomise candidate order per row so that opposites are not always considered in
        # the same order. Shuffled copies are used so the module-level lists stay untouched.
        shuffled_tags = random.sample(category_tags, len(category_tags))
        shuffled_flavours = random.sample(flavour_tags, len(flavour_tags))

        # Assign a must_once tag only if it doesn't conflict with any always tag.
        # The candidate is consumed either way, so one tag is handled per generated row.
        if must_once_tags:
            candidate = must_once_tags.pop()
            if not _conflicts_with_any(candidate, must_every_row_tags, tag_opposites):
                row_tags.append(candidate)

        # Assign a must_once flavour only if it doesn't conflict with any always flavour.
        if must_once_flavours:
            candidate = must_once_flavours.pop()
            if not _conflicts_with_any(candidate, must_every_row_flavours, tag_opposites):
                row_flavours.append(candidate)

        # Assign additional probabilistic tags; the selection helper receives the tags
        # chosen so far so that it can avoid their opposites.
        available_tags = [
            tag for tag in shuffled_tags
            if tag in tag_probs and tag_probs[tag] > 0 and tag not in row_tags
        ]
        num_tags = random.randint(1, min(5, len(available_tags))) if available_tags else 0
        tag_probabilities = [tag_probs[tag] for tag in available_tags]
        row_tags.extend(weighted_random_selection(available_tags, tag_probabilities, num_tags, set(row_tags), tag_opposites))

        # Assign additional probabilistic flavours in the same way.
        available_flavours = [
            flavour for flavour in shuffled_flavours
            if flavour in flavour_probs and flavour_probs[flavour] > 0 and flavour not in row_flavours
        ]
        num_flavours = random.randint(1, len(available_flavours)) if available_flavours else 0
        flavour_probabilities = [flavour_probs[flavour] for flavour in available_flavours]
        row_flavours.extend(weighted_random_selection(available_flavours, flavour_probabilities, num_flavours, set(row_flavours), tag_opposites))

        # Only the first five tags/flavours are kept; shorter lists are padded with None.
        tag_columns = {f"Tag{i+1}": row_tags[i] if i < len(row_tags) else None for i in range(5)}
        flavour_columns = {f"Flav{i+1}": row_flavours[i] if i < len(row_flavours) else None for i in range(5)}

        output_rows.append({"L1": row["L1"], **tag_columns, **flavour_columns})

In [17]:
# One DataFrame row per generated record (columns: L1, Tag1-Tag5, Flav1-Flav5).
df_output = pd.DataFrame(output_rows)

# Blank out values that repeat within a single row
def remove_row_duplicates(row):
    """
    Return a copy of `row` in which every repeated value is replaced by None.

    The first occurrence of each value is kept in place; later occurrences become None.
    The result has the same index as the input.
    """
    already_seen = set()

    def first_or_none(value):
        # Later occurrences of a value are nulled rather than removed, so positions are kept.
        if value in already_seen:
            return None
        already_seen.add(value)
        return value

    return pd.Series([first_or_none(value) for value in row], index=row.index)

# Applied row-wise across every column of the frame (L1 as well as the Tag*/Flav* columns).
df_output= df_output.apply(remove_row_duplicates, axis=1)

In [18]:
def resolve_conflicts_in_row(row, columns, opposites, weights):
    """
    Remove mutually opposite items from the given columns of one row.

    Parameters:
        row: Row (e.g. a pandas Series) indexable by the names in `columns`; modified in place.
        columns (list): Column names to inspect and rewrite.
        opposites (dict): Mapping of an item to a list of its opposites (checked both ways).
        weights (dict): Item -> weight; in a conflicting pair the lower-weighted item is
            dropped, ties are broken at random. Missing items weigh 0.

    Returns:
        The same `row`, with surviving items packed to the left of `columns` and the
        remaining columns set to None.
    """
    # Values currently present (nulls and the literal string "None" are ignored).
    kept = [row[name] for name in columns if pd.notnull(row[name]) and row[name] != "None"]

    def first_clash():
        # Scan pairs in (i, j > i) order and report the first opposing pair, if any.
        for lo in range(len(kept)):
            for hi in range(lo + 1, len(kept)):
                left, right = kept[lo], kept[hi]
                if (right in opposites.get(left, [])) or (left in opposites.get(right, [])):
                    return lo, hi
        return None

    # Remove one item per pass and rescan from the start until no clash remains.
    clash = first_clash()
    while clash is not None:
        lo, hi = clash
        weight_lo = weights.get(kept[lo], 0)
        weight_hi = weights.get(kept[hi], 0)
        if weight_lo < weight_hi:
            loser = lo
        elif weight_lo > weight_hi:
            loser = hi
        else:
            loser = lo if random.random() < 0.5 else hi
        del kept[loser]
        clash = first_clash()

    # Write the survivors back, padding the tail with None.
    padded = kept + [None] * (len(columns) - len(kept))
    for name, value in zip(columns, padded):
        row[name] = value
    return row

# Resolve opposite-tag conflicts in every generated row.
#
# `tag_probs` / `flavour_probs` are rebuilt for each L1 item inside the generation loop, so
# after that loop they only describe the LAST L1 item. Using them here would weigh every row
# by that one item's ratings. The weights are therefore recomputed per L1 item from the
# source ratings, using the same lookup as the generation loop.
def _rating_weights(source_row, names):
    """Map each tag/flavour name to its probability for one L1 item."""
    return {
        name: bespoke_distributions.get(name, default_rating_distribution).get(
            source_row.get(name, "Never"), 0
        )
        for name in names
    }

# NOTE(review): if an L1 value appears on more than one source row, the last row wins here.
tag_weights_by_l1 = {
    source_row["L1"]: _rating_weights(source_row, category_tags)
    for _, source_row in df.iterrows()
}
flavour_weights_by_l1 = {
    source_row["L1"]: _rating_weights(source_row, flavour_tags)
    for _, source_row in df.iterrows()
}

# Tags: in a conflicting pair the item with the lower probability for that L1 item is dropped.
df_output = df_output.apply(lambda row: resolve_conflicts_in_row(
    row, [f"Tag{i}" for i in range(1, 6)], tag_opposites,
    tag_weights_by_l1.get(row["L1"], {})), axis=1)

# Flavours: same procedure with the flavour probabilities as weights.
df_output = df_output.apply(lambda row: resolve_conflicts_in_row(
    row, [f"Flav{i}" for i in range(1, 6)], tag_opposites,
    flavour_weights_by_l1.get(row["L1"], {})), axis=1)

# Empty placeholder column, filled in outside this notebook section.
df_output["Generated products"] = None

In [19]:
### Following functions are just for testing

In [20]:
def print_conflicts(df, columns, opposites):
    """
    Print, for every row of `df`, the pairs of opposing items found in `columns`.

    Two items conflict when either one is listed among the other's opposites.
    Rows without conflicts produce no output. Returns None.
    """
    def are_opposites(first, second):
        return (second in opposites.get(first, [])) or (first in opposites.get(second, []))

    for idx, row in df.iterrows():
        # Only real values take part (nulls and the literal string "None" are skipped).
        present = [row[name] for name in columns if pd.notnull(row[name]) and row[name] != "None"]
        count = len(present)
        conflicts = [
            (present[i], present[j])
            for i in range(count)
            for j in range(i + 1, count)
            if are_opposites(present[i], present[j])
        ]
        if conflicts:
            print(f"Row {idx} conflicts: {conflicts}")

# Check the tag columns for remaining conflicts (prints nothing when there are none).
# Note: `tag_columns` is rebound here to a list of column names; in the generation loop the
# same name holds a dict.
tag_columns = [f"Tag{i}" for i in range(1, 6)]
print_conflicts(df_output, tag_columns, tag_opposites)

# Same check for the flavour columns:
flav_columns = [f"Flav{i}" for i in range(1, 6)]
print_conflicts(df_output, flav_columns, tag_opposites)


In [21]:
def percent_rows_with_word(df, word):
    """
    Return the percentage of rows in which at least one cell contains `word`.

    Matching is case-insensitive and literal: `word` is not interpreted as a regular
    expression, so characters such as "." or "(" match themselves. Missing cells never
    match, even though their string form would read "None" or "nan".

    Parameters:
        df (pd.DataFrame): Frame to search (all columns are searched).
        word (str): Text to look for.

    Returns:
        float: Percentage (0-100) of matching rows; 0.0 for a frame without rows.
    """
    if len(df) == 0:
        return 0.0
    # Compare on the string form of every cell so non-string columns can be searched too.
    contains_word = df.astype(str).apply(
        lambda col: col.str.contains(word, case=False, na=False, regex=False)
    )
    # Discard hits that come only from the textual rendering of a missing value.
    contains_word = contains_word & df.notna()
    # Identify rows where any column contains the word
    rows_with_word = contains_word.any(axis=1)
    return rows_with_word.mean() * 100

# Spot check for a single L1 item: percentage of its generated rows that mention "Umami".
df_filtered = df_output[df_output["L1"] == "Ale, stout & bitter"]

percent_rows_with_word(df_filtered, "Umami")

20.0

In [22]:
# Map in the levels.
# The hierarchy columns come from the same CSV that was loaded into `df` at the top of the
# notebook, so that frame is reused instead of re-reading the file from a second hard-coded path.
mapping_df = df[["L3", "L2", "L1"]].copy()

# Merge them to get all levels for item cats (inner join on the shared "L1" column).
merged_df = pd.merge(mapping_df, df_output, on="L1")

In [23]:
# Interleave the rows for later batch processing: batch 1 holds the first row of every L1
# item, batch 2 the second row of every L1 item, and so on. Within a batch the rows are
# ordered by L1; the batch number is stored in the "group" column.
#
# This is done with a per-L1 running counter plus a stable sort rather than copying rows one
# at a time, which keeps the column dtypes intact and also works for an empty frame.

# Rows without an L1 value cannot be assigned to an item and are left out.
interleaved = merged_df.dropna(subset=["L1"]).copy()

# 1-based position of each row within its own L1 item, in the existing row order.
interleaved["group"] = interleaved.groupby("L1").cumcount() + 1

# Batch number first, then L1; each (group, L1) pair occurs at most once.
merged_df = (
    interleaved
    .sort_values(["group", "L1"], kind="stable")
    .reset_index(drop=True)
)

In [24]:
merged_df

Unnamed: 0,L3,L2,L1,Tag1,Tag2,Tag3,Tag4,Tag5,Flav1,Flav2,Flav3,Flav4,Flav5,Generated products,group
0,Personal care,Wellbeing,Adult devices & sex,,,,,,Non food,,,,,,1
1,Personal care,Scents & deodorants,Aftershave & perfumes,,,,,,Non food,,,,,,1
2,Alcohol,Cocktails,Alcopops,Organic or additive free,Alcohol free alternative,Low calorie alternative,,,Spicy,Sour,Sweet,,,,1
3,Alcohol,Beer & cider,"Ale, stout & bitter",British cuisine,Organic or additive free,Alcohol free alternative,,,Bitter,Spicy,Umami,Sweet,,,1
4,Personal care,Medicine,Allergies,Health food or drink,,,,,Non food,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10965,Home,Home maintenance,White goods,,,,,,Non food,,,,,,30
10966,Alcohol,Wine,White wine,Italian cuisine,French cuisine,American cuisine,Mediterranean cuisine,British cuisine,Sweet,,,,,,30
10967,Bakery,"Wraps, naans & pittas",Wraps,Mexican cuisine,Caribbean cuisine,Unhealthy option,Low calorie alternative,,Savoury,Salty,Umami,,,,30
10968,Pantry staples,Spreads,Yeast extracts & savoury spreads,Organic or additive free,,,,,Umami,Savoury,Salty,,,,30


In [25]:
# Write the interleaved synthetic rows to CSV without the DataFrame index.
# NOTE(review): absolute, machine-specific output path.
merged_df.to_csv(r"G:\My Drive\Wantrepreneurialism\Active\spend-analytics\Tesco Clubcards\5 - Processed Data Files\4) Gathered Data\synthetic_categories.csv", index=False)