In [None]:
# Count tag uses to ensure all equally used
# 20 examples for each cat
# Only 7 for other categories
# Make premim relate to some of the shop wordings like taste the difference
# Ensure synthetics don't produce things that belong to other categories, like a pastry or burger into ready meals
# Set the % i want tags to set - for example usually rarely is 30%, but for organic, it is 15% maybe - maybe best to set what each tag means for a % for each tag type
    # Special occasions 5%
# If too many things are 30% for example, adjust the allocations to make rare:never:always etc a ratio (or maybe do this anyway)
# Half is when it should be always one of them - not necessarily 50/50
# Herbs & spices british cuisine etc should only be if it is a spices kit etc
# If flag the opposite tags - if something is "Consistently" or "Almost Never" for one, ensure the ones which don't have that tag show the opposite tag?

In [None]:
tag_opposites = {
    # Freshness
    "Fresh": ["Frozen food", "Non food"],
    "Frozen food": ["Fresh", "Non food"],
    
    # Health Orientation
    "Health food or drink": ["Unhealthy option", "Non food"],
    "Unhealthy option": ["Health food or drink", "Organic or additive free", "Non food"],
    "Organic or additive free": ["Non food", "Unhealthy option"],
    
    # Alternatives with no inherent opposites (besides Non food)
    "Low calorie alternative": ["Non food"],
    "Alcohol free alternative": ["Non food"],
    "Decaf alternative": ["Non food"],
    
    # Preparation Style
    "Just cook food": ["Grab & go prepared food or drink", "Non food"],
    "Grab & go prepared food or drink": ["Just cook food", "Non food"],
    
    # Animal Content vs. Vegetarian/Vegan
    "Contains meat": ["Vegetarian or vegan alternative", "Non food", "Animal produce"],
    "Contains seafood": ["Vegetarian or vegan alternative", "Non food", "Animal produce"],
    "Animal produce": ["Vegetarian or vegan alternative", "Non food", "Contains seafood", "Contains meat"],
    "Vegetarian or vegan alternative": ["Contains meat", "Contains seafood", "Animal produce", "Non food"],
    
    # Other qualifiers without defined opposites (besides Non food)
    "Halal": ["Non food"],
    "Kosher": ["Non food"],

    # Special event can go with anything
    "Special event or occasion": [],
    
    # Cuisines (no intrinsic opposites except Non food)
    "British cuisine": ["Non food"],
    "American cuisine": ["Non food"],
    "French cuisine": ["Non food"],
    "Italian cuisine": ["Non food"],
    "Mediterranean cuisine": ["Non food"],
    "Middle Eastern cuisine": ["Non food"],
    "Germanic and Slavic cuisine": ["Non food"],
    "Chinese cuisine": ["Non food"],
    "Japanese cuisine": ["Non food"],
    "Korean cuisine": ["Non food"],
    "Southeast Asian cuisine": ["Non food"],
    "South Asian cuisine": ["Non food"],
    "Mexican cuisine": ["Non food"],
    "Caribbean cuisine": ["Non food"],
    "Sub-Saharan African cuisine": ["Non food"],
    "Northern African cuisine": ["Non food"],
    
    # Taste Descriptors
    "Bitter": ["Neutral", "Non food"],
    "Neutral": ["Bitter", "Salty", "Savoury", "Sour", "Spicy", "Sweet", "Umami", "Non food"],
    "Salty": ["Neutral", "Non food"],
    "Savoury": ["Neutral", "Non food"],
    "Sour": ["Neutral", "Non food"],
    "Spicy": ["Neutral", "Non food"],
    "Sweet": ["Neutral", "Non food"],
    "Umami": ["Neutral", "Non food"],
    
    # Non food (opposite to every other category)
    "Non food": [
        "Fresh", "Frozen food", "Health food or drink", "Unhealthy option",
        "Low calorie alternative", "Alcohol free alternative", "Decaf alternative",
        "Just cook food", "Grab & go prepared food or drink", "Contains meat",
        "Contains seafood", "Animal produce", "Vegetarian or vegan alternative",
        "Organic or additive free", "Halal", "Kosher",
        "British cuisine", "American cuisine", "French cuisine", "Italian cuisine",
        "Mediterranean cuisine", "Middle Eastern cuisine", "Germanic and Slavic cuisine",
        "Chinese cuisine", "Japanese cuisine", "Korean cuisine", "Southeast Asian cuisine",
        "South Asian cuisine", "Mexican cuisine", "Caribbean cuisine",
        "Sub-Saharan African cuisine", "Northern African cuisine",
    ]
}


In [None]:
rating_distribution = {"Never": 0.0, "Almost Never": 0.05, "Rarely": 0.35, "Usually": 0.65, "Consistently": 0.95, "Always": 1.0}

In [None]:
# Function to fetch the correct definition for a characteristic
def get_characteristic_definition(characteristic):
    # Dictionary mapping characteristics to their definitions
    characteristic_definitions = {
        "Fresh": "Items that are raw or minimally processed (produce, raw meat, etc.)",
        "Frozen food": "Preserved by freezing (vegetables, ready meals, ice cream)",
        "Just cook food": "Pre-prepared foods needing minimal effort to get ready for eating (heat to eat etc.)",
        "Grab & go prepared food or drink": "Packaged for convenient consumption away from home (tinned iced coffee, sandwiches etc.)",
        "Animal produce": "Product fully or partly made by an animal (milk, eggs, honey)",
        "Contains meat": "Any kind of meat (poultry, beef, pork, lamb, etc.)",
        "Contains seafood": "Fish, shellfish, or other marine-based protein",
        "Vegetarian or vegan alternative": "Explicitly meat, seafood or animal produce-free version of a product, like soy chicken burgers or oat milk. Not including products which have no animal product version, like carrots or apples. Can apply to meat items.",
        "Organic or additive free": "Produced using natural farming methods (without synthetic pesticides or additives; animals can roam freely etc.)",
        "Ethically or sustainably sourced": "Produced under fair labour/environmental standards (fairtrade, MSC fish etc.)",
        "Health food or drink": "Marketed as beneficial to health (high in nutrients, low in harmful ingredients)",
        "Unhealthy option": "High in salt/sugar/fat; indulgent or treat foods (excludes alcohol)",
        "Low calorie alternative": "Explicitly reduced fat or calorie version of a product",
        "Alcohol free alternative": "Version of alcoholic product with zero or negligible alcohol",
        "Decaf alternative": "Version of caffeinated product like coffee or tea with most caffeine removed",
        "British cuisine": "Food items & ingredients strongly tied to traditional British cuisine (e.g., Yorkshire pudding, shepherd's pie) or labeled as British style (food only).",
        "American cuisine": "Food items & ingredients strongly tied to traditional or iconic American cuisine (e.g., BBQ sauce, burgers, fried chicken) or labeled as American style (food only).",
        "Italian cuisine": "Food items & ingredients strongly tied to traditional Italian cuisine (e.g., pasta, pizza, risotto) or labeled as Italian style (food only).",
        "Mediterranean cuisine": "Food items & ingredients strongly tied to cuisines from the Mediterranean region (e.g., Greek moussaka, Spanish paella) or labeled as Mediterranean style (food only).",
        "French cuisine": "Food items & ingredients strongly tied to traditional French cuisine (e.g., croissants, pâté, ratatouille) or labeled as French style (food only).",
        "Eastern European cuisine": "Food items & ingredients strongly tied to cuisines from Eastern European countries (e.g., Polish pierogi, Russian borscht, Hungarian goulash) or labeled as Eastern European style (food only).",
        "Middle Eastern cuisine": "Food items & ingredients strongly tied to traditional Middle Eastern cuisine (e.g., hummus, falafel, shawarma) or labeled as Middle Eastern style (food only).",
        "South Asian cuisine": "Food items & ingredients strongly tied to traditional Indian cuisine, often characterized by spices (e.g., turmeric, chili, cumin) and dishes like curries, naan, and biryani, or labeled as Indian style (food only).",
        "Chinese cuisine": "Food items & ingredients strongly tied to traditional Chinese cuisine (e.g., stir-fries, noodles, dumplings) or labeled as Chinese style (food only).",
        "Korean cuisine": "Food items & ingredients strongly tied to traditional Korean cuisine (e.g., kimchi, gochujang-based dishes, BBQ meats) or labeled as Korean style (food only).",
        "Japanese cuisine": "Food items & ingredients strongly tied to traditional Japanese cuisine (e.g., sushi, ramen, miso soup) or labeled as Japanese style (food only).",
        "Southeast Asian cuisine": "Food items & ingredients strongly tied to traditional Thai cuisine, often balancing sweet, sour, spicy, and salty flavors (e.g., curries, pad Thai) or labeled as Thai style (food only).",
        "Mexican cuisine": "Food items & ingredients strongly tied to traditional Mexican cuisine (e.g., tortillas, beans, chilies, salsas) or labeled as Mexican style (food only).",
        "Caribbean cuisine": "Food items & ingredients strongly tied to traditional Caribbean cuisine (e.g., jerk seasoning, plantains, tropical fruits) or labeled as Caribbean style (food only).",
        "Northern African cuisine": "Food items & ingredients strongly tied to cuisines from Northern Africa (e.g., tagine, couscous, harissa) or labeled as Northern African style (food only).",
        "Sub-Saharan African cuisine": "Food items & ingredients strongly tied to cuisines from Sub-Saharan Africa (e.g., jollof rice, injera, nsima) or labeled as Sub-Saharan African style (food only).",
        "Kosher": "Item specifically prepared under Jewish dietary law",
        "Halal": "Item specifically prepared under Islamic dietary law",
        "Special event or occasion": "Associated with celebrations or seasonal events"
    }
    # Return the matching definition or a fallback message
    return characteristic_definitions.get(characteristic, "Definition not found for this characteristic.")

In [1]:
import pandas as pd
import random
import math
from io import StringIO

# ----------------------------
# Configuration
# ----------------------------

# Default rating distribution (used as a weight/probability)
default_rating_distribution = {
    "Never": 0.0,
    "Almost never": 0.2,
    "Rarely": 0.4,
    "Usually": 0.6,
    "Consistently": 0.8,
    "Always": 1.0
}

# Bespoke distributions (if a tag has one, use it instead of the default)
bespoke_distributions = {
    # Example: for Special event you might define a very low chance
    "Special event or occasion": {
        "Never": 0.0,
        "Almost never": 0.01,
        "Rarely": 0.05,
        "Usually": 0.1,
        "Consistently": 0.2,
        "Always": 1.0
    }
    # You can add others as needed.
}

# Tag opposites (only a subset shown here for demonstration)
tag_opposites = {
    "Fresh": ["Frozen food", "Non food"],
    "Frozen food": ["Fresh", "Non food"],
    "Health food or drink": ["Unhealthy option", "Non food"],
    "Unhealthy option": ["Health food or drink", "Organic or additive free", "Non food"],
    "Organic or additive free": ["Unhealthy option", "Non food"],
    # … (rest of opposites would be added here)
    "Non food": []  # special: we always include a food/non-food column separately.
}

# Which columns are categories and which are flavours (order as in CSV)
category_tags = [
    "Fresh", "Frozen food", "Health food or drink", "Unhealthy option",
    "Low calorie alternative", "Just cook food", "Grab & go prepared food or drink",
    "Alcohol free alternative", "Decaf alternative", "Contains meat",
    "Contains seafood", "Animal produce", "Vegetarian or vegan alternative",
    "Organic or additive free", "Halal", "Kosher", "Special event or occasion",
    "British cuisine", "American cuisine", "French cuisine", "Italian cuisine",
    "Mediterranean cuisine", "Middle Eastern cuisine", "Germanic and Slavic cuisine",
    "Chinese cuisine", "Japanese cuisine", "Korean cuisine", "Southeast Asian cuisine",
    "South Asian cuisine", "Mexican cuisine", "Caribbean cuisine",
    "Sub-Saharan African cuisine", "Northern African cuisine"
]

flavour_tags = [
    "Bitter", "Neutral", "Non food", "Salty", "Savoury", "Sour", "Spicy", "Sweet", "Umami"
]

# Quality distribution for each row (must sum to 1)
quality_distribution = {
    "Budget": 0.25,
    "Standard": 0.50,
    "Premium": 0.25
}

# For “Half” grouping: if a rating string contains the word "Half" (case-insensitive)
def is_half_rating(rating):
    return isinstance(rating, str) and "half" in rating.lower()

# ----------------------------
# Helper Functions
# ----------------------------

def get_probability(tag, rating):
    """
    Given a tag and its rating (from the CSV cell),
    check for a bespoke distribution first; otherwise use default.
    Return the probability (weight) to assign this tag.
    """
    # Use lower-case keys for matching
    rating = rating.strip() if isinstance(rating, str) else "Never"
    dist = bespoke_distributions.get(tag, default_rating_distribution)
    return dist.get(rating, 0.0)

def assign_quality(num_rows):
    """
    For a given number of repeats, precompute quality assignments according to the ratio.
    Returns a shuffled list of quality strings.
    """
    counts = {q: int(round(quality_distribution[q] * num_rows)) for q in quality_distribution}
    # Adjust so that sum equals num_rows
    total = sum(counts.values())
    while total < num_rows:
        counts["Standard"] += 1
        total += 1
    while total > num_rows:
        counts["Standard"] -= 1
        total -= 1
    quality_list = []
    for q, cnt in counts.items():
        quality_list.extend([q] * cnt)
    random.shuffle(quality_list)
    return quality_list

def sample_tags(candidates, n_desired, already_assigned=set()):
    """
    Given a dictionary of candidate tags mapped to probabilities,
    sample exactly n_desired tags (without opposites conflicts).
    already_assigned is a set of tags already chosen for this row (for half groups, etc).
    We use weighted sampling without replacement.
    """
    selected = set()
    # Build a list of (tag, weight) for those not already assigned and not conflicting with already_assigned.
    filtered = []
    for tag, prob in candidates.items():
        # Check opposites: if any opposite of tag is in already_assigned, skip it.
        if any(opp in already_assigned for opp in tag_opposites.get(tag, [])):
            continue
        filtered.append((tag, prob))
    if not filtered:
        return selected
    # Normalize weights:
    total_weight = sum(w for tag, w in filtered)
    if total_weight == 0:
        # if no weight, pick randomly from the available tags
        tags_only = [tag for tag, w in filtered]
        return set(random.sample(tags_only, min(n_desired, len(tags_only))))
    # Do weighted sampling without replacement:
    while len(selected) < n_desired and filtered:
        r = random.uniform(0, total_weight)
        upto = 0
        for i, (tag, weight) in enumerate(filtered):
            upto += weight
            if upto >= r:
                selected.add(tag)
                # Remove any candidates that conflict with tag (using opposites)
                new_filtered = []
                for t, w in filtered:
                    if t == tag or t in tag_opposites.get(tag, []):
                        continue
                    new_filtered.append((t, w))
                filtered = new_filtered
                total_weight = sum(w for t, w in filtered)
                break
    return selected

# ----------------------------
# Main Processing Function
# ----------------------------

def process_dataframe(df):
    output_rows = []

    # For each input row:
    for idx, row in df.iterrows():
        item_name = row["L1"]
        # Decide on repeat count: if L1 contains "other" (case-insensitive), use 7, else 20.
        repeats = 7 if "other" in str(item_name).lower() else 20

        # Pre-calculate quality assignments for these repeats.
        quality_assignments = assign_quality(repeats)

        # For each repeat, we will produce an output row dict.
        for i in range(repeats):
            # We build separate sets for categories and flavours.
            # We record assigned tags (for conflict checking).
            out = {
                "item": item_name,
                "categories": set(),
                "flavours": set(),
                "Quality": quality_assignments[i],
                "Food/non food": None  # will fill below
            }
            # First, for each candidate tag (for categories and flavours), we create a weight.
            # We use the rating from the input row.
            cat_candidates = {}
            for tag in category_tags:
                rating = row.get(tag, "Never")
                # For "Half" tags, mark separately later.
                if is_half_rating(str(rating)):
                    # For now, include with its probability
                    cat_candidates[tag] = get_probability(tag, rating)
                else:
                    cat_candidates[tag] = get_probability(tag, rating)
            flav_candidates = {}
            for tag in flavour_tags:
                rating = row.get(tag, "Never")
                flav_candidates[tag] = get_probability(tag, rating)

            # Also decide the food/non-food field.
            # Here we simply do a random trial: if the "Non food" candidate probability > 0.5 then assign "Non food"
            # Otherwise "Food". (You can adjust this logic as needed.)
            nf_prob = flav_candidates.get("Non food", 0.0)
            out["Food/non food"] = "Non food" if random.random() < nf_prob else "Food"

            # Now sample a random number (between 1 and 5) of category tags.
            num_cats = random.randint(1, 5)
            # But we want to enforce that if any candidate is marked with "Half", exactly one from that subgroup is chosen.
            # For demonstration, we simply treat any category tag with a rating containing "Half" as a separate candidate.
            half_candidates = {tag: wt for tag, wt in cat_candidates.items() if is_half_rating(str(row.get(tag, "Never")))}
            normal_candidates = {tag: wt for tag, wt in cat_candidates.items() if not is_half_rating(str(row.get(tag, "Never")))}

            chosen_half = set()
            if half_candidates:
                # Force exactly one from the half group.
                chosen_half = sample_tags(half_candidates, 1)
            # Now sample the remaining (num_cats - len(chosen_half)) from the normal candidates.
            remaining = num_cats - len(chosen_half)
            chosen_normal = sample_tags(normal_candidates, remaining) if remaining > 0 else set()
            out["categories"] = chosen_half.union(chosen_normal)

            # For flavours, choose a random number between 1 and (all available candidates that pass probability).
            num_flav = random.randint(1, len(flavour_tags))
            out["flavours"] = sample_tags(flav_candidates, num_flav)

            # Enforce opposites: (this code uses our sampling procedure so that conflicting tags were less likely to be chosen)
            # Here you could add additional conflict resolution if desired.

            # Now flatten the categories and flavours into fixed columns.
            # We produce 5 category columns (pad with None) and as many flavour columns as chosen.
            cat_list = list(out["categories"])
            if len(cat_list) < 5:
                cat_list += [None]*(5-len(cat_list))
            else:
                cat_list = cat_list[:5]
            flav_list = list(out["flavours"])
            # Create output dictionary row:
            out_row = {"item": out["item"], "Quality": out["Quality"], "Food/non food": out["Food/non food"]}
            for j in range(5):
                out_row[f"tag{j+1}"] = cat_list[j]
            for j, flav in enumerate(flav_list):
                out_row[f"flav{j+1}"] = flav
            output_rows.append(out_row)
    return pd.DataFrame(output_rows)

# ----------------------------
# Dummy Data for Testing
# ----------------------------

dummy_csv = r"""L3,L2,L1,Fresh,Frozen food,Health food or drink,Unhealthy option,Low calorie alternative,Just cook food,Grab & go prepared food or drink,Alcohol free alternative,Decaf alternative,Contains meat,Contains seafood,Animal produce,Vegetarian or vegan alternative,Organic or additive free,Halal,Kosher,Special event or occasion,British cuisine,American cuisine,French cuisine,Italian cuisine,Mediterranean cuisine,Middle Eastern cuisine,Germanic and Slavic cuisine,Chinese cuisine,Japanese cuisine,Korean cuisine,Southeast Asian cuisine,South Asian cuisine,Mexican cuisine,Caribbean cuisine,Sub-Saharan African cuisine,Northern African cuisine,Bitter,Neutral,Non food,Salty,Savoury,Sour,Spicy,Sweet,Umami
Alcohol,Beer & cider,Ale,Always,Never,Never,Never,Rarely,Never,Never,Rarely,Never,Never,Never,Never,Never,Rarely,Never,Almost never,Almost never,Always,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Always,Never,Never,Almost never,Never,Almost never,Almost never,Rarely,Rarely
Alcohol,Beer & cider,Other beer & cider,Never,Never,Never,Never,Rarely,Never,Never,Rarely,Never,Never,Never,Never,Never,Rarely,Never,Almost never,Almost never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Rarely,Never,Never,Almost never,Never,Rarely,Almost never,Rarely,Almost never
Baby & child,Baby feeding,Baby food,Half,Half,Consistently,Almost never,Never,Always,Rarely,Never,Never,Usually,Rarely,Usually,Rarely,Rarely,Almost never,Almost never,Almost never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Never,Half,Never,Never,Half,Never
"""

# Read dummy CSV into DataFrame.
df_input = pd.read_csv(StringIO(dummy_csv))

# Process the DataFrame.
df_output = process_dataframe(df_input)

# Show output (for testing)
print(df_outpu.head(10))

# Optionally, write to CSV:
# df_output.to_csv("synthetic_output.csv", index=False)


  item   Quality Food/non food                      tag1  \
0  Ale    Budget          Food           British cuisine   
1  Ale  Standard          Food                    Kosher   
2  Ale    Budget          Food  Alcohol free alternative   
3  Ale   Premium          Food           British cuisine   
4  Ale   Premium          Food           British cuisine   
5  Ale  Standard          Food                     Fresh   
6  Ale    Budget          Food           British cuisine   
7  Ale    Budget          Food           British cuisine   
8  Ale   Premium          Food           British cuisine   
9  Ale  Standard          Food           British cuisine   

                       tag2   tag3                      tag4  \
0  Alcohol free alternative  Fresh   Low calorie alternative   
1                      None   None                      None   
2                     Fresh   None                      None   
3                    Kosher  Fresh                      None   
4   Low calorie alt

In [2]:
df_output

Unnamed: 0,item,Quality,Food/non food,tag1,tag2,tag3,tag4,tag5,flav1,flav2,flav3,flav4,flav5,flav6,flav7,flav8,flav9
0,Ale,Budget,Food,British cuisine,Alcohol free alternative,Fresh,Low calorie alternative,Organic or additive free,Sweet,Bitter,Umami,Salty,Sour,Spicy,,,
1,Ale,Standard,Food,Kosher,,,,,Bitter,Salty,Sweet,Spicy,,,,,
2,Ale,Budget,Food,Alcohol free alternative,Fresh,,,,Bitter,Salty,Spicy,,,,,,
3,Ale,Premium,Food,British cuisine,Kosher,Fresh,,,Bitter,Sweet,,,,,,,
4,Ale,Premium,Food,British cuisine,Low calorie alternative,Fresh,Alcohol free alternative,,Sweet,Bitter,Umami,Salty,Sour,Spicy,,,
5,Ale,Standard,Food,Fresh,Organic or additive free,,,,Sweet,Bitter,Non food,Umami,Salty,Sour,Neutral,Savoury,Spicy
6,Ale,Budget,Food,British cuisine,Fresh,,,,Sweet,Bitter,Umami,Salty,Sour,Spicy,,,
7,Ale,Budget,Food,British cuisine,Alcohol free alternative,Fresh,Organic or additive free,,Bitter,Umami,Spicy,,,,,,
8,Ale,Premium,Food,British cuisine,Low calorie alternative,,,,Sweet,Umami,Salty,Sour,Spicy,,,,
9,Ale,Standard,Food,British cuisine,Low calorie alternative,Fresh,Organic or additive free,,Sweet,Bitter,Non food,Umami,Salty,Sour,Neutral,Spicy,


In [None]:
# PBI - add write back for mis categorisation suggestions