In [49]:
import pandas as pd
from collections import Counter, defaultdict

# Load the cleaned IGA dataset into `df`.
# Replace the path with your actual filename if different.
df = pd.read_csv(filepath_or_buffer="../IGA/synthetic_data_IGA_cleaned_200k.csv")
df.head(5)


Unnamed: 0,item_id,item_name,brand,category,best_price,unit_price,total_price,discount,quantity,unit_type,store_name,location,gender,date,sub_category,transaction_id,customer_id
0,331257,Continental Cup A Soup Classic Chicken Noodle,continental,Pantry,1.081097,1.30758,9.15,0.756768,8.463625,each,IGA CASTLEMAINE,241 BARKER St CASTLEMAINE VIC 3450,Male,03/10/2025,Condiments,IGA-1844548,IGA-4365192975
1,661555,Mainland Sweet Cinnamon Spreadable,mainland,"Dairy, Eggs & Fridge",4.59632,5.64563,39.52,1.53,0.859818,Kg,IGA CRESWICK,48 ALBERT St CRESWICK VIC 3363,Male,04/07/2025,,IGA-6418171,IGA-7582876796
2,875337,Sabco Kitchen Sponge Cloth,sabco,Cleaning & Maintenance,2.024342,1.827336,16.45,,8.126099,each,CARLOS SUPA IGA GILGANDRA,52 MILLER St GILGANDRA NSW 2827,Male,01/13/2025,Surface Cleaners,IGA-4360043,IGA-8386012397
3,20000005528,Dole Pineapple Chunks In Juice,dole pineapple,Pantry,1.112534,1.069433,1.07,,0.096177,Kg,IGA PUTNEY,240 MORRISON Rd PUTNEY NSW 2112,Female,10/24/2025,Condiments,IGA-3463882,IGA-6274867254
4,84672,McKenzie's Ground Cinnamon,mckenzie's,Pantry,4.406025,5.564741,50.08,-3.2,1.136625,Kg,IGA PENSHURST,11 Bridge St PENSHURST NSW 2222,Female,01/04/2025,Condiments,IGA-5562825,IGA-6915098402


In [50]:
import pandas as pd


# Cut the frame at its midpoint; with an odd row count the second part
# receives the extra row because of the floor division.
halfway = len(df) // 2
df_part1, df_part2 = df.iloc[:halfway], df.iloc[halfway:]

# Write each part to its own CSV file, leaving out the index column.
df_part1.to_csv(path_or_buf='../IGA/synthetic_data_IGA_cleaned_200k_part1.csv', index=False)
df_part2.to_csv(path_or_buf='../IGA/synthetic_data_IGA_cleaned_200k_part2.csv', index=False)

print("CSV file has been split into two parts: part1.csv and part2.csv")


CSV file has been split into two parts: part1.csv and part2.csv


In [41]:
from collections import Counter, defaultdict

def extract_and_fill_brands(df):
    """Derive ``brand`` from ``item_name`` and recompute ``quantity``, in place.

    Brand detection works on the distinct, lower-cased, whitespace-tokenised
    item names:

    * If a name's first word is longer than one character and no other
      distinct name starts with it, that word is the brand.
    * Otherwise the brand is the leading run of words that can be extended
      word by word without changing how many distinct names share the
      prefix (e.g. "old el paso" when every name starting with "old" also
      starts with "old el paso").

    Every row then receives the longest detected brand that its lower-cased
    ``item_name`` starts with (a plain string-prefix match). Rows whose
    ``item_name`` is not a string, or that match no brand, get ``None``.

    ``quantity`` is recomputed for rows where ``total_price`` and
    ``best_price`` are both present and ``best_price`` is non-zero:
    ``total_price / best_price`` when ``unit_type == 'each'``, otherwise
    ``total_price / (best_price * 10)``. All other rows keep their existing
    ``quantity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``item_name``, ``total_price``,
        ``best_price``, ``unit_type`` and ``quantity``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``brand`` and ``quantity``
        columns are overwritten.
    """
    # Multiplier applied to best_price for rows whose unit_type is not 'each'.
    # NOTE(review): value carried over unchanged; its rationale is not
    # documented in this notebook -- confirm with the data owner.
    non_each_price_factor = 10

    # Non-string names (e.g. numbers) cannot be tokenised, so they are skipped
    # here and end up with brand None below.
    item_names = [
        name for name in df['item_name'].dropna().unique() if isinstance(name, str)
    ]
    tokenized_names = [name.lower().split() for name in item_names]

    # One Counter per prefix length, built on first use and then reused.
    # Rebuilding it for every lookup made brand detection quadratic in the
    # number of distinct names.
    prefix_counts_by_length = {}

    def count_prefix(prefix):
        """Return how many distinct names begin with the token tuple ``prefix``."""
        length = len(prefix)
        counts = prefix_counts_by_length.get(length)
        if counts is None:
            counts = Counter(
                tuple(tokens[:length])
                for tokens in tokenized_names
                if len(tokens) >= length
            )
            prefix_counts_by_length[length] = counts
        return counts[prefix]

    detected_brands = set()

    for tokens in tokenized_names:
        if not tokens:
            continue

        first_word = tokens[0]

        # A first word that opens exactly one distinct name is taken as the
        # brand directly.
        if len(first_word) > 1 and count_prefix((first_word,)) == 1:
            detected_brands.add(first_word)
            continue

        # Skip a leading single non-numeric character before prefix detection.
        # NOTE(review): count_prefix is anchored at token 0 while the candidate
        # below is anchored at `start`, so with start == 1 the counts do not
        # describe the same position. Kept as in the original -- TODO confirm
        # the intended behaviour.
        start = 1 if len(first_word) == 1 and not first_word.isnumeric() else 0
        end = start + 1
        if end > len(tokens):
            # Nothing is left after the skipped token, so there is no brand.
            continue

        # Grow the prefix while adding the next word keeps the same number of
        # distinct names sharing it.
        while end < len(tokens) and (
            count_prefix(tuple(tokens[start:end + 1]))
            == count_prefix(tuple(tokens[start:end]))
        ):
            end += 1

        detected_brands.add(" ".join(tokens[start:end]))

    # Probe longer brands first so that the longest matching prefix wins.
    brand_lengths = sorted({len(brand) for brand in detected_brands}, reverse=True)

    def detect_brand(name):
        """Return the longest detected brand that ``name`` starts with, else None."""
        name_lower = name.lower()
        for length in brand_lengths:
            if length <= len(name_lower) and name_lower[:length] in detected_brands:
                return name_lower[:length]
        return None

    # Resolve each distinct name once, then look rows up in the mapping.
    brand_by_name = {name: detect_brand(name) for name in item_names}
    df['brand'] = df['item_name'].apply(
        lambda name: brand_by_name[name] if isinstance(name, str) else None
    )

    total_price = df['total_price']
    best_price = df['best_price']

    # A quantity can only be derived when both prices exist and the divisor
    # is non-zero; every other row keeps the quantity it already has.
    derivable = total_price.notna() & best_price.notna() & best_price.ne(0)
    # A missing unit_type compares as "not each", matching the row-wise `!=`.
    not_each = df['unit_type'].ne('each')

    per_each = total_price / best_price
    per_other_unit = total_price / (best_price * non_each_price_factor)

    quantity = per_each.where(derivable, df['quantity'])
    df['quantity'] = per_other_unit.where(derivable & not_each, quantity)
    return df


In [42]:
import pandas as pd
from collections import Counter, defaultdict
from io import StringIO


# Small hand-written sample used to try out the brand detection.
# It is bound to `sample_df` rather than `df` so that running the notebook
# top to bottom does not replace the full dataset with these 21 rows before
# `df` is processed and written back to the source CSV further down.
csv_data = """item_id,item_name,brand,category,best_price,unit_price,total_price,discount,quantity,unit_type,store_name,location,gender,date,sub_category,transaction_id,customer_id
357091,Balducci Penne Rigati No. 18,B,Pantry,0.35,0.35,1.75,1.6,0.5,Kg,NINAS IGA HAMILTON,73 BEAUMONT St HAMILTON NSW 2303,Male,4/2/2025,Condiments,IGA-7387936,IGA-9DAE9E12FF
356817,Balducci Pennette No. 59,B,Pantry,0.35,0.35,1.75,1.6,0.5,Kg,FLYNN DRIVE IGA,332 FLYNN Dr ALICE SPRINGS NT 870,Male,3/31/2025,Condiments,IGA-E181247,IGA-C702DC74D5
331257,Continental Cup A Soup Classic Chicken Noodle,Continental,Pantry,1.0810971112456174,1.3075796556550356,9.15,0.7567679778719321,3.6,each,IGA CASTLEMAINE,241 BARKER St CASTLEMAINE VIC 3450,Male,03/10/2025,Condiments,IGA-1844548,IGA-4365192975
661555,Mainland Sweet Cinnamon Spreadable,Mainland,"Dairy, Eggs & Fridge",4.596320077843497,5.645629557158511,39.52,1.53,3.45,Kg,IGA CRESWICK,48 ALBERT St CRESWICK VIC 3363,Male,04/07/2025,NA,IGA-6418171,IGA-7582876796
875337,Sabco Kitchen Sponge Cloth,Sabco,Cleaning & Maintenance,2.024341552925762,1.8273355764650705,16.45,NA,1.53,each,CARLOS SUPA IGA GILGANDRA,52 MILLER St GILGANDRA NSW 2827,Male,01/13/2025,Surface Cleaners,IGA-4360043,IGA-8386012397
20000005528,Dole Pineapple Chunks In Juice,Dole Pineapple,Pantry,1.1125344824097119,1.0694330429913028,1.07,NA,4.3,Kg,IGA PUTNEY,240 MORRISON Rd PUTNEY NSW 2112,Female,10/24/2025,Condiments,IGA-3463882,IGA-6274867254
84672,McKenzie's Ground Cinnamon,McKenzie's,Pantry,4.406025037751533,5.564740985163115,50.08,-3.2,1.3,Kg,IGA PENSHURST,11 Bridge St PENSHURST NSW 2222,Female,01/04/2025,Condiments,IGA-5562825,IGA-6915098402
20000006749,Old El Paso Mexican Burrito Tortillas,Old,Pantry,0.9122968163494953,1.1001025616246392,2.2,NA,2.56,each,IGA LYNDHURST,Shop 2 / 43 LYNDHURST Dr BOMADERRY NSW 2541,Male,06/14/2025,Condiments,IGA-7001799,IGA-5521801162
200406,Uncle Tobys Oats Quick Sachets Breakfast Cereal Original,U,Pantry,0.6511991968899965,0.7548701840676729,6.79,0.4558394378229975,1.12,each,IGA NGUNNAWAL,4 RILEY St NGUNNAWAL ACT 2913,Male,06/29/2025,Condiments,IGA-7340092,IGA-8493062851
190000000000,Wonka Runts Fruit Candy,Wonka,Snacks & Confectionery,0.8993628539973869,1.0769995927642164,9.69,NA,2.47,Kg,IGA WEST DUBBO,38-40 VICTORIA St WEST DUBBO NSW 2830,Male,10/09/2025,Crackers,IGA-5280737,IGA-2809460108
22963,Black & Gold Creamed Corn,B,Pantry,0.5,0.5,1.5,NA,2.61,Kg,SUPA IGA NORTH MELBOURNE,20-26 ERROLL St NORTH VIC 3051,Male,02/06/2025,Condiments,IGA-8955996,IGA-9674856601
20000006338,Bref Power Active Toilet Cleaner Gel Flowers,B,Cleaning & Maintenance,0.7430417067138478,0.6424341967863665,3.85,0.5201291946996934,1.85,L,KHANS IGA COONAMBLE,43 ABERFORD St COONAMBLE NSW 2829,Female,08/26/2025,Surface Cleaners,IGA-7795866,IGA-6429108095
14512,Devondale 100% Pure Full Cream Long Life Milk,De,"Dairy, Eggs & Fridge",1.1387446759874003,0.8423440381509921,0.84,NA,0.83,NA,IGA DULWICH HILL,398-400 NEW CANTERBURY Rd DULWICH HILL NSW 2203,Female,07/06/2025,NA,IGA-4171306,IGA-4717471165
160000000000,Maggi Mexican Nachos Recipe Base,Maggi,Pantry,2.242291697538458,1.7584910890355394,1.76,-2.25,4.11,Kg,IGA XPRESS HAWKS NEST,46 TULOA Ave HAWKS NEST NSW 2324,Female,06/07/2025,Condiments,IGA-5256160,IGA-7691241311
261753,"Lindt Excellence Cranberry, Almond & Hazelnut Dark Chocolate",Lindt,Snacks & Confectionery,1.8237139524088706,1.506815046374023,3.01,0.8,3.14,Kg,IGA WOONONA,4 to 10 RUSSELL St WOONONA NSW 2517,Male,12/28/2025,Crackers,IGA-6484185,IGA-9412538777
190000000000,Baker's Oven Garlic Bread Twin Pack,B,Pantry,1.0868532514516265,1.0581682629198665,5.29,NA,1.33,Kg,IGA WOONONA,4 to 10 RUSSELL St WOONONA NSW 2517,Male,11/29/2025,Condiments,IGA-4785496,IGA-0690121777
160000000000,Schweppes Agrum Collection Blood Orange,Schweppes,Drinks,1.5106468833284452,1.3110573707861635,7.87,1.0574528183299117,1.03,each,CARLOS IGA MT TAMBORINE,27 MAIN WESTERN Rd MOUNT QLD 4272,Male,02/21/2025,Juices,IGA-7694560,IGA-8396571553
656160,Priya Natures Sandalwood Soap,Priya,Beauty & Personal Care,1.9460672767600045,2.518233579649353,5.04,NA,0.76,Kg,SUPA IGA GULGONG,90-100 MAYNE St GULGONG NSW 2852,Male,04/29/2025,NA,IGA-9501414,IGA-4072201780
50372,Sunbeam Currants,Sunbeam,Pantry,0.5826751642164308,0.5738837574264432,1.15,0.40787261495150157,4.92,Kg,IGA SANCTUARY POINT,14 PARADISE BEACH Rd SANCTUARY POINT NSW 2540,Male,07/24/2025,Condiments,IGA-1183623,IGA-1585271234
20000001755,Fairy Cooking Margarine,Fairy,"Dairy, Eggs & Fridge",0.7083956171125174,0.6132663378973228,4.91,NA,4.21,Kg,IGA XPRESS EAST BRIGHTON,765A HAWTHORN Rd EAST BRIGHTON VIC 3165,Male,09/06/2025,NA,IGA-6581269,IGA-7990356823
180000000000,Cadbury Reindeer Cakes,Cadbury,Snacks & Confectionery,4.390844282516744,3.688912851256818,3.69,NA,3.4,Kg,IGA KOOTINGAL,5 DENMAN Ave KOOTINGAL NSW 2352,Female,06/30/2025,Crackers,IGA-5832649,IGA-1229474007"""

sample_df = pd.read_csv(StringIO(csv_data))
sample_df.head(15)


Unnamed: 0,item_id,item_name,brand,category,best_price,unit_price,total_price,discount,quantity,unit_type,store_name,location,gender,date,sub_category,transaction_id,customer_id
0,357091,Balducci Penne Rigati No. 18,B,Pantry,0.35,0.35,1.75,1.6,0.5,Kg,NINAS IGA HAMILTON,73 BEAUMONT St HAMILTON NSW 2303,Male,4/2/2025,Condiments,IGA-7387936,IGA-9DAE9E12FF
1,356817,Balducci Pennette No. 59,B,Pantry,0.35,0.35,1.75,1.6,0.5,Kg,FLYNN DRIVE IGA,332 FLYNN Dr ALICE SPRINGS NT 870,Male,3/31/2025,Condiments,IGA-E181247,IGA-C702DC74D5
2,331257,Continental Cup A Soup Classic Chicken Noodle,Continental,Pantry,1.081097,1.30758,9.15,0.756768,3.6,each,IGA CASTLEMAINE,241 BARKER St CASTLEMAINE VIC 3450,Male,03/10/2025,Condiments,IGA-1844548,IGA-4365192975
3,661555,Mainland Sweet Cinnamon Spreadable,Mainland,"Dairy, Eggs & Fridge",4.59632,5.64563,39.52,1.53,3.45,Kg,IGA CRESWICK,48 ALBERT St CRESWICK VIC 3363,Male,04/07/2025,,IGA-6418171,IGA-7582876796
4,875337,Sabco Kitchen Sponge Cloth,Sabco,Cleaning & Maintenance,2.024342,1.827336,16.45,,1.53,each,CARLOS SUPA IGA GILGANDRA,52 MILLER St GILGANDRA NSW 2827,Male,01/13/2025,Surface Cleaners,IGA-4360043,IGA-8386012397
5,20000005528,Dole Pineapple Chunks In Juice,Dole Pineapple,Pantry,1.112534,1.069433,1.07,,4.3,Kg,IGA PUTNEY,240 MORRISON Rd PUTNEY NSW 2112,Female,10/24/2025,Condiments,IGA-3463882,IGA-6274867254
6,84672,McKenzie's Ground Cinnamon,McKenzie's,Pantry,4.406025,5.564741,50.08,-3.2,1.3,Kg,IGA PENSHURST,11 Bridge St PENSHURST NSW 2222,Female,01/04/2025,Condiments,IGA-5562825,IGA-6915098402
7,20000006749,Old El Paso Mexican Burrito Tortillas,Old,Pantry,0.912297,1.100103,2.2,,2.56,each,IGA LYNDHURST,Shop 2 / 43 LYNDHURST Dr BOMADERRY NSW 2541,Male,06/14/2025,Condiments,IGA-7001799,IGA-5521801162
8,200406,Uncle Tobys Oats Quick Sachets Breakfast Cerea...,U,Pantry,0.651199,0.75487,6.79,0.455839,1.12,each,IGA NGUNNAWAL,4 RILEY St NGUNNAWAL ACT 2913,Male,06/29/2025,Condiments,IGA-7340092,IGA-8493062851
9,190000000000,Wonka Runts Fruit Candy,Wonka,Snacks & Confectionery,0.899363,1.077,9.69,,2.47,Kg,IGA WEST DUBBO,38-40 VICTORIA St WEST DUBBO NSW 2830,Male,10/09/2025,Crackers,IGA-5280737,IGA-2809460108


In [45]:
# Fill in the brand column and recompute quantities. The function also
# modifies the frame it is given, so `df` is updated either way.
df = extract_and_fill_brands(df)

# Leave the preview as the cell's last expression so the notebook renders it
# as a table; print() wraps wide frames into hard-to-read plain text.
df.head(15)

         item_id                                          item_name  \
0         331257      Continental Cup A Soup Classic Chicken Noodle   
1         661555                 Mainland Sweet Cinnamon Spreadable   
2         875337                         Sabco Kitchen Sponge Cloth   
3    20000005528                     Dole Pineapple Chunks In Juice   
4          84672                         McKenzie's Ground Cinnamon   
5    20000006749              Old El Paso Mexican Burrito Tortillas   
6         200406  Uncle Tobys Oats Quick Sachets Breakfast Cerea...   
7   190000000000                            Wonka Runts Fruit Candy   
8          22963                          Black & Gold Creamed Corn   
9    20000006338       Bref Power Active Toilet Cleaner Gel Flowers   
10         14512      Devondale 100% Pure Full Cream Long Life Milk   
11  160000000000                   Maggi Mexican Nachos Recipe Base   
12        261753  Lindt Excellence Cranberry, Almond & Hazelnut ...   
13  19

In [46]:
# Persist the processed frame without its index column.
# NOTE: this is the same path the first cell reads from, so the source
# file is overwritten.
df.to_csv(path_or_buf='../IGA/synthetic_data_IGA_cleaned_200k.csv', index=False)