In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
def remove_plural_s(ingredients, letters_to_replace, replace_with):
    """
    Lowercase every ingredient and rewrite a trailing plural suffix.

    The suffix is compared case-insensitively. Two modes are selected by
    ``letters_to_replace``:

    * ``"s"``: the trailing "s" is replaced by ``replace_with`` only when the
      ingredient is longer than one character and the character before the
      "s" is not a vowel or another "s" ("eggs" -> "egg", while "tomatoes",
      "molasses" and "asparagus" are left untouched).
    * any other suffix: the suffix is replaced by ``replace_with`` when the
      ingredient ends with it and is strictly longer than it
      ("ies" -> "y" turns "berries" into "berry").

    Args:
        ingredients: Iterable of ingredient strings.
        letters_to_replace: Suffix to look for at the end of each ingredient.
        replace_with: Text substituted for the suffix when it is rewritten.

    Returns:
        A new list of lowercased ingredient strings, in the input order.
    """
    # Ingredients are lowercased below, so the suffix must be lowercased too,
    # otherwise an uppercase suffix could never match.
    suffix = letters_to_replace.lower()
    protected_before_s = ("a", "e", "i", "o", "u", "s")

    list_ingredients = []
    for ingredient in ingredients:
        ingredient = ingredient.lower()
        rewrite = len(ingredient) > len(suffix) and ingredient.endswith(suffix)
        if rewrite and suffix == "s" and ingredient[-2] in protected_before_s:
            # Words such as "tomatoes" or "molasses" are not plain "+s" plurals.
            rewrite = False
        if rewrite:
            # Slice by explicit length: `ingredient[:-0]` would be the empty
            # string and wipe the ingredient when the suffix is empty.
            stem = ingredient[:len(ingredient) - len(suffix)]
            list_ingredients.append(stem + replace_with)
        else:
            list_ingredients.append(ingredient)
    return list_ingredients

def counting(data_list, to_count):
    """
    Tally the items, across all inner lists, that end with a given suffix.

    Args:
        data_list: Iterable of iterables of strings.
        to_count: Suffix an item must end with to be counted. An empty
            string matches every item.

    Returns:
        A Counter mapping each matching item to its number of occurrences.
    """
    return Counter(
        entry
        for group in data_list
        for entry in group
        if entry.endswith(to_count)
    )

def remove_words(ingredient_list, words_to_remove):
    """
    Removes specified words from ingredient strings within a list.

    Each ingredient is split on whitespace and every word whose lowercase
    form is in ``words_to_remove`` is dropped. Ingredients left with no
    words are omitted from the result. Matching is per word, so an entry
    of ``words_to_remove`` that contains whitespace never matches.

    Args:
        ingredient_list: A list of ingredient strings.
        words_to_remove: A set of words to remove (case-insensitive).

    Returns:
        A new list with modified ingredient strings; surviving words keep
        their original casing and are re-joined with single spaces.
    """
    # Lowercase the removal set once so the comparison is case-insensitive on
    # both sides, as the contract above states.
    blocked = {word.lower() for word in words_to_remove}

    modified_list = []
    for ingredient in ingredient_list:
        kept_words = [word for word in ingredient.split() if word.lower() not in blocked]
        # Skip empty results here rather than re-filtering the whole list on
        # every iteration, which made the original loop quadratic.
        if kept_words:
            modified_list.append(" ".join(kept_words))
    return modified_list

In [None]:
# Load the raw recipe table. The preview below shows an `id` column and an
# `ingredients` column holding a list of ingredient strings per row.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../Data/Raw Datasets/food.pkl')
df.head()

Unnamed: 0,id,ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,"[water, vegetable oil, wheat, salt]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
# Every string ends with "", so an empty suffix tallies all raw ingredients.
counting(data_list = df['ingredients'], to_count="")

Counter({'salt': 364164,
         'eggs': 333939,
         'tsp salt': 260887,
         'c sugar': 245022,
         'teaspoon salt': 240383,
         'butter': 206049,
         'sugar': 176361,
         'water': 143028,
         'onion': 138127,
         'egg': 135707,
         'olive oil': 132838,
         'tsp vanilla': 127328,
         'c flour': 126317,
         'c milk': 111533,
         'garlic cloves': 109932,
         'pepper': 103049,
         'cup sugar': 98631,
         'milk': 96124,
         'flour': 95412,
         'c water': 82444,
         'all-purpose flour': 70577,
         'garlic': 70182,
         'tsp baking powder': 68563,
         'brown sugar': 65348,
         'cup water': 62011,
         'baking powder': 61120,
         'tsp pepper': 56204,
         'lemon juice': 56197,
         'tsp cinnamon': 54918,
         'tablespoons olive oil': 54420,
         'black pepper': 52711,
         'onions': 52589,
         'cup milk': 51397,
         'teaspoon pepper': 51353,

### We can see a large number of words that are unnecessary for our goal
- we will build a (large) list of words to remove by studying the dataset
- we also don't want staples like "flour" and "salt" skewing the predictions; nobody wants to hear "You know what goes really well with chicken? Flour!"

In [20]:
# Lowercase tokens that remove_words strips out of ingredient strings:
# measurement units, quantities, preparation and packaging terms, and common
# staples such as 'salt' and 'flour'.
# NOTE(review): remove_words compares one whitespace-split word at a time, so
# entries containing a space ('° °', 'baking soda', 'black pepper',
# 'coconut oil', 'garlic cloves', 'sour cream') can never match — confirm
# whether whole-phrase removal was intended.
words_to_drop = {
    '° °', '&', '1%', '2%', 'a', 'about', 'accent', 'according', 'add', 'added', 'additional', 'all',
    'all-purpose', 'allpurpose', 'also', 'amount', 'amp', 'and', 'another', 'any', 'approx',
    'approximately', 'artificial', 'as', 'at', 'available', 'bag', 'bags', 'baking', 'baking soda',
    'bar', 'basic', 'beaten', 'beaters', 'best', 'better', 'big', 'bite', 'bitesize', 'bitesized', 'bits',
    'black pepper', 'blanched', 'blend', 'block', 'boiled', 'boiling', 'bone', 'boned', 'boneless',
    'bottle', 'bottled', 'bottles', 'bottom', 'box', 'boxes', 'brewed', 'broiler-fryer', 'broken',
    'brown', 'bsp', 'bunch', 'bunches', 'butter', 'buttered', 'buy', 'c', 'cake', 'called', 'can',
    'canned', 'cans', 'carton', 'center', 'chilled', 'chip', 'choice', 'chopped', 'chunk', 'clean',
    'cleaned', 'clear', 'club', 'cm', 'coarsely', 'coarse', 'coating', 'coconut oil', 'cocktail', 'cold',
    'colored', 'coloring', 'condensed', 'confectioners', "confectioners'", 'container', 'cooked', 'cooking', 'cornflour',
    'cornstarch', 'count', 'country', 'cover', 'crisp', 'crumbled', 'crumbles', 'crunchy', 'crushed',
    'crystallized', 'ct', 'cube', 'cubed', 'cubes', 'cup', 'cups', 'cut', 'dairy', 'dash', 'dashes', 'day',
    'dayold', 'de', 'deboned', 'defrosted', 'depending', 'desired', 'desseded', 'deveined', 'devil', 'diagonal',
    'diagonally', 'diameter', 'dice', 'diced', 'diet', 'dinner', 'dip', 'discarded', 'dish', 'dissolved',
    'divided', 'dont', 'dough', 'doz', 'dozen', 'drain', 'drained', 'dream', 'dried', 'drizzle', 'drops', 'dry',
    'dusting', 'ea', 'each', 'ears', 'either', 'ends', 'enough', 'envelope', 'equal', 'equipment', 'equivalent',
    'etc', 'evoo', 'exess', 'extract', 'extra', 'extravirgin', 'f', 'farm', 'fat', 'fatfree', 'favorite', 'filling', 'fine', 'find',
    'finely', 'firm', 'firmly', 'fl', 'flat', 'float', 'flour', 'foil', 'food', 'foods', 'for', 'found', 'four', 'free',
    'fresh', 'freshly', 'frozen', 'frzn', 'fry', 'fryer', 'frying', 'full', 'fully', 'g', 'gal', 'gallon', 'gallons', 'garlic',
    'garlic cloves', 'garnish', 'gelatin', 'generous', 'get', 'gluten', 'gm', 'goodquality', 'gram', 'grams', 'granulated', 'grated', 'great',
    'greasing', 'grill', 'grilled', 'grm', 'ground', 'halved', 'halves', 'handful', 'harcooked', 'hard', 'hard-boiled', 'hardboiled',
    'head', 'healthy', 'heat', 'heated', 'heaping', 'hellmanns', 'hidden', 'homemade', 'hot', 'hulled', 'i', 'ice', 'if', 'in', 'inch',
    'inches', 'inchthick', 'including', 'ingredient', 'ingredients', 'instant', 'into', 'jar', 'jarred', 'jars', 'juiced', 'julienne',
    'julienned', 'k', 'kernels', 'key', 'kg', 'kind', 'kitchen', 'kraft', 'l', 'large', 'layer', 'lb', 'lbs', 'lean', 'least', 'leave',
    'leftover', 'lengthwise', 'lengths', 'less', 'lesssodium', 'level', 'lg', 'lightly', 'like', 'liter', 'liters', 'little', 'loosely', 'low', 'lowfat',
    'lowsodium', 'lrg', 'lukewarm', 'lump', 'made', 'make', 'makes', 'margarine', 'mashed', 'med', 'medium', 'mediumsize', 'mediumsized',
    'melted', 'mg', 'milligram', 'milligrams', 'milliliter', 'milliliters', 'minced', 'mini', 'miniature', 'minute', 'minutes', 'mix',
    'ml', 'mms', 'more', 'mrs', 'much', 'n', 'necessary', 'need', 'needed', 'new', 'no', 'none', 'nonfat', 'nonstick', 'not', 'note', 'notes',
    'nutritional', 'o', 'of', 'oil', 'on', 'one', 'ones', 'optional', 'or', 'organic', 'ounce', 'ounces', 'oven', 'overnight', 'oz',
    'oz.', 'pack', 'package', 'packages', 'packed', 'packet', 'packets', 'page', 'partially', 'part-skim', 'parts', 'paste', 'patted',
    'pcs', 'peeled', 'per', 'philadelphia', 'pie', 'piece', 'pieces', 'pinch', 'pint', 'pints', 'pitted', 'pkg', 'pkt', 'plain', 'plus',
    'possibly', 'pot', 'pound', 'pounds', 'powder', 'powdered', 'precooked', 'prefer', 'preferably', 'preference', 'prepared', 'preserves',
    'pressed', 'process', 'processed', 'processor', 'product', 'pt', 'pureed', 'purpose', 'pwdr', 'qt', 'quart', 'quartered', 'quarters',
    'quarts', 'quick-cooking', 'raw', 'ready', 'recommended', 'reduced', 'reduced-sodium', 'reducedsodium', 'refrigerated', 'regular', 'remove', 'reserved',
    'room', 'rotel', 'rough', 'roughly', 'rounded', 'rounds', 'rubbed', 'safeway', 'salt', 'salted', 'san', 'sanwich', 'sauce', 'save',
    'scalded', 'scraped', 'scrubbed', 'sea', 'seasoned', 'seasons', 'sec', 'sections', 'see', 'seeded', 'self-rising', 'seperated', 'separated',
    'serving', 'several', 'shaved', 'sheet', 'sheets', 'shelled', 'shredded', 'shucked', 'side', 'sifted', 'simple', 'size', 'sized', 'skinless',
    'skinned', 'slice', 'sliced', 'slices', 'slightly', 'slivered', 'sm', 'small', 'smashed', 'snipped', 'soaked', 'softened', 'softend', 'sodium',
    'solid', 'soup', 'sour cream', 'splash', 'spray', 'sprig', 'sprigs', 'sprinkle', 'sprinkling', 'sq', 'square', 'squares', 'squeezed', 'steamed',
    'stem', 'stemmed', 'stewing', 'stick', 'sticks', 'storebought', 'stove', 'strained', 'string', 'strip', 'strips', 'stuffed', 'sub', 'substitute',
    'substituted', 'sugar', 'sweetener', 't', 'tablespoon', 'tablespoons', 'taste', 'tb', 'tbl', 'tbls', 'tbs', 'tbsp', 'teaspoon', 'teaspoons', 'temp',
    'temperature', 'thawed', 'thick', 'thickly', 'thin', 'thinly', 'thread', 'three', 'thru', 'time', 'tiny', 'tips', 'to', 'toasted', 'together', 'tomato',
    'toppings', 'tops', 'total', 'tots', 'triple', 'tsp', 'tub', 'two', 'type', 'unbaked', 'unbeaten', 'uncle', 'uncooked', 'undiluted', 'undrained',
    'unflavored', 'unpeeled', 'unsalted', 'unsifted', 'up', 'use', 'used', 'using', 'usually', 'v', 'variety', 'virgin', 'want', 'warm', 'warmed', 'washed',
    'water', 'wedge', 'weight', 'well', 'whatever', 'whipping', 'white', 'whites', 'whole', 'wide', 'with', 'without', 'wooden', 'work', 'would', 'x',
    'yeast', 'yolks', 'you', 'your', 'zested',
}

In [21]:
# Strip the noise words from every recipe's ingredient list. Extra keyword
# arguments given to Series.apply are forwarded to the function.
df['clean_ingredients'] = df['ingredients'].apply(
    remove_words, words_to_remove=words_to_drop
)
df.head()


Unnamed: 0,id,ingredients,clean_ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[pepper, tomatoes, black pepper, thyme, eggs, ..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, mayonaise, green chilies, chick..."
3,22213,"[water, vegetable oil, wheat, salt]","[vegetable, wheat]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cayenne pepper, onion..."


### Plural forms ending in "s" split the counts for the same ingredient
 - I don't need "eggs" counted in one place and "egg" in another

In [22]:
# Singularize simple "+s" plurals, then list what still ends in "s".
df['clean_ingredients'] = df['clean_ingredients'].apply(
    remove_plural_s, letters_to_replace="s", replace_with=""
)
counting(df['clean_ingredients'], "s")

Counter({'cloves': 359367,
         'tomatoes': 179838,
         'potatoes': 96656,
         'red pepper flakes': 48734,
         'strawberries': 41464,
         'chives': 36616,
         'bananas': 34096,
         'bay leaves': 32616,
         'apples': 30265,
         'cranberries': 26999,
         'basil leaves': 24514,
         'peas': 24470,
         'blueberries': 24419,
         'green chilies': 23760,
         'tortillas': 23751,
         'black olives': 23027,
         'molasses': 21742,
         'cherry tomatoes': 19026,
         'sweet potatoes': 18970,
         'dates': 17004,
         'raspberries': 14787,
         'parsley flakes': 14413,
         'cilantro leaves': 13589,
         'corn tortillas': 13466,
         'mint leaves': 13396,
         'thyme leaves': 13257,
         'plum tomatoes': 12392,
         'peaches': 12305,
         'mandarin oranges': 11921,
         'asparagus': 11540,
         'stewed tomatoes': 10932,
         'parsley leaves': 10415,
         'red

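### We still have other plurals
 - the "s" pass skips words whose second-to-last letter is a vowel, so "-ies" and "-oes" plurals are still present
 - let's convert "ies" to "y"
 - let's convert "oes" to "o"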

In [23]:
# Rewrite a trailing "ies" as "y" (e.g. "strawberries" -> "strawberry").
df['clean_ingredients'] = df['clean_ingredients'].apply(
    remove_plural_s, letters_to_replace="ies", replace_with="y"
)
df.head()

Unnamed: 0,id,ingredients,clean_ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[pepper, tomatoes, black pepper, thyme, egg, g..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, mayonaise, green chily, chicken ..."
3,22213,"[water, vegetable oil, wheat, salt]","[vegetable, wheat]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cayenne pepper, onion,..."


In [24]:
# Sanity check: an empty Counter means no ingredient still ends in "ies".
counting(df['clean_ingredients'], "ies")

Counter()

In [25]:
# Rewrite a trailing "oes" as "o" (e.g. "tomatoes" -> "tomato").
df['clean_ingredients'] = df['clean_ingredients'].apply(
    remove_plural_s, letters_to_replace="oes", replace_with="o"
)
df.head()

Unnamed: 0,id,ingredients,clean_ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomato, ..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[pepper, tomato, black pepper, thyme, egg, gre..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, mayonaise, green chily, chicken ..."
3,22213,"[water, vegetable oil, wheat, salt]","[vegetable, wheat]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cayenne pepper, onion,..."


In [26]:
# Sanity check: an empty Counter means no ingredient still ends in "oes".
counting(df['clean_ingredients'], "oes")

Counter()

In [27]:
# Tally every cleaned ingredient (an empty suffix matches all strings) and
# keep the result in `counters`.
counters = counting(data_list = df['clean_ingredients'], to_count="")
counters

Counter({'egg': 847536,
         'onion': 696568,
         'pepper': 505680,
         'olive': 433019,
         'milk': 421965,
         'vanilla': 376812,
         'cloves': 359367,
         'black pepper': 288154,
         'cinnamon': 261298,
         'lemon juice': 229158,
         'soda': 217104,
         'tomato': 179838,
         'parsley': 163236,
         'sour cream': 161162,
         'carrot': 157705,
         'cream cheese': 155799,
         'parmesan cheese': 145850,
         'clove': 138487,
         'ginger': 132896,
         'vegetable': 125912,
         'green onion': 120729,
         'chicken breast': 117080,
         'celery': 116783,
         'beef': 116102,
         'cheddar cheese': 114880,
         'pecan': 111680,
         'potato': 111569,
         'soy': 110857,
         'nutmeg': 110276,
         'mayonnaise': 110059,
         'mushroom': 104583,
         'chicken broth': 103901,
         'oregano': 103107,
         'honey': 101815,
         'cumin': 101315,
 

In [28]:
# Maps ingredient spelling variants to a single canonical name.
# NOTE(review): not referenced by any cell visible in this section — confirm
# it is applied elsewhere or remove it.
words_to_replace = {
    'halfandhalf': 'half-and-half cream',
    'half half': 'half-and-half cream',
    'half-and-half': 'half-and-half cream',
    'lowfat sour cream': 'light sour cream',
    'marshmallow creme': 'marshmallow cream',
    'egg egg': 'egg',

}

In [29]:
# (rows, columns) of the frame at this point, for comparison with later cells.
df.shape

(3064844, 3)

In [30]:
# Drop every row that contains a missing value in any column.
df = df.dropna()

In [31]:
# (rows, columns) of the frame at this point; compare with the earlier shape.
df.shape

(3064844, 3)

In [32]:
# Select rows whose `clean_ingredients` value is not a Python list; an empty
# result means every row holds a list.
non_list_rows = df[df['clean_ingredients'].apply(lambda x: not isinstance(x, list))]
non_list_rows

Unnamed: 0,id,ingredients,clean_ingredients


In [33]:
# Rebuild `counters` from the current frame: an empty suffix matches every
# ingredient string, so this is a full frequency table.
counters = counting(data_list = df['clean_ingredients'], to_count="")
counters


Counter({'egg': 847536,
         'onion': 696568,
         'pepper': 505680,
         'olive': 433019,
         'milk': 421965,
         'vanilla': 376812,
         'cloves': 359367,
         'black pepper': 288154,
         'cinnamon': 261298,
         'lemon juice': 229158,
         'soda': 217104,
         'tomato': 179838,
         'parsley': 163236,
         'sour cream': 161162,
         'carrot': 157705,
         'cream cheese': 155799,
         'parmesan cheese': 145850,
         'clove': 138487,
         'ginger': 132896,
         'vegetable': 125912,
         'green onion': 120729,
         'chicken breast': 117080,
         'celery': 116783,
         'beef': 116102,
         'cheddar cheese': 114880,
         'pecan': 111680,
         'potato': 111569,
         'soy': 110857,
         'nutmeg': 110276,
         'mayonnaise': 110059,
         'mushroom': 104583,
         'chicken broth': 103901,
         'oregano': 103107,
         'honey': 101815,
         'cumin': 101315,
 

In [40]:
# Frequency threshold: ingredients seen this many times or fewer are collected.
num_to_drop = 1500
# Despite its name this is a set of ingredient strings, not a Counter; the
# comparison is inclusive (count <= num_to_drop).
filtered_counter = {item for item, count in counters.items() if count <= num_to_drop}
filtered_counter

{'penne rigatoni pasta al dente',
 'syrian rice',
 'spices dill',
 'easy melt cheese',
 'start half the it sweet',
 'bonito flakes preflaked katsuobushi',
 'flesh fish snapper dorado grouper',
 'icing sugarwater',
 'eastern black walnut',
 'chicken stock suggest crock chicken stock',
 'ripe pears bartlett anjou half',
 'duncan hines devils pan',
 'cornish hens chicken skin',
 'cherry red raspberry',
 'periperi rub',
 'assorted vegetable dipper',
 'texture bread',
 'corn liquid removed corn',
 'bacon grease canola ham dripping',
 'tangerine clementines belly',
 'bell peppers eighth',
 'bread crumbs eggs dipping',
 'cream style cream cheese strawberry',
 'japaneseeggplant',
 'penzeys fennel seed',
 'asian guacamole',
 'pasta frolla short crust pastry',
 'chicken broth listed stuffing liquid',
 'comice bartlett pears cored eighth',
 'berries husband prefers blueberry',
 'chicken eg drumstick thighs chicken fillet',
 'shrimp heads red',
 'cloves pernil',
 'dates apricots fruit',
 'good saf

In [41]:
def remove_items(data_list, items_to_drop):
    """
    Return the entries of ``data_list`` that are not in ``items_to_drop``.

    Args:
        data_list: Iterable of items to filter.
        items_to_drop: Container supporting ``in`` with the items to exclude.

    Returns:
        A new list of the surviving items, in their original order.
    """
    return [entry for entry in data_list if entry not in items_to_drop]




In [42]:
# Drop the collected low-frequency ingredients from every recipe. Extra
# keyword arguments given to Series.apply are forwarded to the function.
df['clean_ingredients'] = df['clean_ingredients'].apply(
    remove_items, items_to_drop=filtered_counter
)
df.head()

Unnamed: 0,id,ingredients,clean_ingredients
0,10259,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomato, ..."
1,25693,"[plain flour, ground pepper, salt, tomatoes, g...","[pepper, tomato, black pepper, thyme, egg, gre..."
2,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[egg, pepper, mayonaise, green chily, chicken ..."
3,22213,"[water, vegetable oil, wheat, salt]","[vegetable, wheat]"
4,13162,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallot, cayenne pepper, onion,..."


In [43]:
# Full frequency table of the current `clean_ingredients` column (an empty
# suffix matches every string).
counting(data_list = df['clean_ingredients'], to_count="")

Counter({'egg': 847536,
         'onion': 696568,
         'pepper': 505680,
         'olive': 433019,
         'milk': 421965,
         'vanilla': 376812,
         'cloves': 359367,
         'black pepper': 288154,
         'cinnamon': 261298,
         'lemon juice': 229158,
         'soda': 217104,
         'tomato': 179838,
         'parsley': 163236,
         'sour cream': 161162,
         'carrot': 157705,
         'cream cheese': 155799,
         'parmesan cheese': 145850,
         'clove': 138487,
         'ginger': 132896,
         'vegetable': 125912,
         'green onion': 120729,
         'chicken breast': 117080,
         'celery': 116783,
         'beef': 116102,
         'cheddar cheese': 114880,
         'pecan': 111680,
         'potato': 111569,
         'soy': 110857,
         'nutmeg': 110276,
         'mayonnaise': 110059,
         'mushroom': 104583,
         'chicken broth': 103901,
         'oregano': 103107,
         'honey': 101815,
         'cumin': 101315,
 