# Pre-Processing Data


In [1]:
# Imports
import pandas as pd
import ast

## Convert `ingr_map.pkl` into a CSV file


In [2]:
# Load ingr_map.pkl as df.
# NOTE: unpickling executes arbitrary code, so this must only ever read a trusted local file.
ingredient_map_raw = pd.read_pickle("data/ingr_map.pkl")

# Build the id/name map in one chain so every step works on a fresh frame.
# (Calling inplace=True methods on the column slice can raise SettingWithCopyWarning
# on pandas versions without Copy-on-Write.)
df_ingredient_map = (
    ingredient_map_raw[["replaced", "id"]]                      # only keep ingredient name and ids
    .drop_duplicates(subset="replaced", keep="first")           # one row per replaced name
    .reset_index(drop=True)
    .rename(columns={"replaced": "ingredient_names", "id": "ingredient_ids"})
    .assign(
        # Fix formatting: lower-case and trim surrounding whitespace
        ingredient_names=lambda frame: frame["ingredient_names"].str.lower().str.strip()
    )
)

# Display ingredient_map
df_ingredient_map

Unnamed: 0,ingredient_names,ingredient_ids
0,lettuce,4308
1,french vanilla pudding and pie filling mix,2744
2,stove top stuffing mix,6843
3,cream cheese,1910
4,cheddar,1168
...,...,...
8018,soybean,6702
8019,goose,3318
8020,ajwain,47
8021,brinjal,750


## Clean ingredients

In [3]:
# Whole-name corrections: mangled ingredient name -> intended ingredient name
ingredient_corrections = {
    # names that lost their trailing "s"
    "baby octopu": "baby octopus",
    "sea bas": "sea bass",
    "ketjap mani": "ketjap manis",
    "asparagu": "asparagus",
    # names with a garbled ending or spelling
    "tia marium": "tia maria",
    "pak chous": "bok choy",
    "thin spaghettus": "thin spaghetti",
    "truvium": "truvia",
    "endife": "endive",
    "shichimi-togarashi": "shichimi togarashi",
    # names missing the word "flavor"
    "chocolateing": "chocolate flavoring",
    "bananaing": "banana flavoring",
    # names reduced to a fragment of the original
    "s": "nabisco triscuits",
    "yel": "yellow sugar",
    "s quik": "nestles quik",
    "butt": "pork butt",
    "piece": "reese's pieces",
    "mo glutamate": "msg",
}

# Swap every name that exactly matches a key of the corrections dictionary for its fixed form
df_ingredient_map['ingredient_names'] = df_ingredient_map['ingredient_names'].replace(
    ingredient_corrections
)

In [4]:
# Dictionary of substrings to replace within ingredient names.
# Replacement values are kept lower-case so they stay consistent with the
# lower-cased `ingredient_names` column they are written into.
substring_corrections = {
    'flmy': 'flour',
    'olife': 'olive',
    'chily': 'chili',
    "s cocoa powder": "hershey's cocoa powder",
    'ature': 'miniature',
    'of fresh mint': 'fresh mint',
    'young roasting chicken': 'roasting chicken',
    "m&m'": 'm&m'
}

# Apply substring corrections; each step returns a new Series that is written back at the end
cleaned_names = df_ingredient_map['ingredient_names']
for old_substring, new_substring in substring_corrections.items():
    if old_substring == 'ature':
        # Expand 'ature' only when it is not preceded by a letter, so words that merely
        # contain it ("nature", an intact "miniature", "temperature") are left alone.
        # Assumes the broken token always starts a word - TODO confirm against the data.
        cleaned_names = cleaned_names.str.replace(
            r'(?<![A-Za-z])ature',
            lambda _match, replacement=new_substring: replacement,
            regex=True,
        )
    else:
        cleaned_names = cleaned_names.str.replace(old_substring, new_substring, regex=False)

# Replace ingredients with commas to retain only the main ingredient (first part before comma)
cleaned_names = cleaned_names.str.split(',').str[0]

# Strip trailing "s" characters, but never from the hand-curated names: rstrip('s') would
# turn "asparagus" back into "asparagu" and "sea bass" into "sea ba", undoing the
# whole-name corrections applied earlier.
# NOTE(review): rstrip('s') removes *every* trailing "s" of the remaining names as well
# (a name such as "swiss" would become "swi") - consider a real singularizer.
curated_names = set(ingredient_corrections.values())
cleaned_names = cleaned_names.where(
    cleaned_names.isin(curated_names),
    cleaned_names.str.rstrip('s'),
)

# Drop "'s" at the beginning of ingredient names
cleaned_names = cleaned_names.str.replace(r"^'s\s*", '', regex=True)

df_ingredient_map['ingredient_names'] = cleaned_names

In [5]:
# Write the cleaned ingredient map to disk without the row index
df_ingredient_map.to_csv(path_or_buf="data/ingredient_map.csv", index=False)

## Add ingredient names into PP_recipes


In [6]:
# Read the cleaned ingredient map and the pre-processed recipes
ingredient_map = pd.read_csv('data/ingredient_map.csv')
recipes = pd.read_csv('data/PP_recipes.csv')
# recipes = recipes.drop(columns=['i'])

# Lookup table: ingredient id -> ingredient name (a later duplicate id overrides an earlier one)
id_to_name = {
    ingredient_id: ingredient_name
    for ingredient_id, ingredient_name in zip(
        ingredient_map['ingredient_ids'], ingredient_map['ingredient_names']
    )
}

# Function to replace ingredient IDs with names
def replace_ids_with_names(id_list, mapping=None):
    """Translate a recipe's ingredient ids into ingredient names.

    Parameters
    ----------
    id_list : str or list
        Either the textual form of a list of ids as stored in the CSV
        (e.g. ``"[389, 7655]"``) or an already parsed list of ids.
    mapping : dict, optional
        id -> name lookup. Defaults to the notebook-level ``id_to_name``.

    Returns
    -------
    list of str
        One name per id, in input order; ids missing from the lookup are
        rendered as ``"Unknown(<id>)"``.

    Raises
    ------
    ValueError, SyntaxError
        If ``id_list`` is a string that is not a plain Python literal.
    """
    lookup = id_to_name if mapping is None else mapping
    # literal_eval only accepts literals, so cell text can never execute code
    # (the previous eval() would have run any expression found in the CSV).
    ids = ast.literal_eval(id_list) if isinstance(id_list, str) else id_list
    return [lookup.get(ingredient_id, f"Unknown({ingredient_id})") for ingredient_id in ids]

# Translate every recipe's id list into the matching list of ingredient names
ingredient_id_column = recipes['ingredient_ids']
recipes['ingredient_names'] = ingredient_id_column.map(replace_ids_with_names)

# Show the recipes with the new column
recipes

Unnamed: 0,id,name_tokens,ingredient_tokens,steps_tokens,techniques,technique_names,calorie_level,ingredient_ids,ingredient_names
0,424415,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","['combine', 'drain', 'strain']",0,"[389, 7655, 6270, 1527, 3406]","[basmati rice, water, salt, cinnamon stick, gr..."
1,146223,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","['bake', 'combine', 'melt', 'pour', 'refrigera...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[flour, oat, brown sugar, pecan, butter, egg, ..."
2,312329,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","['boil', 'crush', 'melt', 'pour', 'simmer']",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[chicken broth, water, salt, black pepper, oni..."
3,74301,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['bake', 'drain', 'simmer']",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]","[wonton wrapper, hamburger, taco seasoning, sa..."
4,76272,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","['combine', 'drain', 'fry']",0,"[3484, 6324, 7594, 243]","[ground beef, sausage, velveeta cheese, miniat..."
...,...,...,...,...,...,...,...,...,...
178260,323143,"[40480, 6444, 1964, 9369, 486, 569, 17551, 40481]","[[8780], [11835, 1762, 4465, 31494], [6812], [...","[40480, 40482, 729, 2525, 715, 485, 26641, 404...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['bake', 'smooth']",1,"[840, 208, 2499, 2683, 1925, 335, 1511]","[butter, artificial sweetener, egg, flour, cre..."
178261,149114,"[40480, 17027, 24715, 974, 11877, 40481]","[[6812], [5940], [30645, 4785, 6821], [6953], ...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['bake', 'pour']",0,"[2499, 4717, 1168, 6270, 6324, 7040]","[egg, milk, cheddar, salt, sausage, syrup]"
178262,34200,"[40480, 12187, 11434, 1738, 2627, 40481]","[[6167, 20930, 510], [1353], [15022, 6953], [6...","[40480, 40482, 500, 246, 1719, 5024, 240, 2366...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","['bake', 'combine', 'pour', 'skillet']",2,"[2378, 7655, 3219, 2320, 5168, 5319, 4189, 268...","[dried thyme, water, garlic salt, dried oregan..."
178263,30618,"[40480, 870, 488, 1325, 519, 2220, 2417, 488, ...","[[12395, 38308, 40118], [3137, 15022], [30878,...","[40480, 40482, 562, 481, 10734, 240, 23667, 58...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","['boil', 'combine', 'drain', 'fry', 'simmer', ...",0,"[5627, 2807, 5412, 3399, 7979, 1093, 1257, 780...","[pork tenderloin, fresh garlic, pineapple chun..."


## Convert techniques into readable format

In [7]:
# List of techniques. The position of a name in this list is the position of its
# 0/1 flag in each recipe's `techniques` vector, so the order must not change.
# 'freez' was a truncated stem and is spelled out as 'freeze', in line with the
# other entries (e.g. 'refrigerate').
TECHNIQUES_LIST = [
    'bake',
    'barbecue',
    'blanch',
    'blend',
    'boil',
    'braise',
    'brine',
    'broil',
    'caramelize',
    'combine',
    'crock pot',
    'crush',
    'deglaze',
    'devein',
    'dice',
    'distill',
    'drain',
    'emulsify',
    'ferment',
    'freeze',
    'fry',
    'grate',
    'griddle',
    'grill',
    'knead',
    'leaven',
    'marinate',
    'mash',
    'melt',
    'microwave',
    'parboil',
    'pickle',
    'poach',
    'pour',
    'pressure cook',
    'puree',
    'refrigerate',
    'roast',
    'saute',
    'scald',
    'scramble',
    'shred',
    'simmer',
    'skillet',
    'slow cook',
    'smoke',
    'smooth',
    'soak',
    'sous-vide',
    'steam',
    'stew',
    'strain',
    'tenderize',
    'thicken',
    'toast',
    'toss',
    'whip',
    'whisk',
]

def _decode_techniques(flags_text):
    """Return the technique names whose flag is 1 in one serialized flag vector.

    `flags_text` is the textual list stored in the `techniques` column. A vector
    whose length differs from TECHNIQUES_LIST raises ValueError, because zip()
    would otherwise truncate silently and pair flags with the wrong names.
    """
    flags = ast.literal_eval(flags_text)
    if len(flags) != len(TECHNIQUES_LIST):
        raise ValueError(
            f"expected {len(TECHNIQUES_LIST)} technique flags, got {len(flags)}"
        )
    return [name for name, flag in zip(TECHNIQUES_LIST, flags) if flag == 1]


# List of the recipes with their corresponding techniques (one list of names per recipe, in row order)
technique_names = [_decode_techniques(flags_text) for flags_text in recipes['techniques']]

# Append into recipes, then move the column to position 5
# (directly after `techniques` for the column layout shown in the output above)
recipes['technique_names'] = technique_names
recipes.insert(5, 'technique_names', recipes.pop('technique_names'))

# Display
recipes

Unnamed: 0,id,name_tokens,ingredient_tokens,steps_tokens,techniques,technique_names,calorie_level,ingredient_ids,ingredient_names
0,424415,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[combine, drain, strain]",0,"[389, 7655, 6270, 1527, 3406]","[basmati rice, water, salt, cinnamon stick, gr..."
1,146223,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[bake, combine, melt, pour, refrigerate, smoot...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[flour, oat, brown sugar, pecan, butter, egg, ..."
2,312329,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[boil, crush, melt, pour, simmer]",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[chicken broth, water, salt, black pepper, oni..."
3,74301,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[bake, drain, simmer]",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]","[wonton wrapper, hamburger, taco seasoning, sa..."
4,76272,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[combine, drain, fry]",0,"[3484, 6324, 7594, 243]","[ground beef, sausage, velveeta cheese, miniat..."
...,...,...,...,...,...,...,...,...,...
178260,323143,"[40480, 6444, 1964, 9369, 486, 569, 17551, 40481]","[[8780], [11835, 1762, 4465, 31494], [6812], [...","[40480, 40482, 729, 2525, 715, 485, 26641, 404...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[bake, smooth]",1,"[840, 208, 2499, 2683, 1925, 335, 1511]","[butter, artificial sweetener, egg, flour, cre..."
178261,149114,"[40480, 17027, 24715, 974, 11877, 40481]","[[6812], [5940], [30645, 4785, 6821], [6953], ...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[bake, pour]",0,"[2499, 4717, 1168, 6270, 6324, 7040]","[egg, milk, cheddar, salt, sausage, syrup]"
178262,34200,"[40480, 12187, 11434, 1738, 2627, 40481]","[[6167, 20930, 510], [1353], [15022, 6953], [6...","[40480, 40482, 500, 246, 1719, 5024, 240, 2366...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[bake, combine, pour, skillet]",2,"[2378, 7655, 3219, 2320, 5168, 5319, 4189, 268...","[dried thyme, water, garlic salt, dried oregan..."
178263,30618,"[40480, 870, 488, 1325, 519, 2220, 2417, 488, ...","[[12395, 38308, 40118], [3137, 15022], [30878,...","[40480, 40482, 562, 481, 10734, 240, 23667, 58...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[boil, combine, drain, fry, simmer, skillet, t...",0,"[5627, 2807, 5412, 3399, 7979, 1093, 1257, 780...","[pork tenderloin, fresh garlic, pineapple chun..."


In [8]:
# Write the enriched recipes back to data/PP_recipes.csv (no index column).
# This is the same path the recipes were loaded from, so the input file is overwritten.
recipes.to_csv(path_or_buf='data/PP_recipes.csv', index=False)

## Cleaning interactions_train, interactions_validation, interactions_test


In [9]:
# Process interactions data
def process_data(file, recipes_path="data/PP_recipes.csv"):
    """Attach technique and ingredient names to an interactions file.

    Parameters
    ----------
    file : str
        Path of the interactions CSV; it must contain a `recipe_id` column.
    recipes_path : str, optional
        Path of the processed recipes CSV providing `id`, `technique_names`
        and `ingredient_names`.

    Returns
    -------
    pandas.DataFrame
        The interactions with `technique_names` and `ingredient_names` added
        (missing for recipe ids not found in the recipes file), exact duplicate
        rows removed and a fresh 0..n-1 index.

    Notes
    -----
    An earlier version also joined recipe names from RAW_recipes.csv and parsed
    the `date` column; neither is done here.
    """
    recipe_columns = ['technique_names', 'ingredient_names']

    # Load data
    df = pd.read_csv(file)

    # The notebook saves the result over its own input file. On a re-run the columns
    # are therefore already present; drop them first so the merge does not create
    # `_x` / `_y` duplicates.
    df = df.drop(columns=[column for column in recipe_columns if column in df.columns])

    # Read only the recipe columns that are needed instead of loading everything
    # and dropping a hard-coded list afterwards.
    pp_recipes = pd.read_csv(recipes_path, usecols=['id'] + recipe_columns)
    df = pd.merge(df, pp_recipes, how='left', left_on='recipe_id', right_on='id')
    df = df.drop(columns='id')  # redundant with recipe_id after the join

    # Drop duplicates and reset index
    df = df.drop_duplicates(keep='first').reset_index(drop=True)

    return df

In [10]:
# Process the three interaction splits (train, validation, test - in that order)
(
    interactions_train_processed,
    interactions_validation_processed,
    interactions_test_processed,
) = (
    process_data(source_path)
    for source_path in (
        "data/interactions_train.csv",
        "data/interactions_validation.csv",
        "data/interactions_test.csv",
    )
)

# Write each processed split back over the file it was read from
for processed_frame, target_path in (
    (interactions_train_processed, "data/interactions_train.csv"),
    (interactions_validation_processed, "data/interactions_validation.csv"),
    (interactions_test_processed, "data/interactions_test.csv"),
):
    processed_frame.to_csv(target_path, index=False)

# Simple recipes dataset

In [2]:
# Read the processed recipes and the raw recipe metadata
pp_recipes = pd.read_csv("data/PP_recipes.csv")
raw_recipes = pd.read_csv("data/RAW_recipes.csv")

# Columns that are not needed in the simplified dataset
unneeded_columns = [
    'name_tokens', 'ingredient_tokens', 'steps_tokens',
    'techniques', 'ingredient_ids', 'contributor_id',
    'submitted', 'tags', 'steps', 'description', 'ingredients',
]

# Left-join the raw metadata onto the processed recipes by id, then discard the unneeded columns
simple_recipes = pp_recipes.merge(
    raw_recipes, how='left', left_on='id', right_on='id'
).drop(columns=unneeded_columns)

# Reorder: recipe name first, ingredient count at position 4
for target_position, column_name in ((0, 'name'), (4, 'n_ingredients')):
    simple_recipes.insert(target_position, column_name, simple_recipes.pop(column_name))

# Parse the stored text of each ingredient list back into a Python list
simple_recipes['ingredient_names'] = simple_recipes['ingredient_names'].map(ast.literal_eval)

# One accumulator list per nutrition field; each receives one value per recipe
calories, total_fat, sugar, sodium, protein, saturated_fat, carbs = [], [], [], [], [], [], []

def get_nutrition(recipe):
    """Append the first seven entries of `recipe` to the accumulator lists.

    Entry 0 goes to `calories`, then `total_fat`, `sugar`, `sodium`, `protein`,
    `saturated_fat` and finally entry 6 to `carbs`. Extra entries are ignored;
    a sequence with fewer than seven entries raises IndexError once the first
    missing position is reached.
    """
    accumulators = (calories, total_fat, sugar, sodium, protein, saturated_fat, carbs)
    for position, accumulator in enumerate(accumulators):
        accumulator.append(recipe[position])

# Get nutrition for each recipe.
# Iterate the `nutrition` column directly: iterrows() builds a full Series for every
# row only to read this one field. The accumulator lists must be empty before this
# loop runs (they are initialised at the top of this cell).
for nutrition_text in simple_recipes['nutrition']:
    get_nutrition(ast.literal_eval(nutrition_text))

# Convert into pandas columns (dict order is the column order in the frame)
nutrition_columns = {
    'calories (#)': calories,
    'total_fat (%DV)': total_fat,
    'sugar (%DV)': sugar,
    'sodium (%DV)': sodium,
    'protein (%DV)': protein,
    'saturated_fat (%DV)': saturated_fat,
    'carbs (%DV)': carbs,
}
for column_name, column_values in nutrition_columns.items():
    simple_recipes[column_name] = column_values

# Drop nutrition now that it has been split into separate columns
simple_recipes.drop('nutrition', axis=1, inplace=True)

# Display
simple_recipes

Unnamed: 0,name,id,technique_names,calorie_level,n_ingredients,ingredient_names,minutes,n_steps,calories (#),total_fat (%DV),sugar (%DV),sodium (%DV),protein (%DV),saturated_fat (%DV),carbs (%DV)
0,aromatic basmati rice rice cooker,424415,"['combine', 'drain', 'strain']",0,5,"[basmati rice, water, salt, cinnamon stick, gr...",61,6,228.2,2.0,2.0,8.0,9.0,1.0,15.0
1,pumpkin pie a la easy,146223,"['bake', 'combine', 'melt', 'pour', 'refrigera...",0,12,"[flour, oat, brown sugar, pecan, butter, egg, ...",55,10,249.4,16.0,92.0,8.0,11.0,27.0,11.0
2,cheesy tomato soup with potatoes,312329,"['boil', 'crush', 'melt', 'pour', 'simmer']",1,15,"[chicken broth, water, salt, black pepper, oni...",25,6,351.3,34.0,15.0,50.0,25.0,70.0,8.0
3,mini tacos,74301,"['bake', 'drain', 'simmer']",0,8,"[wonton wrapper, hamburger, taco seasoning, sa...",15,8,79.7,5.0,2.0,11.0,11.0,7.0,2.0
4,rosemary s hanky panky s,76272,"['combine', 'drain', 'fry']",0,4,"[ground beef, sausage, velveeta cheese, miniat...",20,5,240.7,29.0,9.0,28.0,27.0,42.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178260,sugar free snickerdoodles,323143,"['bake', 'smooth']",1,7,"[butter, artificial sweetener, egg, flour, cre...",23,6,304.1,30.0,0.0,11.0,10.0,60.0,8.0
178261,sausage pancake strata,149114,"['bake', 'pour']",0,7,"[egg, milk, cheddar, salt, sausage, syrup]",70,14,235.9,26.0,3.0,19.0,35.0,37.0,0.0
178262,baked beef patties,34200,"['bake', 'combine', 'pour', 'skillet']",2,14,"[dried thyme, water, garlic salt, dried oregan...",55,15,577.5,51.0,26.0,38.0,84.0,83.0,8.0
178263,good and garlicky sweet and sour pork,30618,"['boil', 'combine', 'drain', 'fry', 'simmer', ...",0,12,"[pork tenderloin, fresh garlic, pineapple chun...",40,13,240.1,5.0,96.0,12.0,41.0,5.0,10.0


In [3]:
# Write the simplified recipe table to disk without the row index
simple_recipes.to_csv(path_or_buf="data/simple_recipes.csv", index=False)