# Pre-Processing Data

In [3]:
# Imports
import ast
import pickle as pkl

import numpy as np
import pandas as pd

## Cleaning interactions_train, interactions_validation, interactions_test

In [7]:
def process_data(file):
    """Load a raw interactions CSV and return it keyed by the mapped IDs.

    The original ``user_id`` / ``recipe_id`` columns are discarded and the
    mapped ``u`` / ``i`` columns are renamed to take their place, moved to the
    first two positions. The ``date`` column is parsed to datetimes.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts. Must provide the columns
        ``user_id``, ``recipe_id``, ``u``, ``i`` and ``date``.

    Returns
    -------
    pd.DataFrame
        Columns ``user_id``, ``recipe_id`` first, followed by the remaining
        columns in their original order.
    """
    raw = pd.read_csv(file)

    # Discard the unmapped ids, then let the mapped "u"/"i" columns take over their names
    mapped = raw.drop(columns=['user_id', 'recipe_id']).rename(
        columns={"u": "user_id", "i": "recipe_id"}
    )

    # Put the two id columns first, keeping every other column in its existing order
    id_columns = ["user_id", "recipe_id"]
    other_columns = [name for name in mapped.columns if name not in id_columns]
    df = mapped[id_columns + other_columns].copy()

    # Convert 'date' column to datetime format
    df['date'] = pd.to_datetime(df['date'])

    return df

In [9]:
# Get processed data -- each split is read from its own raw file
# (previously all three were loaded from interactions_train.csv, so the
# "validation" and "test" outputs were just copies of the training data)
interactions_train_processed = process_data("data/interactions_train.csv")
interactions_validation_processed = process_data("data/interactions_validation.csv")
interactions_test_processed = process_data("data/interactions_test.csv")

# Save as csv
# NOTE(review): unlike the other saves in this notebook, these calls do not pass
# index=False, so the row index is written as an extra unnamed first column.
# Left as-is in case downstream readers rely on it -- confirm before changing.
interactions_train_processed.to_csv("data/interactions_train_processed.csv")
interactions_validation_processed.to_csv("data/interactions_validation_processed.csv")
interactions_test_processed.to_csv("data/interactions_test_processed.csv")

## Convert ingr_map to CSV

In [None]:
# Load ingr_map.pkl as df
# NOTE: unpickling can execute arbitrary code -- only load this file from a trusted source.
ingredient_map_raw = pd.read_pickle("data/ingr_map.pkl")

# Only keep ingredient name and ids, one row per ingredient name (first
# occurrence wins), with a fresh 0..n-1 index.
# Chaining returns new frames instead of mutating a column slice in place,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
df_ingredient_map = (
    ingredient_map_raw[["replaced", "id"]]
    .drop_duplicates(subset="replaced", keep="first")
    .reset_index(drop=True)
)

# Save as csv
df_ingredient_map.to_csv("data/ingredient_map.csv", index=False)

# Display ingredient_map
df_ingredient_map

Unnamed: 0,replaced,id
0,lettuce,4308
1,french vanilla pudding and pie filling mix,2744
2,stove top stuffing mix,6843
3,cream cheese,1910
4,cheddar,1168
...,...,...
8018,soybean,6702
8019,goose,3318
8020,ajwain,47
8021,brinjal,750


## Add ingredient names to PP_recipes

In [3]:
# Read the ingredient lookup table and the pre-processed recipes
ingredient_map = pd.read_csv('data/ingredient_map.csv')
recipes = pd.read_csv('data/PP_recipes.csv')

# Build the id -> ingredient name lookup used to translate recipe ingredient ids
id_to_name = {
    ingredient_id: ingredient_name
    for ingredient_id, ingredient_name in zip(ingredient_map['id'], ingredient_map['replaced'])
}

# Function to replace ingredient IDs with names
def replace_ids_with_names(id_list, mapping=None):
    """Translate a stringified list of ingredient IDs into ingredient names.

    Parameters
    ----------
    id_list : str
        Text of a Python list literal of ingredient IDs, e.g. "[389, 7655]",
        which is how the ``ingredient_ids`` column reads back from CSV.
    mapping : dict, optional
        ID -> name lookup. When omitted, the notebook-level ``id_to_name``
        dictionary is used.

    Returns
    -------
    list of str
        One name per ID, in the same order. IDs missing from the lookup are
        rendered as ``"Unknown(<id>)"``.

    Raises
    ------
    ValueError, SyntaxError
        If ``id_list`` is not a valid Python literal.
    """
    lookup = id_to_name if mapping is None else mapping
    # literal_eval only parses literals, so a malformed or malicious CSV cell
    # cannot run code the way eval() would.
    ingredient_ids = ast.literal_eval(id_list)
    return [lookup.get(ingredient_id, f"Unknown({ingredient_id})") for ingredient_id in ingredient_ids]

# Translate each recipe's ingredient_ids entry into a parallel list of names
recipes['ingredient_names'] = recipes['ingredient_ids'].map(replace_ids_with_names)

# Write the enriched recipes out next to the original file
updated_recipes_path = 'data/PP_recipes_updated.csv'
recipes.to_csv(path_or_buf=updated_recipes_path, index=False)

# Show the updated recipe data
recipes

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingredient_names
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]","[basmati rice, water, salt, cinnamon stick, gr..."
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...","[flmy, oat, brown sugar, pecan, butter, egg, s..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...","[chicken broth, water, salt, black pepper, oni..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]","[wonton wrapper, hamburger, taco seasoning, sa..."
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]","[ground beef, sausage, velveeta cheese, ature ..."
...,...,...,...,...,...,...,...,...,...
178260,323143,76862,"[40480, 6444, 1964, 9369, 486, 569, 17551, 40481]","[[8780], [11835, 1762, 4465, 31494], [6812], [...","[40480, 40482, 729, 2525, 715, 485, 26641, 404...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[840, 208, 2499, 2683, 1925, 335, 1511]","[butter, artificial sweetener, egg, flmy, crea..."
178261,149114,145962,"[40480, 17027, 24715, 974, 11877, 40481]","[[6812], [5940], [30645, 4785, 6821], [6953], ...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[2499, 4717, 1168, 6270, 6324, 7040]","[egg, milk, cheddar, salt, sausage, syrup]"
178262,34200,65066,"[40480, 12187, 11434, 1738, 2627, 40481]","[[6167, 20930, 510], [1353], [15022, 6953], [6...","[40480, 40482, 500, 246, 1719, 5024, 240, 2366...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",2,"[2378, 7655, 3219, 2320, 5168, 5319, 4189, 268...","[dried thyme, water, garlic salt, dried oregan..."
178263,30618,77358,"[40480, 870, 488, 1325, 519, 2220, 2417, 488, ...","[[12395, 38308, 40118], [3137, 15022], [30878,...","[40480, 40482, 562, 481, 10734, 240, 23667, 58...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[5627, 2807, 5412, 3399, 7979, 1093, 1257, 780...","[pork tenderloin, fresh garlic, pineapple chun..."
