# Pre-Processing Data


In [16]:
# Imports
import ast

import numpy as np
import pandas as pd

## Cleaning interactions_train, interactions_validation, interactions_test


In [23]:
def process_data(file, recipes_path='data/RAW_recipes.csv'):
    """Load an interactions CSV and attach recipe metadata to every row.

    Parameters
    ----------
    file : str or path-like
        CSV of interactions. It must contain the 'recipe_id', 'date', 'u'
        and 'i' columns, all of which are used below.
    recipes_path : str or path-like, optional
        CSV of recipe metadata whose 'id' column is matched against
        'recipe_id'. Defaults to 'data/RAW_recipes.csv'.

    Returns
    -------
    pandas.DataFrame
        One row per interaction with every column of the recipes file
        appended, the 'u' and 'i' columns removed, and 'date' parsed to
        datetime.

    Raises
    ------
    KeyError
        If a required column is missing from either file.
    """
    interactions = pd.read_csv(file)
    recipe_details = pd.read_csv(recipes_path)

    # Left join keeps every interaction, even when its recipe_id has no
    # match in the recipes file (the recipe columns are then NaN).
    merged = interactions.merge(
        recipe_details, how='left', left_on='recipe_id', right_on='id'
    )

    # Drop the 'u' and 'i' columns; 'recipe_id' remains the recipe key.
    merged = merged.drop(columns=['u', 'i'])

    # Convert 'date' column to datetime format
    merged['date'] = pd.to_datetime(merged['date'])

    return merged

In [24]:
# Run every interaction split (train / validation / test) through process_data
(
    interactions_train_processed,
    interactions_validation_processed,
    interactions_test_processed,
) = map(
    process_data,
    (
        "data/interactions_train.csv",
        "data/interactions_validation.csv",
        "data/interactions_test.csv",
    ),
)

# Write each processed split to its own CSV, leaving out the row index
for _frame, _destination in (
    (interactions_train_processed, "data/interactions_train_processed.csv"),
    (interactions_validation_processed, "data/interactions_validation_processed.csv"),
    (interactions_test_processed, "data/interactions_test_processed.csv"),
):
    _frame.to_csv(_destination, index=False)

## Convert ingr_map.pkl to CSV


In [19]:
# Read the pickled ingredient map, keep only the ingredient name ("replaced")
# and "id" columns, retain the first row for each distinct name, and renumber
# the rows from zero.
# NOTE: unpickling can execute arbitrary code, so only load ingr_map.pkl from
# a trusted source.
df_ingredient_map = (
    pd.read_pickle("data/ingr_map.pkl")[["replaced", "id"]]
    .drop_duplicates(subset="replaced", keep="first")
    .reset_index(drop=True)
)

# Display ingredient_map
df_ingredient_map

In [20]:
# Write the de-duplicated ingredient map to CSV without the row index
ingredient_map_path = "data/ingredient_map.csv"
df_ingredient_map.to_csv(ingredient_map_path, index=False)

## Add ingredient names to PP_recipes


In [21]:
# Read the ingredient lookup table and the recipes, discarding the recipes' 'i' column
ingredient_map = pd.read_csv('data/ingredient_map.csv')
recipes = pd.read_csv('data/PP_recipes.csv').drop(columns=['i'])

# Lookup table: ingredient ID -> ingredient name
id_to_name = {
    ingredient_id: name
    for ingredient_id, name in zip(ingredient_map['id'], ingredient_map['replaced'])
}

# Function to replace ingredient IDs with names
def replace_ids_with_names(id_list, mapping=None):
    """Translate a stringified list of ingredient IDs into ingredient names.

    Parameters
    ----------
    id_list : str
        Text of a Python list literal, e.g. "[1, 2, 3]".
    mapping : dict, optional
        Lookup from ingredient ID to name. Defaults to the module-level
        ``id_to_name`` dictionary.

    Returns
    -------
    list of str
        One name per ID, in order; an ID missing from the mapping is
        rendered as "Unknown(<id>)".

    Raises
    ------
    ValueError, SyntaxError
        If ``id_list`` is not a valid Python literal.
    """
    if mapping is None:
        mapping = id_to_name

    # ast.literal_eval only parses literals, so text read from the CSV cannot
    # execute arbitrary code the way eval() would.
    ingredient_ids = ast.literal_eval(id_list)

    return [
        mapping.get(ingredient_id, f"Unknown({ingredient_id})")
        for ingredient_id in ingredient_ids
    ]

# Translate each row's ingredient_ids into a list of names, then store the
# result as a new 'ingredient_names' column
ingredient_names = recipes['ingredient_ids'].apply(replace_ids_with_names)
recipes['ingredient_names'] = ingredient_names

# Show the recipes table with the added column
recipes

In [22]:
# Write the recipes, now including ingredient names, to a new CSV (row index omitted)
updated_recipes_path = 'data/PP_recipes_updated.csv'
recipes.to_csv(path_or_buf=updated_recipes_path, index=False)