In [1]:
from logging import Logger
from dotenv import load_dotenv
import json
import google.generativeai as genai
import pandas as pd
import os
import numpy as np
import logging
import typing_extensions as typing
import time
from app.utils.stomach import save_json

In [2]:
logger = logging.getLogger(__name__)  # Use a named logger

In [4]:
logging.basicConfig(level=logging.INFO)
RECIPE = "../../app/data/processed/recipe_cleaned.json"
INGREDIENTS = "../../app/data/processed/ingredients_AI.json"
MEAL = "../../app/data/processed/meal_cleaned.json"
MEALTORECIPE = "../../app/data/processed/mealtorecipe.json"
RECIPESTEP = "../../app/data/processed/recipestep.json" 

In [5]:
logger.info("PREPARING DATA")
data: pd.DataFrame = pd.read_json(MEAL)
data = data.replace({np.nan: None, '': None, ' ': None})

INFO:__main__:PREPARING DATA


In [6]:
data["updated_at"] = data["created_at"]

In [7]:
data["status"] = "PUBLISHED"
data["video_url"] = "Not Available"

In [8]:
data = data[["meal_id", "name", "description", "status", "created_at", "updated_at", "source", "video_url"]]
len(data)

220

In [9]:
df_cleaned = data.drop_duplicates(subset=["name", "description"])
df_cleaned = df_cleaned.reset_index().drop(columns="index")
len(df_cleaned)

112

In [30]:
import pandas as pd

# Load datasets
meals_df = df_cleaned
recipes_df = pd.read_json(RECIPE)
meal_to_recipe_df = pd.read_json(MEALTORECIPE)
ingredients_df = pd.read_json(INGREDIENTS)
recipe_step_df = pd.read_json(RECIPESTEP)

recipes_df = recipes_df.drop_duplicates()
print(len(recipes_df))

meals_df.columns, recipes_df.columns, ingredients_df.columns, meal_to_recipe_df.columns, recipe_step_df.columns

220


(Index(['meal_id', 'name', 'description', 'status', 'created_at', 'updated_at',
        'source', 'video_url'],
       dtype='object'),
 Index(['recipe_id', 'course', 'cuisine', 'servings', 'calories', 'prep time',
        'cook time', 'total time', 'author', 'keyword', 'soaking time',
        'marinating time', 'cooling time', 'resting time', 'wait time'],
       dtype='object'),
 Index(['main_ingredient', 'quantity', 'weight', 'ingredient_id',
        'alternative_ingredient', 'measurement'],
       dtype='object'),
 Index(['meal_id', 'recipe_id'], dtype='object'),
 Index(['recipe_id', 'serial', 'steps', 'id'], dtype='object'))

In [31]:
ingredients_df = ingredients_df.rename(columns={"ingredient_id": "recipe_id"})

In [53]:
recipe_step_to_meal = pd.merge(
    recipe_step_df,
    meal_to_recipe_df,
    on='recipe_id',
    how='left'
)

recipe_step_to_meal = recipe_step_to_meal.drop(columns=["recipe_id"])
recipe_step_to_meal = recipe_step_to_meal.drop_duplicates()


In [54]:
#  First merge: meals with recipes
meal_to_recipe = pd.merge(
    meals_df,
    meal_to_recipe_df,
    on='meal_id',  # Assuming this is the common field
    how='inner'   ,     # Keep all meals, even if no recipe match
)

# print(meal_to_recipe.head(5))
print(len(meal_to_recipe))

# # Second merge: recipes with ingredients
meal_to_recipe = pd.merge(
    meal_to_recipe,
    recipes_df,
    on='recipe_id',  # Assuming this is the common field
    how='inner'       # Keep all recipes, even if no ingredient match
)

len(meal_to_recipe)
# # Third merge: recipes with ingredients
final_df = pd.merge(
    meal_to_recipe,
    ingredients_df,
    on='recipe_id',  # Assuming this is the common field
    how='inner'       # Keep all ingredients, even if no ingredient match
)

final_df = pd.merge(
    final_df,
    recipe_step_to_meal,
    on='meal_id',
    how='left'
)

len(final_df)
# final_df = final_df.drop_duplicates()

112


3145

In [55]:
print(final_df.columns)

Index(['meal_id', 'name', 'description', 'status', 'created_at', 'updated_at',
       'source', 'video_url', 'recipe_id', 'course', 'cuisine', 'servings',
       'calories', 'prep time', 'cook time', 'total time', 'author', 'keyword',
       'soaking time', 'marinating time', 'cooling time', 'resting time',
       'wait time', 'main_ingredient', 'quantity', 'weight',
       'alternative_ingredient', 'measurement', 'serial', 'steps', 'id'],
      dtype='object')


In [57]:
final_df.to_csv("merged.csv", index=False)

In [77]:
# break down the datasets into meals table, recipes table and ingredients table
meals = final_df[["meal_id", "name", "description"]]
ingredients = final_df[["meal_id", "recipe_id", "main_ingredient", "alternative_ingredient"]]

In [79]:
import uuid

ingredients["ingredient_id"] = range(1, len(ingredients) + 1)
print(ingredients)


                                  meal_id  \
0    a6213618-a4b0-4b70-a132-a0cf4763bd02   
1    a6213618-a4b0-4b70-a132-a0cf4763bd02   
2    a6213618-a4b0-4b70-a132-a0cf4763bd02   
3    a6213618-a4b0-4b70-a132-a0cf4763bd02   
4    a6213618-a4b0-4b70-a132-a0cf4763bd02   
..                                    ...   
409  34f02b7c-7d18-46c4-abee-ff088272ec6b   
410  151e819a-7534-4598-b7f1-4f532670bb98   
411  7993b368-d196-4bb4-8766-dd1b94b7f175   
412  8fa0abd8-b6d6-4c72-9a11-d1b891b2f09b   
413  c258ef0d-0301-469a-bf2c-e85c2a19c641   

                                recipe_id     main_ingredient  \
0    2c2703b4-ed39-468d-a09b-4dda480e3bc7           plantains   
1    2c2703b4-ed39-468d-a09b-4dda480e3bc7                milk   
2    2c2703b4-ed39-468d-a09b-4dda480e3bc7                lime   
3    2c2703b4-ed39-468d-a09b-4dda480e3bc7  vanilla bean paste   
4    2c2703b4-ed39-468d-a09b-4dda480e3bc7   all purpose flour   
..                                    ...                 ...   
409 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredients["ingredient_id"] = range(1, len(ingredients) + 1)


In [42]:
ingredients_cleaned = pd.read_json(DIRTY_INGREDIENTS)
ingredients_ai = pd.read_json(INGREDIENTS)

print(len(ingredients_cleaned), len(ingredients_ai))

2445 735


In [4]:
! pip install openpyxl
import pandas as pd


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
import os
import pandas as pd

os.mkdir("../data/processed/final_csvs")
relationship_data = pd.ExcelFile("../data/processed/DigestDB.xlsx")

for sheet_name in relationship_data.sheet_names:
    data = pd.read_excel("../data/processed/DigestDB.xlsx", sheet_name=sheet_name)
    data.to_csv(f"../data/processed/final_csvs/{sheet_name}.csv", index=False)
    
    print(f"Saved {sheet_name}.csv")
        

Saved AlternativeIngredients.csv
Saved Category.csv
Saved CategoryToMeal.csv
Saved Cuisines.csv
Saved CuisineToMeal.csv
Saved Ingredient.csv
Saved Meal.csv
Saved RecipeStep.csv
Saved Measurements.csv
Saved Recipe.csv
Saved RecipeIngredient.csv


False
CategoryToMeal.csv 57
Saved CategoryToMeal.csv
Meal.csv 33
Saved Meal.csv
CuisineToMeal.csv 56
Saved CuisineToMeal.csv
Measurements.csv 35
Saved Measurements.csv
Cuisines.csv 16
Saved Cuisines.csv
Category.csv 14
Saved Category.csv
RecipeStep.csv 288
Saved RecipeStep.csv
Ingredient.csv 191
Saved Ingredient.csv
AlternativeIngredients.csv 26
Saved AlternativeIngredients.csv
RecipeIngredient.csv 335
Saved RecipeIngredient.csv
Recipe.csv 33
Saved Recipe.csv
