In [11]:
from logging import Logger
from dotenv import load_dotenv
import json
import google.generativeai as genai
import pandas as pd
import os
import numpy as np
import logging
import typing_extensions as typing
import time
from app.utils.stomach import save_json

In [12]:
logger = logging.getLogger(__name__)  # Use a named logger

In [None]:
logging.basicConfig(level=logging.INFO)
RECIPE = "../../app/data/processed/recipe_cleaned.json"
INGREDIENTS = "../../app/data/processed/ingredients_AI.json"
MEAL = "../../app/data/processed/meal_cleaned.json"
MEALTORECIPE = "../../app/data/processed/mealtorecipe.json" 

In [27]:
logger.info("PREPARING DATA")
data: pd.DataFrame = pd.read_json(MEAL)
data = data.replace({np.nan: None, '': None, ' ': None})

INFO:__main__:PREPARING DATA


In [28]:
data["updated_at"] = data["created_at"]

In [29]:
data["status"] = "PUBLISHED"
data["video_url"] = "Not Available"

In [31]:
data = data[["meal_id", "name", "description", "status", "created_at", "updated_at", "source", "video_url"]]
len(data)

220

In [33]:
df_cleaned = data.drop_duplicates()
df_cleaned = df_cleaned.reset_index().drop(columns="index")
len(df_cleaned)

220

In [34]:
import pandas as pd

# Load datasets
meals_df = df_cleaned
recipes_df = pd.read_json(RECIPE)
meal_to_recipe_df = pd.read_json(MEALTORECIPE)
ingredients_df = pd.read_json(INGREDIENTS)

recipes_df = recipes_df.drop_duplicates()
print(len(recipes_df))

meals_df.columns, recipes_df.columns, ingredients_df.columns, meal_to_recipe_df.columns

220


(Index(['meal_id', 'name', 'description', 'status', 'created_at', 'updated_at',
        'source', 'video_url'],
       dtype='object'),
 Index(['meal_type', 'recipe_id', 'alternative_meal_type', 'category',
        'course_type'],
       dtype='object'),
 Index(['main_ingredient', 'quantity', 'weight', 'ingredient_id',
        'alternative_ingredient', 'measurement'],
       dtype='object'),
 Index(['meal_id', 'recipe_id'], dtype='object'))

In [35]:
ingredients_df = ingredients_df.rename(columns={"ingredient_id": "recipe_id"})

In [36]:
#  First merge: meals with recipes
meal_to_recipe = pd.merge(
    meals_df,
    meal_to_recipe_df,
    on='meal_id',  # Assuming this is the common field
    how='left'       # Keep all meals, even if no recipe match
)

print(len(meal_to_recipe))

# Second merge: recipes with ingredients
meal_to_recipe = pd.merge(
    meal_to_recipe,
    recipes_df,
    on='recipe_id',  # Assuming this is the common field
    how='left'       # Keep all recipes, even if no ingredient match
)

# Third merge: recipes with ingredients
final_df = pd.merge(
    meal_to_recipe,
    ingredients_df,
    on='recipe_id',  # Assuming this is the common field
    how='left'       # Keep all ingredients, even if no ingredient match
)

# final_df = final_df.drop_duplicates()

220


In [37]:
len(final_df)

882

In [38]:
final_df.to_csv("merged.csv", index=False)

In [73]:
final_df.columns

Index(['meal_id', 'name', 'description', 'status', 'created_at', 'updated_at',
       'source', 'video_url', 'recipe_id', 'course', 'cuisine', 'servings',
       'calories', 'prep time', 'cook time', 'total time', 'author', 'keyword',
       'soaking time', 'marinating time', 'cooling time', 'resting time',
       'wait time', 'main_ingredient', 'quantity', 'weight',
       'alternative_ingredient', 'measurement'],
      dtype='object')

In [77]:
# break down the datasets into meals table, recipes table and ingredients table
meals = final_df[["meal_id", "name", "description"]]
ingredients = final_df[["meal_id", "recipe_id", "main_ingredient", "alternative_ingredient"]]

In [79]:
import uuid

ingredients["ingredient_id"] = range(1, len(ingredients) + 1)
print(ingredients)


                                  meal_id  \
0    a6213618-a4b0-4b70-a132-a0cf4763bd02   
1    a6213618-a4b0-4b70-a132-a0cf4763bd02   
2    a6213618-a4b0-4b70-a132-a0cf4763bd02   
3    a6213618-a4b0-4b70-a132-a0cf4763bd02   
4    a6213618-a4b0-4b70-a132-a0cf4763bd02   
..                                    ...   
409  34f02b7c-7d18-46c4-abee-ff088272ec6b   
410  151e819a-7534-4598-b7f1-4f532670bb98   
411  7993b368-d196-4bb4-8766-dd1b94b7f175   
412  8fa0abd8-b6d6-4c72-9a11-d1b891b2f09b   
413  c258ef0d-0301-469a-bf2c-e85c2a19c641   

                                recipe_id     main_ingredient  \
0    2c2703b4-ed39-468d-a09b-4dda480e3bc7           plantains   
1    2c2703b4-ed39-468d-a09b-4dda480e3bc7                milk   
2    2c2703b4-ed39-468d-a09b-4dda480e3bc7                lime   
3    2c2703b4-ed39-468d-a09b-4dda480e3bc7  vanilla bean paste   
4    2c2703b4-ed39-468d-a09b-4dda480e3bc7   all purpose flour   
..                                    ...                 ...   
409 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredients["ingredient_id"] = range(1, len(ingredients) + 1)


In [42]:
ingredients_cleaned = pd.read_json(DIRTY_INGREDIENTS)
ingredients_ai = pd.read_json(INGREDIENTS)

print(len(ingredients_cleaned), len(ingredients_ai))

2445 735
