We want to user the combined consumption user data and group it into meals, while using the portion weight conversion to use natural units in our output table

In [59]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [60]:
import pandas as pd
import numpy as np
import csv
import json
import os
from openai import OpenAI
from dotenv import load_dotenv
from io import StringIO

In [61]:
load_dotenv(verbose=True)

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
print(os.getenv("OPENAI_API_KEY"))

sk-CPhZAdAE6U8i_gAmTQTgCEXiTNo76kHjjUcJGHrBW5T3BlbkFJcP17XqximvPPzxON2qWXlZB11ON22Ng316YOJmLwoA


In [62]:
combined_df = pd.read_csv('combined_consumption_user_with_days_and_food_name.csv')
with open('Portion Weight Conversion/food_unit_conversion_with_embeddings.json', 'r') as file:
    data = json.load(file)
conversion_df = pd.DataFrame(data)

In [63]:
def create_grouped_meals_csv():
    # Define the column names
    columns = ['Meal Description', 'Meal Name', 'Serving Sizes', 'Meal Type', 'Country', 'Source', 'Weight', 'Carb', 'Protein', 'Fat', 'Fiber', 'Calories']

    # Create the CSV file and write the header
    with open('grouped_meals_final.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)

        # You can add some sample data here if you want
        # For example:
        # writer.writerow(['Delicious pasta dish', 'Spaghetti Bolognese', '1 plate', 'Dinner', 'Italy', 'Traditional', '65g'])

    print("CSV file 'grouped_meals.csv' has been created with the specified columns.")

In [64]:
if not os.path.exists('grouped_meals_final.csv'):
    create_grouped_meals_csv()


In [65]:
meal_type_dict = {
    1: "Before breakfast",
    2: "Breakfast",
    3: "Snack or drink between breakfast and lunch",
    4: "Lunch",
    5: "Snack or drink between lunch and dinner",
    6: "Dinner",
    7: "Snack or drink after dinner",
    8: "Snack or drink (unspecified when)"
}

In [66]:
def parse_table_to_dict(table_string):
    # Parse the table using pandas
    df = pd.read_csv(StringIO(table_string), sep='|', skipinitialspace=True).iloc[1:]
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df.columns = df.columns.str.strip()
    df = df.dropna(axis=1, how='all')

    # Convert to a dictionary
    dish_dict = dict(zip(df['Dish'], df['Ingredients']))

    # If you want to split the ingredients into a list
    dish_dict_list = {dish: ingredients.split(', ') if isinstance(ingredients, str) else [] for dish, ingredients in dish_dict.items()}

    return dish_dict_list


In [67]:
def round_to_fraction(value):
    quarters = round(value * 4) / 4
    if quarters == 0:
        return "0"  # or any other appropriate term for very small amounts
    elif quarters < 1:
        if quarters == 0.25:
            return "one fourth"
        elif quarters == 0.5:
            return "one half"
        elif quarters == 0.75:
            return "three fourths"
        else:   
            return f"{quarters:.2f}".rstrip('0').rstrip('.')
    else:
        whole = int(quarters)
        fraction = quarters - whole
        if fraction == 0:
            return f"{whole}"
        elif fraction == 0.25:
            return f"{whole} 1/4"
        elif fraction == 0.5:
            return f"{whole} 1/2"
        elif fraction == 0.75:
            return f"{whole} 3/4"
        else:
            return f"{quarters:.2f}"
        
def calculate_nutrition_similarity(food1, food2):
    nutrients = ['ENERGY_kcal', 'PROTEIN_g', 'FAT_g', 'CARBOH_g']
    similarity = 0
    for nutrient in nutrients:
        if nutrient in food1 and nutrient in food2:
            similarity += 1 - abs(food1[nutrient] - food2[nutrient]) / max(food1[nutrient], food2[nutrient])
    return similarity / len(nutrients)

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

def portion_weight_conversion(meal_descriptions, meal_weights):
    # Assuming conversion_df is already defined and has columns: 'grams', 'natural_unit', 'quantity', and 'embedding'
    
    def get_meal_embedding(meal):
        try:
            response = client.embeddings.create(
                input=json.dumps(meal),  # Convert to JSON string
                model="text-embedding-ada-002"
            )
            return np.array(response.data[0].embedding).reshape(1, -1)
        except Exception as e:
            print(f"Error getting embedding for meal '{meal}': {str(e)}")
            return None
    
    meal_embeddings = [get_meal_embedding(meal) for meal in meal_descriptions]
    
    portion_sizes = []
    for meal, meal_embed, meal_weight in zip(meal_descriptions, meal_embeddings, meal_weights):
        if meal_embed is None:
            continue
        # Calculate cosine similarity
        similarities = conversion_df['embedding'].apply(lambda x: cosine_similarity(meal_embed, np.array(x).reshape(1, -1))[0][0])
        
        # Find the index of the most similar row
        closest_match_index = similarities.idxmax()
        closest_match = conversion_df.loc[closest_match_index]

        print(closest_match['food_description'], meal)

        # Handle null quantity
        quantity = closest_match['quantity']

        if pd.isna(quantity):
            quantity = 1
        
        # Calculate the portion size in natural units
        conversion_factor = quantity / closest_match['grams']
        natural_unit_portion = meal_weight * conversion_factor

        natural_unit_portion = round_to_fraction(natural_unit_portion)

        clean_natural_unit = re.sub(r'\d+', '', closest_match['natural_unit']).strip()
        
        if natural_unit_portion == "0":
            portion_sizes.append(f"a pinch")
        else:
            portion_sizes.append(f"{natural_unit_portion} {clean_natural_unit}")
    
    return portion_sizes

In [69]:
import re

def clean_ingredient(ingredient):
    # Convert to lowercase
    ingredient = ingredient.lower()
    # Remove extra whitespace
    ingredient = re.sub(r'\s+', ' ', ingredient).strip()
    # Remove any non-alphanumeric characters except spaces
    ingredient = re.sub(r'[^a-z0-9 ]', '', ingredient)
    return ingredient

In [70]:
# Group the DataFrame
grouped_df = combined_df.groupby(['SUBJECT', 'WEEK_DAY', 'MEAL_NAME'])
# print(grouped_df.first())

# Process the grouped data
for (subject, week_day, meal_name), group in grouped_df:

    # Meal_type convert into breakfast,lunch,dinner,etc. 
    meal_type = group['MEAL_NAME'].iloc[0]
    meal_type_name = meal_type_dict[meal_name]

    # Full Meal Ingredients List (Cleaned)
    ingredients = [ingredient.replace('\t', ' ').strip() for ingredient in group['INGREDIENT_ENG'].tolist()]
    ingredients = [clean_ingredient(ingredient) for ingredient in ingredients]

    # Create prompt for GPT to identify each dish from the meal based on ingredients, country, and meal type
    prompt = f"You are a helpful assistant that converts ingredient lists into dish names. Given the following ingredients, create a table of dishes commonly consumed in Kenya. Ensure that each dish uses unique ingredients, meaning no ingredient appears in more than one dish. Provide a table with two columns: 'Dish' and 'Ingredients' and nothing else. The ingredients do not need to be part of traditional Kenyan dishes, but they were consumed in Kenya. \n \n Ingredients: {ingredients} \n Consumted at: {meal_type_dict[meal_name]}"
    serving_sizes = group['FOOD_AMOUNT_REPORTED'].tolist()
    #print(prompt)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=100
    )
    meal_name = response.choices[0].message.content.strip()

    # Separate the meal into separate dishes and then parse the ingredients from each dish
    parsed_meal_name = parse_table_to_dict(meal_name)
    meal_description = list(parsed_meal_name.keys()) # List of dish names
    meal_recipe = [[clean_ingredient(ingredient) for ingredient in recipe] for recipe in parsed_meal_name.values()] # List of ingredients for each dish
    source = 'WHO'
    country = 'Kenya'

    meal_weights = np.zeros(len(meal_description)) # Initialize the weight of each dish to 0
    meal_carbs = np.zeros(len(meal_description))
    meal_protein = np.zeros(len(meal_description))
    meal_fat = np.zeros(len(meal_description))
    meal_fiber = np.zeros(len(meal_description))
    meal_calories = np.zeros(len(meal_description))

    # Add ingredients to the respective dishes
    for ind, ingredient in enumerate(ingredients):
        for meal_index, meal in enumerate(meal_recipe):
            if ingredient in meal:
                meal_weights[meal_index] += serving_sizes[ind]
                meal_carbs[meal_index] += group['CARBOH_g'].iloc[ind]
                meal_protein[meal_index] += group['PROTEIN_g'].iloc[ind]
                meal_fat[meal_index] += group['FAT_g'].iloc[ind]
                meal_fiber[meal_index] += group['FIBTG_g'].iloc[ind]
                meal_calories[meal_index] += group['ENERGY_kcal'].iloc[ind]

    portion_weights = portion_weight_conversion(meal_description, meal_weights)

    meal_carbs_list = meal_carbs.tolist()
    meal_protein_list = meal_protein.tolist()
    meal_fat_list = meal_fat.tolist()
    meal_fiber_list = meal_fiber.tolist()
    meal_calories_list = meal_calories.tolist()

    # Prepare data for writing to CSV
    row_data = {
        'Meal Description': meal_description,
        'Meal Name': meal_recipe,
        'Serving Sizes': portion_weights,
        'Meal Type': meal_type_name,
        'Country': country,
        'Source': source,
        'Weight': meal_weights,
        'Carb': meal_carbs_list,
        'Protein': meal_protein_list,
        'Fat': meal_fat_list,
        'Fiber': meal_fiber_list,
        'Calories': meal_calories_list
    }

    # Append the row to the CSV file
    with open('grouped_meals_final.csv', 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['Meal Description', 'Meal Name', 'Serving Sizes', 'Meal Type', 'Country', 'Source', 'Weight', 'Carb', 'Protein', 'Fat', 'Fiber', 'Calories'])
        if f.tell() == 0:  # If file is empty, write header
            writer.writeheader()
        writer.writerow(row_data)
    # print(subject, week_day, meal_name, total_carbs, serving_sizes, group['INGREDIENT_ENG'].tolist())


print("Grouped data has been processed and appended to 'grouped_meals.csv'.")

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Tea, hot, with milk Tea with Milk


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Rice pilaf Pilau


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Gizzard Githeri
Cucumber and vegetable namasu Sukuma Wiki
Fufu Ugali
Lomi salmon Nyama Choma
Bread, chappatti or roti Chapati


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Coffee and chicory, brewed Kenyan Chai


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Fufu Ugali
Moose Maziwa Lala


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Milk, condensed, sweetened Sweetened Milk


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Fufu Ugali
Cucumber and vegetable namasu Sukuma Wiki
Beans, liquid from stewed kidney beans Bean Stew


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Fufu Ugali
Cucumber and vegetable namasu Sukuma Wiki
Vegetarian stew Vegetable Stew


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Tea, iced, brewed, green, pre-sweetened with sugar Sweet Tea


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Fufu Ugali
Cucumber and vegetable namasu Sukuma Wiki
Seaweed soup Ewedu Soup


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


SILK Chai, soymilk Chai


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


KeyboardInterrupt: 