<a href="https://colab.research.google.com/github/AEStoa/ELI/blob/main/GF_C02e_Unstructured_to_Structured.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python -m spacy download en_core_web_md


In [None]:
pip install word2number

In [31]:
import spacy
import pandas as pd
import requests
from io import StringIO
from sklearn.metrics.pairwise import cosine_similarity
import re
import csv
from word2number import w2n
from fractions import Fraction
import nltk
from nltk.tokenize import word_tokenize

# spaCy pipeline shared by the embedding and keyword-matching code below
nlp = spacy.load("en_core_web_md")

# Raw GitHub locations of the CSV inputs, all served from the same repository
_RAW_REPO_ROOT = 'https://raw.githubusercontent.com/AEStoa/ELI/main'
github_features_url = f'{_RAW_REPO_ROOT}/FeatureSets.csv'
github_input_url = f'{_RAW_REPO_ROOT}/InputRecipe.csv'
github_database_url = f'{_RAW_REPO_ROOT}/RecipeDatabase2.csv'
github_c02e_url = f'{_RAW_REPO_ROOT}/C02eDatabase1.csv'


def load_database(database_url):
    """Download a two-column CSV and return it as a lookup dictionary.

    Args:
        database_url: URL of a CSV file whose first row is a header and whose
            remaining rows each hold an ingredient name followed by a numeric
            value.

    Returns:
        A dict mapping each lowercased ingredient name to its value as a
        float. The dict is empty when the server does not answer with
        HTTP 200.

    Raises:
        ValueError: if a non-blank data row does not have exactly two fields
            or its second field is not a number.
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(database_url, timeout=30)
    database = {}
    if response.status_code != 200:
        return database

    csv_reader = csv.reader(response.text.splitlines())
    next(csv_reader, None)  # Skip the header row; tolerate an empty body
    for row in csv_reader:
        if not row:
            continue  # Blank lines carry no data
        ingredient, co2e_value = row
        database[ingredient.lower()] = float(co2e_value)
    return database

# Download each CSV from GitHub and parse it into a DataFrame.
# raise_for_status() stops the run on an HTTP error instead of letting pandas
# parse the error page as if it were CSV data; the timeout avoids hanging on
# a stalled connection.

# Feature sets (keyword -> recipe/ingredients/base)
response_features = requests.get(github_features_url, timeout=30)
response_features.raise_for_status()
df_features = pd.read_csv(StringIO(response_features.text))

# CO2e emission factors per ingredient
response_c02e = requests.get(github_c02e_url, timeout=30)
response_c02e.raise_for_status()
df_c02e = pd.read_csv(StringIO(response_c02e.text))

# Input recipes offered to the user
response_input = requests.get(github_input_url, timeout=30)
response_input.raise_for_status()
df_input = pd.read_csv(StringIO(response_input.text))

# Recipe database used for similarity matching
response_database = requests.get(github_database_url, timeout=30)
response_database.raise_for_status()
df_database = pd.read_csv(StringIO(response_database.text))

# Display a list of available recipes for user selection
print("Available recipes:")
for i, recipe_name in enumerate(df_input['Recipe']):
    print(f"{i + 1}. {recipe_name}")

if df_input.empty:
    raise ValueError("The input recipe file contains no recipes to choose from.")

# Ask the user to select a recipe. Re-prompt until the answer is a number in
# the listed range: without the range check, 0 or a negative number would
# silently select a recipe counted from the end of the table.
while True:
    raw_choice = input("Enter the number corresponding to the recipe you want to check: ")
    try:
        selected_index = int(raw_choice) - 1
    except ValueError:
        print("Please enter a whole number.")
        continue
    if 0 <= selected_index < len(df_input):
        break
    print(f"Please enter a number between 1 and {len(df_input)}.")

selected_recipe_text = df_input.iloc[selected_index]['Description'].lower()  # Convert text to lowercase

# Print out the selected input recipe
print(f"\nSelected Input Recipe:\n{selected_recipe_text}")

# Function to preprocess text and create embeddings
def preprocess_and_embed(text):
    """Return the spaCy document vector of the lowercased text.

    Anything that is not a str (for example a missing CSV cell) yields None
    instead of a vector.
    """
    if not isinstance(text, str):
        return None
    return nlp(text.lower()).vector

# Keyword lists used to classify the selected recipe as meat- or plant-based
meat_keywords = ['chicken', 'pork', 'beef', 'turkey', 'shrimp']
plant_keywords = ['plant-based', 'plant-base', 'plant', 'meat-like', 'soy', 'soy-based']


def _mentions_any(text, keywords):
    """Return True when at least one keyword occurs as a substring of text."""
    for keyword in keywords:
        if keyword in text:
            return True
    return False


# Substring checks against the (already lowercased) recipe description
is_plant_recipe = _mentions_any(selected_recipe_text, plant_keywords)
is_meat_recipe = _mentions_any(selected_recipe_text, meat_keywords)

# A plant keyword takes precedence over a meat keyword
if is_plant_recipe:
    input_recipe_base = 'plant'
elif is_meat_recipe:
    input_recipe_base = 'meat'
else:
    input_recipe_base = 'unknown'

print(f"\nInput Recipe Base: {input_recipe_base}\n")

# Function to find the recipes that match keywords and base in the feature set
def find_matching_recipes(query_text, feature_set, database):
    """Return the feature-set rows whose keyword appears as a token in the query.

    Args:
        query_text: Recipe description to scan for keywords.
        feature_set: DataFrame with 'Keyword', 'Recipe', 'Ingredients' and
            'Base' columns.
        database: Unused; kept so existing callers keep working.

    Returns:
        A list of dicts with 'Recipe', 'Ingredients' and 'Base' keys, one per
        matching feature-set row, in feature-set order.
    """
    # Lowercase the keyword column once instead of once per token.
    lowered_keywords = feature_set['Keyword'].str.lower()
    known_keywords = set(lowered_keywords.dropna())

    # Tokens are lowercased as well so the comparison is case-insensitive on
    # both sides.
    keywords = {
        token.text.lower()
        for token in nlp(query_text)
        if token.text.lower() in known_keywords
    }

    matching_rows = feature_set[lowered_keywords.isin(keywords)]

    return [
        {
            'Recipe': row['Recipe'],
            'Ingredients': row['Ingredients'],
            'Base': row['Base'],
        }
        for _, row in matching_rows.iterrows()
    ]

# Function to keep only the recipes whose base matches the input recipe base
def filter_recipes_by_base_and_flavor(recipes, input_recipe_base):
    """Return the recipes whose lowercased 'Base' equals input_recipe_base.

    The input order is preserved; input_recipe_base is compared as given.
    """
    kept = []
    for candidate in recipes:
        if candidate['Base'].lower() == input_recipe_base:
            kept.append(candidate)
    return kept


def convert_common_fractions(text):
    """Replace written fraction words in text with their decimal strings.

    Args:
        text: Text that may contain 'half', 'quarter' or 'three quarter'.

    Returns:
        The text with each of those phrases replaced by '0.5', '0.25' or
        '0.75' respectively.
    """
    common_fractions = {
        'half': '0.5',
        'quarter': '0.25',
        'three quarter': '0.75',
        # Add more common fractions as needed
    }
    # Replace the longest phrases first: otherwise 'quarter' is rewritten
    # before 'three quarter' is tried and the 0.75 entry can never match.
    for fraction in sorted(common_fractions, key=len, reverse=True):
        text = text.replace(fraction, common_fractions[fraction])
    return text

def convert_fractions_to_decimals(text):
    """Replace every 'N/M' fraction in text with its decimal value.

    Args:
        text: Text that may contain digit/digit fractions such as '1/2'.

    Returns:
        The text with each fraction replaced by str(N / M). A fraction with a
        zero denominator is left unchanged.
    """
    fraction_pattern = r'(\d+)/(\d+)'

    def replace_fraction(match):
        numerator, denominator = map(int, match.groups())
        if denominator == 0:
            # Not a usable quantity; keep the original text rather than
            # raising ZeroDivisionError.
            return match.group(0)
        return str(numerator / denominator)

    return re.sub(fraction_pattern, replace_fraction, text)

def convert_written_numbers(text):
    """Replace each whitespace-separated word that w2n can parse with its number.

    Words are converted one at a time; words that w2n rejects with ValueError
    are kept as they are. The result is re-joined with single spaces.
    """
    converted = []
    for word in text.split():
        try:
            converted.append(str(w2n.word_to_num(word)))
        except ValueError:
            # Not a written-out number: keep the word untouched
            converted.append(word)
    return ' '.join(converted)

def break_numbers_from_letters(text):
    """Insert a space between a run of digits and the letters that directly follow it.

    For example '2g' becomes '2 g'.
    """
    # \1 is the digit run, \2 the letter run that was glued to it
    return re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', text)

def remove_non_alphanumeric_except_periods(text):
    """Replace every character that is neither a word character nor a period with a space."""
    unwanted = re.compile(r'[^\w.]')
    return unwanted.sub(' ', text)

def remove_punctuation_except_periods(text):
    """Delete periods that have no digit directly before or after them.

    Despite the name, only periods are affected: a period touching a digit on
    either side (as in '2.5', '5.' or '.5') is kept, every other period is
    removed.
    """
    stray_period = re.compile(r'(?<!\d)\.(?!\d)')
    return stray_period.sub('', text)

def tokenize_text(cleaned_text, database):
    """Split text into lowercase tokens, keeping known multi-word phrases whole.

    Args:
        cleaned_text: Text to tokenize; decimals such as '2.5' stay one token.
        database: Container supporting ``in`` for lowercase phrases (for
            example a dict keyed by ingredient name).

    Returns:
        A list of lowercase tokens. Wherever a run of consecutive words forms
        a phrase found in database, the longest such phrase is emitted as a
        single token.
    """
    words = re.findall(r'\b\d+\.\d+\b|\b\w+\b', cleaned_text)
    tokens = []
    i = 0

    while i < len(words):
        # Try the longest candidate first so that "soy sauce" wins over "soy"
        # when both are known; j is the exclusive end of the candidate.
        for j in range(len(words), i, -1):
            phrase = ' '.join(words[i:j]).lower()
            if phrase in database:
                tokens.append(phrase)
                i = j
                break
        else:
            # No known phrase starts here: emit the single word
            tokens.append(words[i].lower())
            i += 1

    return tokens


# Recognized unit spellings, one family (abbreviation, singular, plural) per
# row; defined at module level so every function shares the same list.
_UNIT_FAMILIES = (
    ("g", "gram", "grams"),
    ("kg", "kilogram", "kilograms"),
    ("mg", "milligram", "milligrams"),
    ("oz", "ounce", "ounces"),
    ("lb", "pound", "pounds"),
    ("ml", "milliliter", "milliliters"),
    ("l", "liter", "liters"),
    ("tsp", "teaspoon", "teaspoons"),
    ("tbsp", "tablespoon", "tablespoons"),
    ("cup", "cups"),
    ("pt", "pint", "pints"),
    ("qt", "quart", "quarts"),
    ("gal", "gallon", "gallons"),
)
UNIT_DICTIONARY = [spelling for family in _UNIT_FAMILIES for spelling in family]

def filter_valid_words(words, database):
    """Keep only ingredient phrases, numbers and units from a word list.

    Args:
        words: Sequence of word tokens in text order.
        database: DataFrame with an 'Ingredients' column of known ingredient
            names.

    Returns:
        A list containing, in order: each known ingredient phrase
        (lowercased, longest match first), each integer or decimal token
        unchanged, and each token found in UNIT_DICTIONARY (lowercased).
        All other words are dropped.
    """
    # Build the lowercase ingredient lookup once; the column does not change
    # while the words are scanned.
    known_ingredients = set(database['Ingredients'].str.lower().dropna())
    number_pattern = re.compile(r'^\d+(\.\d+)?$')

    filtered_words = []
    i = 0
    while i < len(words):
        # Longest candidate first so "garlic powder" is not cut down to
        # "garlic" when both are known; j is the exclusive end.
        for j in range(len(words), i, -1):
            phrase = ' '.join(words[i:j]).lower()
            if phrase in known_ingredients:
                filtered_words.append(phrase)
                i = j
                break
        else:
            word = words[i]
            if number_pattern.match(word):
                # A number/decimal is kept as a quantity
                filtered_words.append(word)
            elif word.lower() in UNIT_DICTIONARY:
                # A recognized unit is kept in lowercase
                filtered_words.append(word.lower())
            # Anything else is neither ingredient, quantity nor unit: drop it
            i += 1
    return filtered_words


def add_default_quantity(words):
    """Insert "1" before every unit that is not directly preceded by a number.

    A word counts as a unit when it is in UNIT_DICTIONARY, and as a number
    when it is an integer or decimal string. All original words are kept in
    order.
    """
    result = []
    for position, word in enumerate(words):
        if word in UNIT_DICTIONARY and (
            position == 0 or not re.match(r'^\d+(\.\d+)?$', words[position - 1])
        ):
            # Unit without a quantity in front of it: assume one of that unit
            result.append("1")
        result.append(word)
    return result

def group_into_tuples(words):
    """Group consecutive words into 3-tuples, dropping an incomplete final group."""
    # zip pulls three items in turn from the same iterator for each tuple
    stream = iter(words)
    return list(zip(stream, stream, stream))

def reorder_tuples(tuples):
    """Return each 3-tuple in (ingredient, quantity, unit) order.

    When the middle element is numeric the tuple is taken to be in that order
    already; otherwise it is taken to be (quantity, unit, ingredient) and is
    rotated accordingly.
    """
    return [
        (tpl[0], tpl[1], tpl[2])
        if tpl[1].replace('.', '', 1).isdigit()  # one decimal point allowed
        else (tpl[2], tpl[0], tpl[1])
        for tpl in tuples
    ]

def add_labels_to_tuples(tuples):
    """Return the tuples unchanged.

    Labeling is currently disabled. The earlier implementation built, for
    each (ingredient, quantity, unit) tuple, a 6-tuple that interleaved the
    labels "Ingredient:", "Quantity:" and "Unit:" with the three values.
    """
    return tuples

def convert_to_grams(tuples):
    """Convert (ingredient, quantity, unit) triples to gram quantities.

    The quantity is converted to float and the unit is lowercased. For a
    recognized unit the result is (ingredient, grams, 'g'); volume units are
    converted with fixed factors (1 ml counted as 1 g, 1 cup as 240 g). For an
    unrecognized unit the result keeps the float quantity and the lowercased
    unit.
    """
    # Multiplication factor to grams for each family of unit spellings
    factor_by_unit = {}
    for factor, spellings in (
        (1, ('g', 'gram', 'grams')),
        (1000, ('kg', 'kilogram', 'kilograms')),
        (28.3495, ('oz', 'ounce', 'ounces')),
        (453.592, ('lb', 'pound', 'pounds')),
        (14.7868, ('tbsp', 'tablespoon', 'tablespoons')),
        (4.92892, ('tsp', 'teaspoon', 'teaspoons')),
        (1, ('ml', 'milliliter', 'milliliters')),  # 1 ml taken as 1 gram
        (1000, ('l', 'liter', 'liters')),
        (240, ('cup', 'cups')),  # 1 cup taken as 240 grams
        (473.176, ('pt', 'pint', 'pints')),
        (946.353, ('qt', 'quart', 'quarts')),
        (3785.41, ('gal', 'gallon', 'gallons')),
    ):
        factor_by_unit.update(dict.fromkeys(spellings, factor))
    milligram_spellings = ('mg', 'milligram', 'milligrams')

    converted = []
    for ingredient, quantity, unit in tuples:
        amount = float(quantity)
        unit_key = unit.lower()

        if unit_key in milligram_spellings:
            # Milligrams are divided rather than multiplied by a factor
            converted.append((ingredient, amount / 1000, 'g'))
        elif unit_key in factor_by_unit:
            converted.append((ingredient, amount * factor_by_unit[unit_key], 'g'))
        else:
            # Unknown unit: pass the (lowercased) unit through unchanged
            converted.append((ingredient, amount, unit_key))
    return converted

def calculate_total_co2e(ingredient_matches, data):
    """Sum quantity * CO2e factor over (ingredient, quantity, unit) triples.

    `data` is a dict keyed by lowercase ingredient name; an ingredient that is
    not in it contributes 0. A line with the details and a line with the
    contribution are printed for every ingredient.

    NOTE: a later definition of calculate_total_co2e in this file (taking a
    DataFrame) rebinds this name, so this dict-based version is only in
    effect until that definition runs.
    """
    running_total = 0
    for name, amount, unit in ingredient_matches:
        grams = float(amount)
        key = name.strip().lower()
        factor = data.get(key, 0)  # unknown ingredients count as zero
        contribution = grams * factor
        running_total += contribution

        # Report the per-ingredient figures
        print(f"Ingredient: {key.capitalize()}, Quantity: {grams}g, Unit: {unit.capitalize()}, CO2e Value: {factor}")
        print(f"CO2e for {key.capitalize()}: {contribution}")

    return running_total

# Function to compare embedded recipe with the database and return top 3 matches with recipe text
def compare_and_return_top_matches_with_text(embedded_recipe, database, input_recipe_base, top_n=3):
    """Return the top_n database recipes most similar to embedded_recipe.

    This function used to carry a line-for-line copy of
    compare_and_return_top_matches; it now delegates to it so the ranking
    logic lives in one place.

    Args:
        embedded_recipe: Embedding vector of the input recipe.
        database: DataFrame with 'Recipe', 'Base' and 'Ingredients' columns.
        input_recipe_base: Only rows whose lowercased 'Base' equals this value
            are considered.
        top_n: Maximum number of matches to return.

    Returns:
        A list of (recipe_name, similarity, ingredients_text) tuples sorted by
        similarity in descending order.
    """
    return compare_and_return_top_matches(
        embedded_recipe, database, input_recipe_base, top_n=top_n
    )

# Function to compare embedded recipe with the database and return top 3 matches
def compare_and_return_top_matches(embedded_recipe, database, input_recipe_base, top_n=3):
    """Rank database recipes of the given base by similarity to embedded_recipe.

    Only rows whose lowercased 'Base' equals input_recipe_base are scored.
    Each such row's 'Ingredients' text is embedded with preprocess_and_embed
    and compared to embedded_recipe by cosine similarity; rows whose text
    cannot be embedded are skipped. When a recipe name occurs more than once,
    the last scored row for that name is the one kept.

    Returns a list of at most top_n (recipe_name, similarity, text) tuples in
    descending similarity order.
    """
    scored = {}

    for _, row in database.iterrows():
        name = row['Recipe']
        if row['Base'].lower() != input_recipe_base:
            continue  # different base: not a candidate

        ingredients_text = row['Ingredients']
        vector = preprocess_and_embed(ingredients_text)
        if vector is None:
            continue  # nothing to compare against

        score = cosine_similarity([embedded_recipe], [vector])[0][0]
        scored[name] = (score, ingredients_text)

    # Highest similarity first
    ranked = sorted(scored.items(), key=lambda entry: entry[1][0], reverse=True)

    return [(name, score, text) for name, (score, text) in ranked[:top_n]]

# Display the matching and filtered recipes
matching_recipes = find_matching_recipes(selected_recipe_text, df_features, df_database)
filtered_recipes = filter_recipes_by_base_and_flavor(matching_recipes, input_recipe_base)

# Start with no matches so the code after this branch can always read
# top_matches, even when no recipe passed the keyword/base filter.
top_matches = []

if filtered_recipes:
    print("\nRecipes matching both base and flavor keyword:")
    for recipe_info in filtered_recipes:
        print(f"- Recipe: {recipe_info['Recipe']}\n  Ingredients: {recipe_info['Ingredients']}\n  Base: {recipe_info['Base']}\n")

    # Compare the embedded recipe with the database rows of the same base and
    # return the top 3 matches
    embedded_recipe = preprocess_and_embed(selected_recipe_text)
    top_matches = compare_and_return_top_matches(embedded_recipe, df_database, input_recipe_base)

    if top_matches:
        print("\nTop 3 Matching Recipes:")
        for rank, (recipe_name, similarity, recipe_text) in enumerate(top_matches, 1):
            print(f"{rank}. Recipe: {recipe_name}, Similarity: {similarity}\n   Text: {recipe_text}\n")
    else:
        print("\nNo matching recipes found in the filtered database.")
else:
    print("\nNo recipes matching both base and flavor keyword.")

# Extract the recipe text from the top 3 matching recipes
top_recipe_texts = [recipe_text for _, _, recipe_text in top_matches]

def calculate_total_co2e(ingredient_matches, data):
    """Sum quantity * emission factor over (ingredient, quantity, unit) triples.

    Args:
        ingredient_matches: Iterable of (ingredient, quantity, unit) triples;
            the quantity is converted with float().
        data: DataFrame with an 'Ingredients' column and an
            'Emission Factor (g C02e/g of Ingredient)' column.

    Returns:
        The summed CO2e of all ingredients found in data. Ingredients without
        an exact (case-insensitive, stripped) name match are reported and
        contribute nothing.
    """
    # Lowercase the name column once instead of once per ingredient.
    known_names = data['Ingredients'].str.lower()

    total_co2e = 0
    for ingredient, quantity, unit in ingredient_matches:
        quantity = float(quantity)
        cleaned_word = ingredient.strip().lower()

        # Rows whose name exactly matches the cleaned ingredient
        match = data[known_names == cleaned_word]

        if match.empty:
            print(f"Ingredient: {cleaned_word.capitalize()} not found in the database.")
            continue

        # The first matching row supplies the emission factor
        co2e_value = match.iloc[0]['Emission Factor (g C02e/g of Ingredient)']
        ingredient_co2e = quantity * co2e_value
        total_co2e += ingredient_co2e

        # Print CO2e information for each ingredient
        print(f"Ingredient: {cleaned_word.capitalize()}, Quantity: {quantity}g, Unit: {unit.capitalize()}, CO2e Value: {co2e_value}")
        print(f"CO2e for {cleaned_word.capitalize()}: {ingredient_co2e}")

    return total_co2e



# Clean each top-matching recipe text, extract (ingredient, quantity, unit)
# triples from it, and report the recipe's total CO2e.
cleaned_recipe_texts = []
for idx, top_recipe_text in enumerate(top_recipe_texts, 1):
    lowercased_text = top_recipe_text.lower()

    # Drop sentence periods but keep the ones that touch a digit.
    cleaned_text = remove_punctuation_except_periods(lowercased_text)
    # Numeric fractions must be converted while their "/" is still present:
    # the next step replaces every non-alphanumeric character with a space.
    cleaned_text = convert_fractions_to_decimals(cleaned_text)
    cleaned_text = remove_non_alphanumeric_except_periods(cleaned_text)
    cleaned_text = break_numbers_from_letters(cleaned_text)
    # Fraction words are replaced before number words so that number-word
    # conversion cannot rewrite part of a fraction phrase (e.g. the "three"
    # in "three quarter") first.
    cleaned_text = convert_common_fractions(cleaned_text)
    cleaned_text = convert_written_numbers(cleaned_text)

    # Tokenize the cleaned text, keeping decimals as single tokens
    words = re.findall(r'\b\d+\.\d+\b|\b\w+\b', cleaned_text)

    # Keep only known ingredients, numbers/decimals, and units
    filtered_words = filter_valid_words(words, df_c02e)

    # Add a default quantity of 1 in front of any unit that lacks one
    words_with_default_quantity = add_default_quantity(filtered_words)

    # Group words into tuples of 3. The list with default quantities is the
    # one to group; grouping filtered_words would discard the inserted "1"s.
    grouped_words = group_into_tuples(words_with_default_quantity)

    # Reorder tuples to have the order ingredient, quantity, unit
    reordered_tuples = reorder_tuples(grouped_words)

    # Convert quantities and units to grams
    converted_tuples = convert_to_grams(reordered_tuples)

    # Add labels to the converted tuples
    labeled_tuples = add_labels_to_tuples(converted_tuples)

    # Append the cleaned text to the list
    cleaned_recipe_texts.append(cleaned_text)

    # Calculate total CO2e
    total_co2e = calculate_total_co2e(labeled_tuples, df_c02e)

    # Print total CO2e for the recipe
    print(f'\nTotal CO2e for the recipe {idx}: {total_co2e} g C02e\n')


Available recipes:
1. AsianPlant
2. SmokyPlant
3. BasicMeat
4. TacoPlant
Enter the number corresponding to the recipe you want to check: 1

Selected Input Recipe:
development of a plant-base prototype to replace beef protein, we want to explore the use of fiber (flaxseed to add fiber to the patty) and chicken peas flour and asian profiles (tamari / rice vinegar considered bases for asian cuisine)

Input Recipe Base: plant


Recipes matching both base and flavor keyword:
- Recipe: Plant-based asian
  Ingredients: Plant-based protein (tofu, tempeh, edamame), soy sauce, ginger, garlic, sesame oil, green onions, broccoli
  Base: plant


Top 3 Matching Recipes:
1. Recipe: PlantSmoky2, Similarity: 0.4846583604812622
   Text: Seitan 2g, chipotle peppers in adobo 2g, liquid smoke 2g, smoked paprika 2g, tamari 2g, garlic powder 2g, olive oil 2g

2. Recipe: PlantSmoky1, Similarity: 0.4739757776260376
   Text: Extra-firm tofu 5g, liquid smoke 5g, smoked paprika 5g , maple syrup 2g, soy sauce 2g, 