In [281]:
#CleanList App: Build grocery lists, upload files, and produce three tiered lists with a per-item and per-list "CleanScore".
#Main Dependencies & Libraries
#!pip install streamlit pandas openpyxl torch ipynb-py-convert
from __future__ import annotations
import io, re, unicodedata, torch
from typing import Dict, List, Optional, Tuple
import pandas as pd, streamlit as st, base64 

#Extra Stuff
#!pip install transformers sentence-transformers plotly matplotlib seaborn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sentence_transformers import CrossEncoder
import matplotlib.pyplot as plt, seaborn as sns, plotly.express as px, textwrap


In [265]:
# Function for text normalization: lowercases, normalizes Unicode, collapses whitespace, and strips punctuation
def normalize_text(s: str) -> str:
    """Return a canonical, comparison-friendly form of ``s``.

    Steps, in this order:
      1. Unicode NFKC normalization (folds compatibility forms, e.g. full-width
         letters or the trademark sign, into their plain equivalents).
      2. Lowercasing. This runs *after* NFKC because normalization can itself
         produce capital letters (e.g. the trademark sign becomes "TM").
      3. Punctuation removal: every character that is neither a word character
         nor whitespace is dropped.
      4. Whitespace collapse and trim. This runs *last* so that deleting a
         stand-alone punctuation mark ("salt - pepper") cannot leave a double
         or trailing space behind.

    Args:
        s: The text to normalize.

    Returns:
        The normalized string, or "" when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""                         # Type safety - prevents crashes if a non-string sneaks in.
    s = unicodedata.normalize("NFKC", s)  # Unicode normalization (may introduce capitals)
    s = s.lower()                         # Lowercase
    s = re.sub(r"[^\w\s]", "", s)         # Punctuation removal
    s = re.sub(r"\s+", " ", s).strip()    # Collapse whitespace (after punctuation is gone)
    return s

In [266]:
# #Load and set all ingredient arrays
# Ingredient_List = pd.read_excel("Ingredient List.xlsx", engine="openpyxl")

# # Display the first few rows to confirm structure
# Ingredient_List.head()

# # Extract the column for each respective ingredient category
# STRONG_NEG = set(df["STRONG_NEG"].dropna().tolist() )
# MOD_NEG = set(df["MOD_NEG"].dropna().tolist() )
# POSITIVE = set(df["POSITIVE"].dropna().tolist() )
# ULTRA_PROCESSED_HINTS = set(df["ULTRA_PROCESSED_HINTS"].dropna().tolist() )
# WHOLE_FOOD_HINTS = set(df["WHOLE_FOOD_HINTS"].dropna().tolist() )

# # Sanity check
# print(f"Total number of ingredients: \n")
# number_of_STRONG_NEG_ingredients = len(STRONG_NEG)
# number_of_MOD_NEG_ingredients = len(MOD_NEG)
# number_of_POSITIVE_ingredients = len(POSITIVE)
# number_of_ULTRA_PROCESSED_HINTS_ingredients = len(ULTRA_PROCESSED_HINTS)
# number_of_WHOLE_FOOD_HINTS_ingredients = len(WHOLE_FOOD_HINTS)
# print(f"STRONG_NEG: {number_of_STRONG_NEG_ingredients}")
# print(f"MOD_NEG: {number_of_MOD_NEG_ingredients}")
# print(f"POSITIVE: {number_of_POSITIVE_ingredients}")
# print(f"ULTRA_PROCESSED_HINTS: {number_of_ULTRA_PROCESSED_HINTS_ingredients}")
# print(f"WHOLE_FOOD_HINTS: {number_of_WHOLE_FOOD_HINTS_ingredients}")


In [267]:
#Heuristic CleanScore engine
# Keywords that score_item counts as strong negatives.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma fuses two keywords into one
# string that matches nothing (previously "sausage" + "bht" -> "sausagebht").
STRONG_NEG = [
    # sugars/syrups
    "sugar", "high fructose corn syrup", "corn syrup",

    # trans & hydrogenated fats
    "partially hydrogenated", "hydrogenated", "trans fat", "shortening",

    # processed meats
    "bacon", "sausage",

    # preservatives/antioxidants
    "bht", "bha", "tbhq", "propyl gallate",

    # msg & intense additives
    "monosodium glutamate", "msg",

    # colors
    "red 40", "yellow 5", "yellow 6", "blue 1", "blue 2", "caramel color",

    # nitrites/nitrates
    "sodium nitrite", "sodium nitrate", "nitrite", "nitrate",

    # emulsifiers/solvents with concern
    "polysorbate 80", "propylene glycol", "titanium dioxide",

    # bromated flours
    "potassium bromate", "bromated",

    # intense sweeteners
    "aspartame", "sucralose", "acesulfame", "saccharin",

    # gums sometimes flagged
    "carboxymethylcellulose", "carrageenan",

    # generic artificial
    "artificial flavor", "artificial flavours", "artificial color", "artificial colour",
]

# Keywords that score_item counts as moderate negatives.
MOD_NEG = [
    # --- common added sugars / refined carbs ---
    "cane sugar",
    "glucose",
    "dextrose",
    "fructose",
    "invert sugar",
    "maltodextrin",
    "enriched flour",
    "bleached flour",
    "white flour",
    "refined flour",

    # --- seed/veg oils (context dependent, treated as moderate) ---
    "canola oil",
    "soybean oil",
    "vegetable oil",
    "palm oil",
    "corn oil",
    "rapeseed oil",

    # --- additives/preservatives ---
    "sodium benzoate",
    "potassium sorbate",
    "disodium phosphate",
    "phosphate",
    "xanthan gum",
    "guar gum",
    "natural flavor",
    "natural flavour",
]

# Keywords that score_item counts as positive signals.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma fuses two keywords into one
# string that matches nothing (previously "nitrate-free" + "onion" ->
# "nitrate-freeonion").
POSITIVE = [
    # quality signals / processing
    "honey", "organic", "non-gmo", "no added sugar", "unsweetened", "unsalted",
    "low sodium", "sprouted", "fermented", "probiotic",

    # proteins/fats
    "grass-fed", "pasture-raised", "wild-caught", "extra virgin olive oil",
    "olive oil", "avocado oil",

    # grains/legumes
    "whole grain", "100% whole", "whole wheat", "steel-cut oats", "brown rice",
    "quinoa", "lentils", "beans", "chickpeas", "high fiber",

    # produce & dairy descriptors
    "spinach", "kale", "broccoli", "berries", "plain yogurt", "greek yogurt",
    "whole milk yogurt",

    # cured meat improvements
    "no nitrate", "no nitrite", "nitrite-free", "nitrate-free",

    # seasoning
    "onion", "garlic", "pepper",
]


In [268]:
# Item-name category nudges 
# ULTRA_PROCESSED_HINTS = [ "chips", "soda", "soft drink", "energy drink", 
#                          "candy", "cookie", "cake", "donut", "donuts", 
#                          "instant noodle", "ramen", "frozen dinner", 
#                          "processed meat", "hot dog", "bacon", "sausage",
#                          "sugary cereal", "sweetened cereal", "ice cream", 
#                          "fries", "fried" ] 

# WHOLE_FOOD_HINTS = [ "spinach", "kale", "broccoli", "berries", "apple", 
#                     "banana", "carrot", "lettuce", "tomato", "avocado", 
#                     "onion", "garlic", "pepper", "orange", "grapefruit", 
#                     "quinoa", "oats", "oatmeal", "lentils", "beans", 
#                     "chickpeas", "nuts", "almonds", "walnuts", "sunflower seeds", 
#                     "pumpkin seeds", "yogurt", "plain yogurt", "greek yogurt", 
#                     "eggs", "salmon", "sardines" ]


In [269]:
#Function to compute a bounded CleanScore (0-100) using keyword matching across multiple categories.
def score_item(item: str, ingredients: "Optional[List[str] | str]") -> Tuple[int, Dict[str, List[str]]]:
    """Compute a bounded CleanScore (0-100) by keyword matching.

    The ingredient text is normalized with ``normalize_text`` and searched
    (substring match) for every keyword in POSITIVE, MOD_NEG and STRONG_NEG.
    Keywords are normalized the same way before the comparison; otherwise
    entries containing punctuation ("non-gmo", "grass-fed", "100% whole", ...)
    could never match, because punctuation is stripped from the text.

    Scoring: start at 50, +14 per positive keyword, -7 per moderate negative,
    -14 per strong negative, then clamp to the 0-100 range.

    Item-name hints (WHOLE_FOOD_HINTS / ULTRA_PROCESSED_HINTS) are currently
    disabled and do not contribute to the score.

    Args:
        item: Item name. Only used as the text to scan when ``ingredients``
            is empty or None.
        ingredients: A list of ingredient strings, or a single string holding
            the whole ingredient list. Falsy values fall back to ``item``.

    Returns:
        ``(score, matched)`` where ``score`` is an int in [0, 100] and
        ``matched`` maps "positive", "moderate_neg" and "strong_neg" to the
        keywords (as spelled in the keyword lists) that were found.
    """
    parts = ingredients or [item]
    # A bare string would otherwise be iterated character by character,
    # yielding text like "s u g a r" that matches no keyword.
    if isinstance(parts, str):
        parts = [parts]
    ing_text = " ".join(normalize_text(part) for part in parts)

    def _hits(keywords):
        """Return the keywords whose normalized form occurs in ing_text."""
        found = []
        for kw in keywords:
            needle = normalize_text(kw)
            # Skip keywords that normalize to "": an empty needle is a
            # substring of every text and would always "match".
            if needle and needle in ing_text:
                found.append(kw)
        return found

    matched = {
        "positive": _hits(POSITIVE),
        "moderate_neg": _hits(MOD_NEG),
        "strong_neg": _hits(STRONG_NEG),
    }

    # Scoring logic (bounded)
    score = 50
    score += 14 * len(matched["positive"])       # per unique positive
    score -= 7 * len(matched["moderate_neg"])    # per unique moderate negative
    score -= 14 * len(matched["strong_neg"])     # per unique strong negative
    score = max(0, min(100, score))
    return score, matched

In [273]:
# --- Predefined Grocery Database  ---
# Item name -> ingredient list. Keys are lowercase because the lookup in the
# UI is done with the normalized (lowercased) user input.
grocery_db = {
    "granola": [
        "organic oats",
        "cane sugar",
        "honey",
        "almonds",
    ],
    "peanut butter": [
        "roasted peanuts",
        "salt",
        "palm oil",
    ],
    "instant noodles": [
        "wheat flour",
        "salt",
        "monosodium glutamate",
        "vegetable oil",
    ],
    "almond milk": [
        "filtered water",
        "almonds",
        "calcium carbonate",
        "vitamin D2",
    ],
    "protein bar": [
        "soy protein isolate",
        "chocolate chips",
        "sugar",
        "natural flavors",
    ],
}

In [292]:
#                                                           --- Streamlit UI ---

#  ******** Header design ******** 

#st.title("🛒 CleanList")

# Function to convert Logo to base64 so it can be embedded directly
def convert_logo_to_base64(logo_file_path):
    """Read the file at ``logo_file_path`` and return its content base64-encoded.

    Args:
        logo_file_path: Path of the file to read (opened in binary mode).

    Returns:
        The file's bytes encoded as base64, decoded to a ``str`` so it can be
        embedded directly in text such as an HTML data URI.
    """
    with open(logo_file_path, mode="rb") as logo_file:
        encoded = base64.b64encode(logo_file.read())
    return encoded.decode()
    
# Load and encode the logo
# Base64 copy of the logo; only consumed by the commented-out HTML variant below.
Logo = convert_logo_to_base64("CleanList logo.png")

# Show the logo in the top-left corner as a clickable link
st.logo("CleanList logo.png", size="large", link="https://cleanlist.com")
# st.markdown(f""" <a href="https://cleanlist.com" target="_blank"> 
#             <img src="data:image/png;base64,{Logo}" width="150"></a>""", unsafe_allow_html=True)

#  ******** Item Scoring ********
st.markdown(" ")  # spacer between the logo and the input box
user_input = st.text_input("Enter the item name", placeholder="e.g: Granola")
# st.write("You entered:", user_input)

if user_input:
    # Look the item up by its normalized name
    item = normalize_text(user_input)
    if item not in grocery_db:
        st.error("Item not found.")
    else:
        ingredients = grocery_db[item]
        st.success(f"Ingredients for '{user_input}':")
        st.write(", ".join(ingredients))
        final_score = score_item(item, ingredients)

        # Colour band: below 50 red, below 80 orange (yellow is hard to read
        # on a white background), otherwise green.
        color = (
            "red" if final_score[0] < 50
            else "orange" if final_score[0] < 80
            else "green"
        )

        # Render the score as a coloured heading
        st.markdown(f"<h3 style='color:{color}'>CleanScore = {final_score[0]}/100</h3>", unsafe_allow_html=True)
    



In [270]:
#Function for dampening the raw score
# def apply_dampening(base_score: float, ingredients: list[str], STRONG_NEG: set, MOD_NEG: set, POSITIVE: set) -> float:
#     # Normalize arguments
#     ingredients = [normalize_text(i) for i in ingredients]
#     STRONG_NEG = set(normalize_text(i) for i in STRONG_NEG)
#     MOD_NEG = set(normalize_text(i) for i in MOD_NEG)
#     POSITIVE = set(normalize_text(i) for i in POSITIVE)

#     # Flags for presence
#     has_strong_neg = any(i in STRONG_NEG for i in ingredients)
#     has_mod_neg = any(i in MOD_NEG for i in ingredients)
#     has_positive = any(i in POSITIVE for i in ingredients)

#     # Apply dampening logic
#     if has_positive and has_strong_neg:
#         base_score *= 0.75  # Strong conflict
#     elif has_positive and has_mod_neg:
#         base_score *= 0.85  # Moderate conflict
#     elif has_positive:
#         base_score *= 1.05  # Slight boost if no negatives
#     return round(base_score, 2)

In [271]:
#Wrapping function for scoring and dampening
# def score_with_dampening(item: str, ingredients: Optional[str]) -> float:
#     raw_score, details = score_item(item, ingredients)
#     all_signals = details["positive"] + details["moderate_neg"] + details["strong_neg"]
#     return raw_score, apply_dampening(raw_score, all_signals, STRONG_NEG, MOD_NEG, POSITIVE), details

In [275]:
#Streamlit app directory: cd "OneDrive - Emory\Entreprenuership\CleanList\Python" 
#Run these in a command prompt to test the Streamlit app:
    # 1) ipynb-py-convert CleanList.ipynb CleanList.py
    # 2) streamlit run CleanList.py   
#To end run in terminal, use "CTRL + C"

In [272]:
# Smoke test: score one hand-written item and print the categorization and score.
item, ingredients = "Vanilla", ["Organic cane sugar", "oats", "honey"]
final_score = score_item(item=item, ingredients=ingredients)

# final_score is (score, matched-keywords dict)
print("Ingredient Categorization = ", final_score[1])
print("Raw Score = ", final_score[0])

Ingredient Categorization =  {'positive': ['honey', 'organic'], 'moderate_neg': ['cane sugar'], 'strong_neg': ['sugar']}
Raw Score =  57
