In [2]:
# Upload the dataset into the Colab runtime; the loading cell reads it back by file name.
# `uploaded` keeps whatever files.upload() returns for the uploaded file(s).
from google.colab import files
uploaded = files.upload()



Saving usda_labeled.json to usda_labeled.json


In [4]:
import pandas as pd
import json

# Name of the uploaded file; replace with your actual filename.
# The file is read as JSON Lines: one JSON object per non-blank line.
file_name = 'usda_labeled.json'

# Parse every non-blank line as its own JSON record. The encoding is passed
# explicitly because open() otherwise uses the platform's locale encoding,
# which would make the load fail or garble text on non-UTF-8 systems.
with open(file_name, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Convert the list of records to a DataFrame (one row per record)
df = pd.DataFrame(data)

df.head(2)


Unnamed: 0,fdcId,description,calories_kcal,protein_g,carbs_g,fat_g,fiber_g,sugar_g,calcium_mg,sodium_mg,vitamin_c_mg,LabelList,nonveg,dairytype
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",307.0,5.88,41.2,13.2,1.2,5.88,,1060.0,,"[biscuits, buttermilk, pillsbury]",0,1
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",330.0,4.34,53.4,11.3,1.4,21.3,28.0,780.0,0.1,"[dough, icing, rolls]",0,1


In [5]:
# Missing values per column, before any filling
df.isna().sum()

Unnamed: 0,0
fdcId,0
description,0
calories_kcal,0
protein_g,0
carbs_g,0
fat_g,0
fiber_g,619
sugar_g,1883
calcium_mg,89
sodium_mg,89


In [6]:
# (number of rows, number of columns) of the loaded frame
df.shape

(7890, 14)

In [7]:
# Replace every missing value, in every column, with 0
df = df.fillna(0)

In [8]:
# Re-count missing values per column to confirm the fill left none behind
df.isna().sum()

Unnamed: 0,0
fdcId,0
description,0
calories_kcal,0
protein_g,0
carbs_g,0
fat_g,0
fiber_g,0
sugar_g,0
calcium_mg,0
sodium_mg,0


In [9]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz
from scipy.sparse import csr_matrix



# Nutrient columns that make up the numeric feature vector of each food
nutritional_cols = [
    'calories_kcal', 'protein_g', 'carbs_g', 'fat_g', 'fiber_g',
    'sugar_g', 'calcium_mg', 'sodium_mg', 'vitamin_c_mg'
]

# Handle NaN values (these assignments give the same result when the cell is re-run)
df[nutritional_cols] = df[nutritional_cols].fillna(0)
df['description'] = df['description'].fillna('')
# Any cell that is not a list (e.g. a filled-in missing value) becomes an empty tag list
df['LabelList'] = df['LabelList'].apply(lambda x: x if isinstance(x, list) else [])
df['nonveg'] = df['nonveg'].fillna(0)
df['dairytype'] = df['dairytype'].fillna(0)

# Verify no NaN in nutritional columns
if df[nutritional_cols].isna().any().any():
    raise ValueError(f"NaN values found in nutritional columns: {df[nutritional_cols].isna().sum()}")

# Build the model features on a copy so `df` keeps the original nutrient values.
# Transforming `df` itself made this cell non-idempotent: on a second run log1p
# was applied to already-standardized values, and anything below -1 became NaN.
nutrition_features = df[nutritional_cols].copy()

# Preprocess: cap each nutrient at its 99th percentile to limit the pull of outliers
for col in nutritional_cols:
    upper_limit = nutrition_features[col].quantile(0.99)
    nutrition_features[col] = nutrition_features[col].clip(upper=upper_limit)
# Log-compress these two columns after capping
nutrition_features['sodium_mg'] = np.log1p(nutrition_features['sodium_mg'])
nutrition_features['fat_g'] = np.log1p(nutrition_features['fat_g'])

# Standardize nutritional data (zero mean, unit variance per column)
scaler = StandardScaler()
X_nutrition = np.asarray(scaler.fit_transform(nutrition_features))

# Compute nutritional similarity between every pair of foods.
# The result is wrapped in a csr_matrix because find_replacements reads rows
# with `.toarray()`, the same way it reads the sparse text similarity matrix.
nutrition_similarity = cosine_similarity(X_nutrition)
nutrition_similarity = csr_matrix(nutrition_similarity)

# Prepare text data: description followed by the space-joined tags
df['text'] = df['description'] + ' ' + df['LabelList'].apply(lambda x: ' '.join(x if isinstance(x, list) else []))
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_text = tfidf.fit_transform(df['text'])

# Compute text similarity (kept sparse)
text_similarity = cosine_similarity(X_text, dense_output=False)

# Tags treated as recipe context rather than as ingredient names.
_RECIPE_CONTEXT_TAGS = (
    'chinese', 'stir-fry', 'sandwich', 'baking', 'roasted',
    'grilled', 'salad', 'soup', 'curry', 'stew',
)

# Terms whose presence in a description or tag list marks a candidate as a
# prepared/processed food instead of an ingredient.
_PREPARED_FOOD_TERMS = (
    'restaurant', 'prepared', 'cooked', 'baked', 'roasted', 'grilled', 'fried',
    'bologna', 'sausage', 'pie', 'crust', 'pastry', 'low fat', 'processed',
    'general tso', 'orange chicken', 'sweet and sour', 'lemon chicken', 'sesame chicken',
    'tenders', 'nuggets', 'dish', 'meal', 'recipe',
)


def _lowercase_tags(tags):
    """Return one LabelList cell as lower-cased tags; a non-list cell counts as no tags."""
    return [tag.lower() for tag in tags] if isinstance(tags, list) else []


def find_replacements(product_name, fdc_id, top_n=5, nutrition_weight=0.7, text_weight=0.3):
    """
    Suggest replacements for an ingredient in the food/recipe identified by fdc_id.

    Candidates are ranked by a weighted sum of the precomputed nutritional and
    text similarity to the selected row, then filtered so that they share a
    context tag with it, do not mention the ingredient being replaced, do not
    look like prepared dishes, and have the same `nonveg` and `dairytype`
    values. Progress and warning messages are printed as a side effect.

    Reads the module-level `df`, `nutrition_similarity` and `text_similarity`.

    Parameters:
    - product_name (str): Ingredient to replace (e.g., 'pork'). Matched literally
      and case-insensitively; surrounding whitespace is ignored.
    - fdc_id (int/str): fdcId to identify the recipe; surrounding whitespace is ignored.
    - top_n (int): Number of replacements to return.
    - nutrition_weight (float): Weight for nutritional similarity.
    - text_weight (float): Weight for text similarity.

    Returns:
    - List of product names (at most top_n), or None if the fdcId is unknown,
      the ingredient name is blank, or no candidate survives the filters.
    """
    # Values typed at an input() prompt often carry stray whitespace.
    fdc_id = str(fdc_id).strip()

    # Locate the row by POSITION: the similarity matrices and `iloc` below are
    # positional, so an index label would only be correct for a default RangeIndex.
    id_positions = np.flatnonzero((df['fdcId'].astype(str) == fdc_id).to_numpy())
    if id_positions.size == 0:
        print(f"No recipe found with fdcId: {fdc_id}")
        return None

    # Use product_name as the ingredient to replace
    core_ingredient = product_name.strip().lower()
    if not core_ingredient:
        # An empty string is a substring of every description, so it would
        # silently exclude every candidate further down.
        print("No ingredient name given; nothing to replace.")
        return None

    input_pos = int(id_positions[0])
    input_product = df['description'].iloc[input_pos]
    input_fdcid = df['fdcId'].iloc[input_pos]
    input_nonveg = df['nonveg'].iloc[input_pos]
    input_dairytype = df['dairytype'].iloc[input_pos]
    input_tags = df['LabelList'].iloc[input_pos]
    if not isinstance(input_tags, list):
        input_tags = []

    # Validate core ingredient (warn only; the search still runs)
    if core_ingredient not in input_product.lower() and core_ingredient not in _lowercase_tags(input_tags):
        print(f"Warning: '{core_ingredient}' not found in recipe description or tags. Proceeding anyway.")

    print(f"Selected recipe: {input_product} (fdcId: {input_fdcid})")
    print(f"Replacing ingredient: {core_ingredient} in recipe context: {input_tags}")

    # Similarity of the selected row to every row, combined by the given weights
    nutrition_sim_scores = nutrition_similarity[input_pos].toarray().flatten()
    text_sim_scores = text_similarity[input_pos].toarray().flatten()
    combined_scores = nutrition_weight * nutrition_sim_scores + text_weight * text_sim_scores

    # Best-first candidate positions, without the selected row itself. Five times
    # top_n are kept so that some survive the filters below.
    ranked_positions = np.argsort(combined_scores)[::-1]
    candidate_positions = [pos for pos in ranked_positions if pos != input_pos][:top_n * 5]
    top_products = df.iloc[candidate_positions][['description', 'LabelList', 'nonveg', 'dairytype']]

    # Filter for recipe context: prefer the known context tags of the selected
    # row; otherwise fall back to all of its tags except the ingredient itself.
    recipe_tags = [tag for tag in input_tags if tag in _RECIPE_CONTEXT_TAGS]
    if not recipe_tags:
        recipe_tags = [tag for tag in input_tags if tag.lower() != core_ingredient]
    shares_context = top_products['LabelList'].map(
        lambda tags: isinstance(tags, list) and any(tag in tags for tag in recipe_tags)
    ).astype(bool)
    top_products = top_products[shares_context]

    # Exclude candidates that are (or contain) the ingredient being replaced.
    # regex=False: the ingredient is user text, not a pattern, so characters
    # such as '(', '+' or '.' must be matched literally.
    mentions_core = top_products['description'].str.lower().str.contains(
        core_ingredient, na=False, regex=False
    )
    tagged_core = top_products['LabelList'].map(
        lambda tags: core_ingredient in _lowercase_tags(tags)
    ).astype(bool)
    top_products = top_products[~mentions_core & ~tagged_core]

    # Filter for ingredients (not dishes) by excluding prepared foods.
    # The terms are fixed words without regex metacharacters, so joining them
    # with '|' yields a plain alternation.
    prepared_description = top_products['description'].str.lower().str.contains(
        '|'.join(_PREPARED_FOOD_TERMS), na=False
    )
    prepared_tag = top_products['LabelList'].map(
        lambda tags: not set(_PREPARED_FOOD_TERMS).isdisjoint(_lowercase_tags(tags))
    ).astype(bool)
    top_products = top_products[~prepared_description & ~prepared_tag]

    # Filter for dietary restrictions: same flags as the selected row
    top_products = top_products[
        (top_products['nonveg'] == input_nonveg) &
        (top_products['dairytype'] == input_dairytype)
    ]

    # Return product names
    if top_products.empty:
        print(f"No suitable replacements found for '{core_ingredient}' in recipe context {recipe_tags}.")
        return None

    return top_products['description'].head(top_n).tolist()








In [18]:
if __name__ == "__main__":
    # Interactive driver: ask for the ingredient and the recipe id, then list the suggestions.
    product_name = input("Enter ingredient to replace : ")
    fdc_id = input("Enter fdcId of recipe : ")
    replacements = find_replacements(product_name, fdc_id)

    # None means find_replacements already printed why there is nothing to show.
    if replacements is not None:
        report = [f"\nReplacements for '{product_name}' in recipe:"]
        report.extend(replacements)
        for line in report:
            print(line)

Enter ingredient to replace : apple
Enter fdcId of recipe : 170959
Selected recipe: Babyfood, juice, apple (fdcId: 170959)
Replacing ingredient: apple in recipe context: ['babyfood', 'juice', 'apple']

Replacements for 'apple' in recipe:
Babyfood, juice, mixed fruit
Babyfood, juice, orange
Babyfood, juice, prune and orange
Babyfood, juice, orange and apricot
Babyfood, juice, pear


In [17]:
# Label to look up; change it to list the foods tagged with a different label.
target_label = 'apple'

# Rows whose tag list contains the label (cells that are not lists never match)
label_matches = df[df['LabelList'].apply(
    lambda labels: isinstance(labels, list) and target_label in labels
)]

# Backward-compatible alias: this frame was previously named `pork_recipes`
# although it holds the matches for `target_label`.
pork_recipes = label_matches

label_matches[['fdcId', 'description', 'LabelList']]


Unnamed: 0,fdcId,description,LabelList
10,167522,"Pie, Dutch Apple, Commercially Prepared","[pie, apple, commercially]"
1304,168816,"Fruit butters, apple","[butters, fruit, apple]"
3447,170959,"Babyfood, juice, apple","[babyfood, juice, apple]"
3465,170977,"Babyfood, dessert, dutch apple, junior","[babyfood, dessert, apple]"
3834,171346,"Babyfood, fruit, apple and raspberry, junior","[babyfood, fruit, apple]"
4213,171725,"Custard-apple, (bullock's-heart), raw","[custard, raw, apple]"
5727,173239,"Pie, apple, commercially prepared, unenriched ...","[flour, pie, apple]"
7476,174988,"Croissants, apple","[croissants, apple]"
7499,175011,"Pie, apple, commercially prepared, enriched flour","[flour, pie, apple]"
7500,175012,"Pie, apple, prepared from recipe","[pie, recipe, apple]"
