In [15]:
import matplotlib.pyplot as plt
from PIL import Image
from tensorflow.keras.preprocessing.image import ImageDataGenerator  
import numpy as np 
import pandas as pd
from string import ascii_letters
import nltk
# Fetch the NLTK resources used below (a no-op when they are already installed).
# 'punkt' and 'averaged_perceptron_tagger' back nltk.word_tokenize and nltk.pos_tag.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): the brown corpus is not referenced in the cells visible here — confirm it is still needed.
nltk.download('brown')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

# PREPROCESSING THE INGREDIENTS

In [11]:
# Words that must be stripped out of ingredient text: units of measure, their
# abbreviations, and a few size/filler words. 'ounc' looks like a stemmed form,
# kept so the stemmed lookup in parse_ingredients can match it.
measurements = [
    # volume
    'tablespoon', 'tbsp', 'teaspoon', 'tsp', 'cup', 'pint', 'pt',
    'quart', 'qt', 'gallon', 'gal',
    # weight / fluid
    'ounce', 'ounc', 'ounces', 'oz', 'fluid', 'fl', 'pound',
    'lb', 'liter', 'litre', 'l', 'ml', 'gram', 'g',
    # length / size / misc
    'inch', 'diameter', 'meter', 'medium',
    'grill', 'cm', 'handful', 'size', 'firm', 'cupsg', 'cupsml', 'x', 'little',
]

In [12]:
def is_noun(pos):
    """Tell whether a part-of-speech tag starts with 'NN' (the noun tags)."""
    return pos[:2] == 'NN'
# Porter stemmer; parse_ingredients stems each noun with it before checking the measurements list.
stemmer = nltk.stem.PorterStemmer()

# Module-level list of every distinct ingredient string seen so far.
# parse_ingredients appends to it as a side effect.
all_ingredients = []

def parse_ingredients(row):
    """Extract cleaned ingredient names from one recipe row.

    The 'Cleaned_Ingredients' text is split on commas. Each fragment is reduced
    to ASCII letters and spaces, tokenized, and filtered down to its nouns with
    measurement words removed. If nothing survives the filter, the fragment's
    last token is kept so the fragment is not lost.

    Side effect: every ingredient string not seen before is appended to the
    module-level ``all_ingredients`` list.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a
            'Cleaned_Ingredients' entry.

    Returns:
        list[str]: One lowercase, space-joined string per non-empty fragment.
        Returns an empty list when the cell is not a string (e.g. NaN/None).
    """
    raw = row['Cleaned_Ingredients']
    # A missing CSV cell is not a string (pandas yields NaN) and has no .split().
    if not isinstance(raw, str):
        return []

    # Build the allowed-character set once rather than once per character.
    allowed_chars = set(ascii_letters + ' ')

    # NOTE(review): splitting on ',' also splits inside a single ingredient
    # description that contains commas — confirm this matches the column format.
    ingredients = []
    for fragment in raw.split(','):
        # removing non-letters
        letter_only = ''.join(ch for ch in fragment if ch in allowed_chars)
        # A fragment with no letters would only produce an empty-string
        # "ingredient"; skip it instead of recording '' as a feature.
        if not letter_only.strip():
            continue
        # tokenizing into words
        tokenized = nltk.word_tokenize(letter_only)
        # remove all except nouns, and remove measurements
        nouns = [word.lower() for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
        nouns = [
            noun for noun in nouns
            if noun not in measurements and stemmer.stem(noun) not in measurements
        ]
        if tokenized and not nouns:
            # Fall back to the last token, lowercased like every other entry so
            # 'Salt' and 'salt' do not become two different ingredients.
            nouns.append(tokenized[-1].lower())
        # add as new row, also add to a full ingredient list as features
        joined = ' '.join(nouns)
        ingredients.append(joined)
        if joined not in all_ingredients:
            all_ingredients.append(joined)
    return ingredients

In [13]:
# Load the recipe table; parse_ingredients reads its 'Cleaned_Ingredients' column.
df = pd.read_csv('data/food.csv')

# Parse the ingredients of every row in the CSV (currently enabled).
# Applying parse_ingredients also fills the module-level all_ingredients list
# as a side effect.
df['parsed_ingredients'] = df.apply(parse_ingredients, axis=1)

# Quick look at the first few parsed rows.
print(df['parsed_ingredients'].head())

0    [chicken, kosher salt, divided, more, squash t...
1    [egg whites, potatoes, salt, pepper, rosemary,...
2    [milk, milk, powder, onion powder, paprika, pe...
3    [round loaf, cut cubes, oil, divided, sausage,...
4    [dark brown sugar, water, bourbon, lemon juice...
Name: parsed_ingredients, dtype: object


In [None]:
# Show the size of the ingredient vocabulary and a short preview instead of
# dumping every entry into the cell output.
print(f"{len(all_ingredients)} unique ingredient strings")
print(all_ingredients[:20])

# GETTING EMBEDDINGS FROM IMAGE

In [17]:
from transformers import CLIPProcessor, CLIPModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Load the pretrained CLIP checkpoint: the processor prepares the inputs that
# encode_image passes to the model, and the model produces the image features.
model_name = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name)
# Switch the model to evaluation mode; it is only used for inference here.
clip_model.eval()

# get image encodings for our model
def encode_image(image_path):
    """Encode one image file into a CLIP image embedding.

    Args:
        image_path: Path of the image to open with ``PIL.Image.open``.

    Returns:
        The tensor returned by ``clip_model.get_image_features`` for the image
        (shape ``(1, 512)`` for the example run in this notebook).
    """
    # Image.open is lazy and keeps the underlying file open. The context
    # manager releases the handle once the processor has read the pixels,
    # so encoding many images does not leak file descriptors.
    with Image.open(image_path) as image:
        # returns pytorch tensors
        inputs = clip_processor(images=image, return_tensors="pt")

    # encode image, stop gradient calculation and backpropagation
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)

    return image_features

# Example usage: embed a single image from the dataset folder.
image_path = 'data/images/images/3-ingredient-blueberry-champagne-granita.jpg'
image_features = encode_image(image_path)
print(image_features.shape)  # Shape of the image features (batch_size, feature_dim)

# For this checkpoint and a single input image the printed shape is (1, 512):
# one row per image and 512 embedding dimensions.

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


torch.Size([1, 512])
