In [257]:
# Required packages
import os
import pickle
import random
import re
import string

import numpy as np
import pandas as pd

In [2]:
# Import functions
from data_processing import *

### Load data

In [229]:
# Read the recipe table (the first CSV column is the index) and discard the raw
# 'Ingredients' column: the 'Cleaned_Ingredients' strings are used instead.
info = pd.read_csv('info.csv', index_col=0).drop(columns='Ingredients')

In [230]:
# Preview the first rows of the loaded table
info.head()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [231]:
# Per-column summary (count / unique / top / freq) before any cleaning
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13496,13493,13501,13501
unique,13305,13464,13472,13473
top,Potato Latkes,Place ingredients in blender in the order list...,#NAME?,['']
freq,5,5,30,12


### Standardization

##### Drop NaNs

In [232]:
# Discard every row that has a missing value in any column
info = info.dropna()

In [233]:
# Placeholder values survive dropna: missing image names appear as '#NAME?' and missing ingredient lists as ['']
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13493,13493,13493,13493
unique,13302,13464,13464,13471
top,Potato Latkes,Place ingredients in blender in the order list...,#NAME?,['']
freq,5,5,30,6


In [234]:
# Remove rows whose image name is the '#NAME?' placeholder
has_image_name = info['Image_Name'].ne('#NAME?')
info = info[has_image_name]

# Remove rows with an empty ingredient list: the string "['']" contains no space,
# so requiring a space filters it out.
# NOTE(review): a list holding a single one-word ingredient also contains no space
# and would be dropped as well; confirm that this is acceptable.
has_ingredients = info['Cleaned_Ingredients'].apply(lambda text: ' ' in text)
info = info[has_ingredients]

In [235]:
# Re-check the summary after removing the placeholder rows
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13457,13457,13457,13457
unique,13267,13428,13457,13440
top,Potato Latkes,Place ingredients in blender in the order list...,miso-butter-roast-chicken-acorn-squash-panzanella,"['1 cube or 1/2 teaspoon sugar', '4 dashes Pey..."
freq,5,5,1,4


##### Drop duplicates

In [236]:
# De-duplicate on each text column in turn; drop_duplicates keeps the first
# occurrence by default.
for column in ('Title', 'Instructions', 'Cleaned_Ingredients'):
    info = info.drop_duplicates(subset=column)

In [237]:
# Re-check the summary after de-duplication (count and unique should now agree)
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13250,13250,13250,13250
unique,13250,13250,13250,13250
top,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
freq,1,1,1,1


##### Drop titles with strange characters

In [238]:
# Characters accepted in a title: ASCII letters, digits, space and basic punctuation
good_letters = list(string.ascii_letters + string.digits + ' ' + '!"#%&\'(),-.:?¿')

# Every distinct character that occurs in at least one title
chars = {char for title in info['Title'] for char in title}

# Characters outside the accepted list ...
bad_letters = list(filter(lambda char: char not in good_letters, chars))
# ... escaped so that each one is matched literally when used inside a regex
very_bad_letters = list(map(re.escape, bad_letters))

In [239]:
# Drop titles containing strange characters.
# Each entry of very_bad_letters is an escaped character; joining them with '|'
# builds a regex that matches any title containing at least one of them.
nasty_letters = '|'.join(very_bad_letters)

# Guard: when there are no strange characters the pattern is the empty string,
# which matches every title, so filtering on it would discard all rows (this
# happens when the preceding cell and this one are re-run on an already-cleaned
# table).
if nasty_letters:
    # na=True treats a missing title as containing a strange character, so such
    # rows are removed too.
    info = info[~info['Title'].str.contains(nasty_letters, regex=True, na=True)]

In [240]:
# Re-check the summary after filtering out titles with unexpected characters
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,12520,12520,12520,12520
unique,12520,12520,12520,12520
top,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
freq,1,1,1,1


We have gone from around 13500 original rows to 12520 filtered rows.

### Statistical analysis

#### Images analysis

In [241]:
# Analyze the dimensions of images
folder = 'Food Images'  # Folder name handed to look_images (presumably where the recipe images live; confirm)

look_images(info, folder) # Reported sizes are almost all 274x169 (the minimum and ~mean), so images will be resized to 274x169

Processing Images: 100%|██████████| 12520/12520 [00:02<00:00, 5837.75it/s]

Width Stats:
  Max: 702, Min: 274, Mean: 274.03, Standard Deviation: 3.82
Height Stats:
  Max: 722, Min: 169, Mean: 169.04, Standard Deviation: 4.94





#### Titles analysis

In [242]:
# Character-count statistics of the recipe titles
lengths = info['Title'].str.len()
title_report = [
    "Title lengths:",
    f"Longest title: {lengths.max()} characters.",
    f"Shortest title: {lengths.min()} characters.",
    f"Mean title length: {round(lengths.mean(), 2)} characters.",
    # ddof=0 gives the population standard deviation
    f"Standard Deviation of title lengths: {round(lengths.std(ddof=0), 2)} characters.",
]
print("\n".join(title_report))

Title lengths:
Longest title: 112 characters.
Shortest title: 3 characters.
Mean title length: 32.72 characters.
Standard Deviation of title lengths: 14.52 characters.


In [243]:
# Analyze characters: list the unique characters found in the titles with a count for each.
# NOTE(review): the counts look like "rows containing the character" rather than total occurrences; confirm in look_char.
look_char(info, 'Title')


Unique characters: 74
 !"#%&'(),-.012345679:ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

Character: SPACE
Appearances: 12349

Character: EXCLAMATION MARK
Appearances: 1

Character: QUOTATION MARK
Appearances: 99

Character: NUMBER SIGN
Appearances: 1

Character: PERCENT SIGN
Appearances: 1

Character: AMPERSAND
Appearances: 75

Character: APOSTROPHE
Appearances: 312

Character: LEFT PARENTHESIS
Appearances: 253

Character: RIGHT PARENTHESIS
Appearances: 253

Character: COMMA
Appearances: 962

Character: HYPHEN-MINUS
Appearances: 2995

Character: FULL STOP
Appearances: 24

Character: DIGIT ZERO
Appearances: 9

Character: DIGIT ONE
Appearances: 16

Character: DIGIT TWO
Appearances: 15

Character: DIGIT THREE
Appearances: 76

Character: DIGIT FOUR
Appearances: 6

Character: DIGIT FIVE
Appearances: 17

Character: DIGIT SIX
Appearances: 2

Character: DIGIT SEVEN
Appearances: 6

Character: DIGIT NINE
Appearances: 1

Character: COLON
Appearances: 13

Character: LATIN CAPITAL LETTER A

#### Ingredients analysis

In [244]:
# Character-count statistics of the ingredient-list strings
lengths = info['Cleaned_Ingredients'].str.len()
ingredients_report = [
    "Ingredients lengths:",
    f"Longest ingredient list: {lengths.max()} characters.",
    f"Shortest ingredient list: {lengths.min()} characters.",
    f"Mean ingredient list length: {round(lengths.mean(), 2)} characters.",
    # ddof=0 gives the population standard deviation
    f"Standard Deviation of ingredient list lengths: {round(lengths.std(ddof=0), 2)} characters.",
]
print("\n".join(ingredients_report))

Ingredients lengths:
Longest ingredient list: 2379 characters.
Shortest ingredient list: 11 characters.
Mean ingredient list length: 419.86 characters.
Standard Deviation of ingredient list lengths: 201.77 characters.


In [245]:
# Analyze characters: list the unique characters found in the ingredient strings with a count for each.
# NOTE(review): the counts look like "rows containing the character" rather than total occurrences; confirm in look_char.
look_char(info, 'Cleaned_Ingredients')

Unique characters: 142
 !"#%&'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}®°¼½¾¿ÁÉ×àáâäåçèéêëìíîïñóôùúûüź́‐‑–—’‚“”•″⁄⅓⅔⅛⅜⅞ﬀﬁﬂ�

Character: SPACE
Appearances: 12520

Character: EXCLAMATION MARK
Appearances: 9

Character: QUOTATION MARK
Appearances: 1768

Character: NUMBER SIGN
Appearances: 9

Character: PERCENT SIGN
Appearances: 260

Character: AMPERSAND
Appearances: 28

Character: APOSTROPHE
Appearances: 12520

Character: LEFT PARENTHESIS
Appearances: 9997

Character: RIGHT PARENTHESIS
Appearances: 9995

Character: ASTERISK
Appearances: 621

Character: PLUS SIGN
Appearances: 9

Character: COMMA
Appearances: 12501

Character: HYPHEN-MINUS
Appearances: 9496

Character: FULL STOP
Appearances: 1772

Character: SOLIDUS
Appearances: 11330

Character: DIGIT ZERO
Appearances: 2265

Character: DIGIT ONE
Appearances: 12327

Character: DIGIT TWO
Appearances: 12103

Character: DIGIT THREE
Appearances: 9103

Character: DIGIT FOUR
Appearances: 10368

Character:

#### Instructions analysis

In [246]:
# Character-count statistics of the instruction texts
lengths = info['Instructions'].str.len()
instructions_report = [
    "Instructions lengths:",
    f"Longest instructions: {lengths.max()} characters.",
    f"Shortest instructions: {lengths.min()} characters.",
    f"Mean instructions length: {round(lengths.mean(), 2)} characters.",
    # ddof=0 gives the population standard deviation
    f"Standard Deviation of instructions lengths: {round(lengths.std(ddof=0), 2)} characters.",
]
print("\n".join(instructions_report))

Instructions lengths:
Longest instructions: 12915 characters.
Shortest instructions: 40 characters.
Mean instructions length: 1037.14 characters.
Standard Deviation of instructions lengths: 699.28 characters.


In [247]:
# Analyze characters: list the unique characters found in the instructions with a count for each.
# NOTE(review): the counts look like "rows containing the character" rather than total occurrences; confirm in look_char.
look_char(info, 'Instructions')

Unique characters: 170

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}¡§¬­®°³´¹º»¼½¾¿ÁÉÑ×àáâäåçèéêëìíîïñóôõöùúûüÿˆ˚́̊​‐‑–—‘’‚“”‟•…″⁄⅓⅔⅛⅜−◊ﬀﬁﬂ�

Character: <control>
Appearances: 11057

Character: SPACE
Appearances: 12520

Character: EXCLAMATION MARK
Appearances: 189

Character: QUOTATION MARK
Appearances: 1482

Character: NUMBER SIGN
Appearances: 14

Character: DOLLAR SIGN
Appearances: 57

Character: PERCENT SIGN
Appearances: 14

Character: AMPERSAND
Appearances: 35

Character: APOSTROPHE
Appearances: 1288

Character: LEFT PARENTHESIS
Appearances: 6467

Character: RIGHT PARENTHESIS
Appearances: 6467

Character: ASTERISK
Appearances: 317

Character: PLUS SIGN
Appearances: 12

Character: COMMA
Appearances: 12318

Character: HYPHEN-MINUS
Appearances: 9337

Character: FULL STOP
Appearances: 12511

Character: SOLIDUS
Appearances: 7067

Character: DIGIT ZERO
Appearances: 8594

Character: DIGIT ONE
Appearances: 11030

Character: DIGIT TWO
App

### Vectorization

In [None]:
# Turn each text column into a {image name: text} dictionary.
# The image name is set as the index once and reused for all three lookups.
recipes_by_image = info.set_index('Image_Name')
titles_dict = recipes_by_image['Title'].to_dict()
ingredients_dict = recipes_by_image['Cleaned_Ingredients'].to_dict()
instructions_dict = recipes_by_image['Instructions'].to_dict()

In [261]:
def train_val_test_split(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=None, save=False, path=''):
    """
    Splits a dictionary into train, validation, and test sets.

    The keys are shuffled and cut into three consecutive slices. The train and
    validation sizes are rounded down; the test set receives every remaining
    key, so no item is lost to rounding.

    Args:
        data (dict): The dictionary to split.
        train_ratio (float): Proportion of data for the training set.
        val_ratio (float): Proportion of data for the validation set.
        test_ratio (float): Proportion of data for the test set.
        seed (int): Random seed for reproducibility. When given, the global
            `random` generator is re-seeded with it.
        save (bool): If True, pickle the three dictionaries as
            'train_set.pkl', 'validation_set.pkl' and 'test_set.pkl'.
        path (str): Directory in which the pickle files are written when
            `save` is True; it is created if it does not exist. The default
            '' writes to the current working directory.

    Returns:
        tuple: Three dictionaries (train_dict, val_dict, test_dict).

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    # Validate first so that invalid ratios do not re-seed the global generator
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1."

    if seed is not None:
        random.seed(seed)

    # Shuffle the keys
    keys = list(data.keys())
    random.shuffle(keys)

    # Calculate split indices (int() truncates; the remainder goes to the test set)
    total_items = len(keys)
    train_end = int(total_items * train_ratio)
    val_end = train_end + int(total_items * val_ratio)

    # Create the dictionaries from consecutive slices of the shuffled keys
    train_dict = {key: data[key] for key in keys[:train_end]}
    val_dict = {key: data[key] for key in keys[train_end:val_end]}
    test_dict = {key: data[key] for key in keys[val_end:]}

    if save:
        # Write the final datasets inside `path` (previously `path` was ignored
        # and the files always went to the working directory)
        if path:
            os.makedirs(path, exist_ok=True)
        outputs = (
            ('train_set.pkl', train_dict),
            ('validation_set.pkl', val_dict),
            ('test_set.pkl', test_dict),
        )
        for filename, subset in outputs:
            with open(os.path.join(path, filename), 'wb') as f:
                pickle.dump(subset, f)

    print("Original:", len(data))
    print("Train:", len(train_dict))
    print("Validation:", len(val_dict))
    print("Test:", len(test_dict))

    return train_dict, val_dict, test_dict

In [263]:
# Perform data split. 80-10-10 split size by default.
# seed=42 makes the shuffle reproducible; save=True also pickles the three splits to disk.
train_dict, val_dict, test_dict = train_val_test_split(titles_dict, seed=42, save=True)

Original: 12520
Train: 10016
Validation: 1252
Test: 1252
