In [1]:
# Required packages
import pandas as pd
import numpy as np

In [2]:
# Import functions
from data_processing import *

### Load data

In [3]:
# Build the working table in a single chain:
#   1. read the CSV, using its first column as the index;
#   2. discard the raw 'Ingredients' column (the 'Cleaned_Ingredients'
#      strings are used instead);
#   3. remove every row that contains a missing value.
info = (
    pd.read_csv('info.csv', index_col=0)
    .drop(columns='Ingredients')
    .dropna()
)

In [4]:
# Preview the first rows of the table after dropping 'Ingredients' and the
# rows with missing values.
info.head()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
0,Miso-Butter Roast Chicken With Acorn Squash Pa...,"Pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher..."
1,Crispy Salt and Pepper Potatoes,Preheat oven to 400°F and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,"['2 large egg whites', '1 pound new potatoes (..."
2,Thanksgiving Mac and Cheese,Place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,"['1 cup evaporated milk', '1 cup whole milk', ..."
3,Italian Sausage and Bread Stuffing,Preheat oven to 350°F with rack in middle. Gen...,italian-sausage-and-bread-stuffing-240559,"['1 (¾- to 1-pound) round Italian loaf, cut in..."
4,Newton's Law,Stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,"['1 teaspoon dark brown sugar', '1 teaspoon ho..."


In [5]:
# Summary (count / unique / top / freq) of the remaining columns.
# In the saved output the most frequent Image_Name is the placeholder
# '#NAME?' (30 rows): that is how missing image names appear in this file.
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13493,13493,13493,13493
unique,13302,13464,13464,13471
top,Potato Latkes,Place ingredients in blender in the order list...,#NAME?,['']
freq,5,5,30,6


In [6]:
# Keep only the recipes whose image name is a real value, i.e. discard the
# rows where Image_Name is the '#NAME?' placeholder.
info = info.loc[info['Image_Name'].ne('#NAME?')]

In [7]:
# Re-check the summary after removing the '#NAME?' rows: in the saved output
# the Image_Name count now equals its number of unique values.
info.describe()

Unnamed: 0,Title,Instructions,Image_Name,Cleaned_Ingredients
count,13463,13463,13463,13463
unique,13273,13434,13463,13441
top,French 75,Place ingredients in blender in the order list...,miso-butter-roast-chicken-acorn-squash-panzanella,['']
freq,5,5,1,6


### Statistical analysis

#### Images analysis

In [8]:
# Analyze the dimensions of the images.
# `folder` is the directory name handed to look_images together with the
# recipe table (presumably the folder holding the image files — confirm).
# look_images is not defined in this notebook; presumably it comes from the
# `data_processing` wildcard import.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# recorded run was stopped before it finished — presumably the call is slow;
# confirm before relying on "Run All".
folder = 'Food Images'

look_images(info, folder)

KeyboardInterrupt: 

#### Titles analysis

In [None]:
# Character-length statistics of the recipe titles.
# The reductions are taken directly on the Series; ddof=0 reproduces the
# population standard deviation that np.std computes.
lengths = info['Title'].str.len()
print(
    "Title lengths:",
    f"Longest title: {lengths.max()} characters.",
    f"Shortest title: {lengths.min()} characters.",
    f"Mean title length: {round(lengths.mean(), 2)} characters.",
    f"Standard Deviation of title lengths: {round(lengths.std(ddof=0), 2)} characters.",
    sep="\n",
)

Title lengths:
Longest title: 112 characters.
Shortest title: 3 characters.
Mean title length: 32.75 characters.
Standard Deviation of title lengths: 14.76 characters.


In [None]:
# Character inventory of the 'Title' column.
# look_char is not defined in this notebook (presumably it comes from the
# `data_processing` wildcard import). Judging by the saved output it prints
# the number of unique characters, the characters themselves, and then each
# character's Unicode name followed by an "Appearances" count.
look_char(info, 'Title')


Unique characters: 136

 !"#%&'()+,-./012345679:;ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ®ÁÉàáâãäçèéêëìíîïñòóôöøùúûüōờ́̃̉Сикнры –—‘’“”강개닭된장전정찌파

Character: <control>
Appearances: 3

Character: SPACE
Appearances: 13255

Character: EXCLAMATION MARK
Appearances: 1

Character: QUOTATION MARK
Appearances: 101

Character: NUMBER SIGN
Appearances: 1

Character: PERCENT SIGN
Appearances: 1

Character: AMPERSAND
Appearances: 81

Character: APOSTROPHE
Appearances: 330

Character: LEFT PARENTHESIS
Appearances: 294

Character: RIGHT PARENTHESIS
Appearances: 294

Character: PLUS SIGN
Appearances: 2

Character: COMMA
Appearances: 1030

Character: HYPHEN-MINUS
Appearances: 3152

Character: FULL STOP
Appearances: 25

Character: SOLIDUS
Appearances: 6

Character: DIGIT ZERO
Appearances: 10

Character: DIGIT ONE
Appearances: 18

Character: DIGIT TWO
Appearances: 15

Character: DIGIT THREE
Appearances: 76

Character: DIGIT FOUR
Appearances: 6

Character: DIGIT FIVE
Appearances: 21

Charac

#### Ingredients analysis

In [None]:
# Character-length statistics of the cleaned ingredient lists.
# NOTE(review): the lengths appear to be measured on the stringified list
# (brackets and quotes included) — the 4-character minimum in the saved
# output matches the value "['']"; confirm that this is intended.
# ddof=0 reproduces the population standard deviation that np.std computes.
lengths = info['Cleaned_Ingredients'].str.len()
print(
    "Ingredients lengths:",
    f"Longest ingredient list: {lengths.max()} characters.",
    f"Shortest ingredient list: {lengths.min()} characters.",
    f"Mean ingredient list length: {round(lengths.mean(), 2)} characters.",
    f"Standard Deviation of ingredient list lengths: {round(lengths.std(ddof=0), 2)} characters.",
    sep="\n",
)

Ingredients lengths:
Longest ingredient list: 2379 characters.
Shortest ingredient list: 4 characters.
Mean ingredient list length: 419.14 characters.
Standard Deviation of ingredient list lengths: 202.22 characters.


In [None]:
# Character inventory of the 'Cleaned_Ingredients' column.
# look_char is not defined in this notebook (presumably it comes from the
# `data_processing` wildcard import). Judging by the saved output it prints
# the number of unique characters, the characters themselves, and then each
# character's Unicode name followed by an "Appearances" count.
# NOTE(review): the largest counts equal the number of rows (13463), so
# "Appearances" presumably counts rows containing the character rather than
# total occurrences — confirm in data_processing.
look_char(info, 'Cleaned_Ingredients')

Unique characters: 147
 !"#%&'()*+,-./0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}®°¼½¾¿ÁÉ×àáâäåçèéêëìíîïñóôõùúûüōź̀́̂‐‑–—’‚“”•″⁄™⅓⅔⅛⅜⅞ﬀﬁﬂ�

Character: SPACE
Appearances: 13457

Character: EXCLAMATION MARK
Appearances: 9

Character: QUOTATION MARK
Appearances: 1879

Character: NUMBER SIGN
Appearances: 10

Character: PERCENT SIGN
Appearances: 276

Character: AMPERSAND
Appearances: 31

Character: APOSTROPHE
Appearances: 13463

Character: LEFT PARENTHESIS
Appearances: 10724

Character: RIGHT PARENTHESIS
Appearances: 10721

Character: ASTERISK
Appearances: 681

Character: PLUS SIGN
Appearances: 9

Character: COMMA
Appearances: 13436

Character: HYPHEN-MINUS
Appearances: 10184

Character: FULL STOP
Appearances: 1864

Character: SOLIDUS
Appearances: 12161

Character: DIGIT ZERO
Appearances: 2446

Character: DIGIT ONE
Appearances: 13244

Character: DIGIT TWO
Appearances: 13002

Character: DIGIT THREE
Appearances: 9807

Character: DIGIT FOUR
Appearances: 11162

C

#### Instructions analysis

In [None]:
# Character-length statistics of the recipe instructions.
# The reductions are taken directly on the Series; ddof=0 reproduces the
# population standard deviation that np.std computes.
lengths = info['Instructions'].str.len()
print(
    "Instructions lengths:",
    f"Longest instructions: {lengths.max()} characters.",
    f"Shortest instructions: {lengths.min()} characters.",
    f"Mean instructions length: {round(lengths.mean(), 2)} characters.",
    f"Standard Deviation of instructions lengths: {round(lengths.std(ddof=0), 2)} characters.",
    sep="\n",
)

Instructions lengths:
Longest instructions: 13952 characters.
Shortest instructions: 40 characters.
Mean instructions length: 1040.61 characters.
Standard Deviation of instructions lengths: 711.15 characters.


In [None]:
# Character inventory of the 'Instructions' column.
# look_char is not defined in this notebook (presumably it comes from the
# `data_processing` wildcard import). Judging by the saved output it prints
# the number of unique characters, the characters themselves, and then each
# character's Unicode name followed by an "Appearances" count.
# NOTE(review): the first "<control>" entry (11875 appearances) is presumably
# the newline character — confirm.
look_char(info, 'Instructions')

Unique characters: 174

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz{|}¡§¬­®°±³´¹º»¼½¾¿ÁÉÑ×àáâäåçèéêëìíîïñóôõöùúûüÿōˆ˚́̊​‐‑–—‘’‚“”‟•…″⁄⅓⅔⅛⅜−◊ﬀﬁﬂ�

Character: <control>
Appearances: 11875

Character: <control>
Appearances: 1

Character: SPACE
Appearances: 13463

Character: EXCLAMATION MARK
Appearances: 209

Character: QUOTATION MARK
Appearances: 1567

Character: NUMBER SIGN
Appearances: 15

Character: DOLLAR SIGN
Appearances: 66

Character: PERCENT SIGN
Appearances: 15

Character: AMPERSAND
Appearances: 41

Character: APOSTROPHE
Appearances: 1404

Character: LEFT PARENTHESIS
Appearances: 6955

Character: RIGHT PARENTHESIS
Appearances: 6955

Character: ASTERISK
Appearances: 348

Character: PLUS SIGN
Appearances: 12

Character: COMMA
Appearances: 13230

Character: HYPHEN-MINUS
Appearances: 10037

Character: FULL STOP
Appearances: 13454

Character: SOLIDUS
Appearances: 7574

Character: DIGIT ZERO
Appearances: 9205

Character: DIGIT ONE
Ap