# Recipe Prediction Analysis

**Name(s)**: Coleman Clougherty and Jamera Fernando

**Website Link**: (your website link)

In [78]:
import pandas as pd
import numpy as np
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from dsc80_utils import * # Feel free to uncomment and use this.
import re
import matplotlib.pyplot as plt

## Step 1: Introduction

In [79]:
interactions = pd.read_csv(Path('data') /'RAW_interactions.csv')
recipes = pd.read_csv(Path('data') / 'RAW_recipes.csv')

In [80]:
recipes['nutrition']

0            [138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]
1        [595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]
2           [194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]
                             ...                     
83779            [59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]
83780       [188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]
83781        [174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]
Name: nutrition, Length: 83782, dtype: object

In [106]:
recipes['tags']

0        ['60-minutes-or-less', 'time-to-make', 'course...
1        ['60-minutes-or-less', 'time-to-make', 'cuisin...
2        ['60-minutes-or-less', 'time-to-make', 'course...
                               ...                        
83779    ['60-minutes-or-less', 'time-to-make', 'course...
83780    ['30-minutes-or-less', 'time-to-make', 'course...
83781    ['30-minutes-or-less', 'time-to-make', 'course...
Name: tags, Length: 83782, dtype: object

In [81]:
recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [82]:
interactions['recipe_id'].sort_values()

70222         38
70223         38
70221         38
           ...  
475342    537543
277439    537671
295955    537716
Name: recipe_id, Length: 731927, dtype: int64

In [83]:
interactions[interactions['review'].isna()].describe()

Unnamed: 0,user_id,recipe_id,rating
count,1.69e+02,169.00,169.00
mean,1.83e+09,214467.93,4.72
std,5.52e+08,163237.62,0.70
...,...,...,...
50%,2.00e+09,166775.00,5.00
75%,2.00e+09,356724.00,5.00
max,2.00e+09,533896.00,5.00


In [84]:
merged = recipes.merge(interactions, left_on='id', right_on='recipe_id', how='left')
merged.loc[merged['rating'] == 0, 'rating'] = np.nan
avg_rating = merged.groupby('id')['rating'].mean()
avg_rating

recipes2 = recipes.merge(avg_rating.rename('avg_rating'), left_on='id', right_index=True, how='left')
# recipes[recipes['avg_rating'].notnull()]
recipes2

Unnamed: 0,name,id,minutes,contributor_id,...,description,ingredients,n_ingredients,avg_rating
0,1 brownies in the world best ever,333281,40,985201,...,"these are the most; chocolatey, moist, rich, d...","['bittersweet chocolate', 'unsalted butter', '...",9,4.0
1,1 in canada chocolate chip cookies,453467,45,1848091,...,this is the recipe that we use at my school ca...,"['white sugar', 'brown sugar', 'salt', 'margar...",11,5.0
2,412 broccoli casserole,306168,40,50969,...,since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,5.0
...,...,...,...,...,...,...,...,...,...
83779,zydeco ya ya deviled eggs,308080,40,37779,...,"deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,5.0
83780,cookies by design cookies on a stick,298512,29,506822,...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,1.0
83781,cookies by design sugar shortbread cookies,298509,20,506822,...,"i've heard of the 'cookies by design' company,...","['granulated sugar', 'shortening', 'eggs', 'fl...",7,3.0


In [85]:
recipes2['tags'].iloc[0]

"['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'for-large-groups', 'desserts', 'lunch', 'snacks', 'cookies-and-brownies', 'chocolate', 'bar-cookies', 'brownies', 'number-of-servings']"

In [136]:
def tags_to_list(tags):
    pattern =r"'([\w-]*)'"
    unused = r"('(.*)',)|(, '(.*)')"
    return re.findall(pattern, tags)

recipes2['tags_list'] = recipes2['tags'].transform(tags_to_list)

# nutrition_dict={}
# def nutrition_to_columns(nutrition):
#     pattern = r"\d+\.\d*"
#     matches = re.findall(pattern, nutrition)
#     nutrition_dict['calories'] = nutrition_dict.get('calories', []) + [float(matches[0])]
#     nutrition_dict['total_fat'] = nutrition_dict.get('fat', []) + [float(matches[1])]
#     nutrition_dict['sugar'] = nutrition_dict.get('sugar', []) + [float(matches[2])]
#     nutrition_dict['sodium'] = nutrition_dict.get('sodium', []) + [float(matches[3])]
#     nutrition_dict['protein'] = nutrition_dict.get('protein', []) + [float(matches[4])]
#     nutrition_dict['saturated_fat'] = nutrition_dict.get('saturated_fat', []) + [float(matches[5])]
#     nutrition_dict['carbohydrates'] = nutrition_dict.get('carbohydrates', []) + [float(matches[6])]

df_extracted = recipes2['nutrition'].str.findall(r"\d+\.\d*").apply(pd.Series)
df_extracted.columns = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']
df_extracted

recipes3 = pd.concat([recipes2, df_extracted], axis=1).drop(columns=['nutrition', 'tags'])
recipes3['tags_list'].iloc[0]

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'for-large-groups',
 'desserts',
 'lunch',
 'snacks',
 'cookies-and-brownies',
 'chocolate',
 'bar-cookies',
 'brownies',
 'number-of-servings']

In [109]:
recipes3['tags_list'].iloc[501]

['30-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'cuisine',
 'preparation',
 'north-american',
 'main-dish',
 'beans',
 'vegetables',
 'mexican',
 'easy',
 'beginner-cook',
 'kid-friendly',
 'vegetarian',
 'dietary',
 'one-dish-meal',
 'inexpensive']

In [207]:
recipes3['avg_rating'].hist(nbins=10)

In [203]:
px.histogram(recipes3, x='avg_rating', nbins=10, title='Average Recipe Ratings', labels={'avg_rating':'Average Rating', 'count':'Count'})

In [88]:
data = interactions.groupby('recipe_id')['rating'].count()

In [89]:
px.box(data, x='rating', title='Number of Ratings per Recipe', log_x=True, labels = {'rating':'Number of Ratings (scaled log)'})

In [110]:
recipes3['tags_list']

0        [60-minutes-or-less, time-to-make, course, mai...
1        [60-minutes-or-less, time-to-make, cuisine, pr...
2        [60-minutes-or-less, time-to-make, course, mai...
                               ...                        
83779    [60-minutes-or-less, time-to-make, course, mai...
83780    [30-minutes-or-less, time-to-make, course, pre...
83781    [30-minutes-or-less, time-to-make, course, pre...
Name: tags_list, Length: 83782, dtype: object

In [130]:
all_tags = [str(tag) for sublist in recipes2['tags_list'] for tag in sublist]
unique_tags = list(set(all_tags))

In [131]:
all_tags

['60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'for-large-groups',
 'desserts',
 'lunch',
 'snacks',
 'cookies-and-brownies',
 'chocolate',
 'bar-cookies',
 'brownies',
 'number-of-servings',
 '60-minutes-or-less',
 'time-to-make',
 'cuisine',
 'preparation',
 'north-american',
 'for-large-groups',
 'canadian',
 'british-columbian',
 'number-of-servings',
 '60-minutes-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'side-dishes',
 'vegetables',
 'easy',
 'beginner-cook',
 'broccoli',
 'time-to-make',
 'course',
 'cuisine',
 'preparation',
 'occasion',
 'north-american',
 'desserts',
 'american',
 'southern-united-states',
 'dinner-party',
 'holiday-event',
 'cakes',
 'dietary',
 'christmas',
 'thanksgiving',
 'low-sodium',
 'low-in-something',
 'taste-mood',
 'sweet',
 '4-hours-or-less',
 'time-to-make',
 'course',
 'main-ingredient',
 'preparation',
 'main-dish',
 'potatoes',
 'vegetables',
 '4-hours-or-less',
 'meatl

In [132]:
unique_tags

['vegan',
 '',
 'winter',
 'high-fiber',
 'australian',
 'refrigerator',
 'ham-and-bean-soup',
 'ontario',
 'course',
 'greens',
 'seafood',
 'whole-turkey',
 'st-patricks-day',
 'new-years',
 'somalian',
 'deep-fry',
 'cuisine',
 'pot-roast',
 'cheese',
 'oamc-freezer-make-ahead',
 'number-of-servings',
 'apples',
 'bread-machine',
 'sandwiches',
 'raspberries',
 'baked-beans',
 'tex-mex',
 'mixer',
 'savory',
 'to-go',
 'south-west-pacific',
 'orange-roughy',
 'microwave',
 'lamb-sheep-main-dish',
 'pork-loin',
 'pies-and-tarts',
 'melons',
 'heirloom-historical',
 'elbow-macaroni',
 'cinco-de-mayo',
 'presentation',
 'savory-sauces',
 'meat',
 'veal',
 'camping',
 'one-dish-meal',
 'egyptian',
 'mushroom-soup',
 'for-large-groups',
 'high-protein',
 'chicken-breasts',
 'wedding',
 'oranges',
 'steaks',
 'cod',
 'pressure-canning',
 'simply-potatoes',
 'nigerian',
 'russian',
 'oysters',
 'italian',
 'fudge',
 'low-fat',
 'english',
 'spinach',
 'crock-pot-slow-cooker',
 'portuguese'

In [None]:
recipes3['name'] = recipes3['name'].fillna('')

In [138]:
ser_tags = pd.Series(unique_tags)

In [162]:
ser_tags[ser_tags.str.contains('india')]

420    indian
dtype: object

In [122]:
'food' in unique_tags

False

In [148]:
def tag_contains(tags, name):
    name = name.lower()
    return name in tags

christmas = recipes3[recipes3['tags_list'].apply(tag_contains, name='halloween')]
christmas[christmas['tags_list'].apply(tag_contains, name='holiday-event')].shape[0]
# christmas.shape[0]

229

In [189]:
american[american['is_american']]

Unnamed: 0,name,id,minutes,contributor_id,...,protein,saturated_fat,carbohydrates,is_american
3,millionaire pound cake,286009,120,461724,...,20.0,123.0,39.0,True
11,rter med flsk pea soup with pork,333797,195,64642,...,44.0,12.0,0.0,True
15,go to bbq sauce for ribs,495314,13,488441,...,2.0,0.0,19.0,True
...,...,...,...,...,...,...,...,...,...
83759,zuppa di pesce castagna,392620,70,62264,...,68.0,15.0,6.0,True
83776,zydeco sauce,357451,15,461283,...,1.0,14.0,5.0,True
83777,zydeco soup,486161,60,227978,...,44.0,21.0,15.0,True


In [183]:
american['calories']

0        138.4
1        595.1
2        194.8
         ...  
83779     59.2
83780    188.0
83781    174.9
Name: calories, Length: 83782, dtype: object

In [210]:
american['total_fat'].hist(nbins=20)

In [193]:
american = recipes3.copy()
american['is_american'] = recipes3['tags_list'].apply(tag_contains, name='american')
# print(american.head())
px.histogram(american, x='total_fat', nbins = 20, title='Average Ratings Recipes')

In [151]:
recipes3[recipes3['name'].str.lower().str.contains('filipino')]

Unnamed: 0,name,id,minutes,contributor_id,...,sodium,protein,saturated_fat,carbohydrates
1024,adobo style shrimp filipino,374461,11,544754,...,201.0,97.0,12.0,4.0
11344,buko salad filipino young coconut fruit salad,306016,10,385678,...,3.0,12.0,86.0,17.0
12401,camaron rebosado filipino fried shrimp,527455,42,314579,...,52.0,40.0,110.0,5.0
...,...,...,...,...,...,...,...,...,...
76570,the filipino elvis sandwich,527485,13,135470,...,36.0,12.0,221.0,18.0
80329,vegetarian pansit noodles filipino,305777,50,385678,...,26.0,16.0,14.0,20.0
80358,vegetarian sinigang filipino tamarind or sour...,313263,35,602448,...,7.0,29.0,3.0,11.0


In [None]:
recipes3['tags_list'].count()

np.int64(83782)

In [None]:
recipes3['ingredients'].iloc[0]

"['bittersweet chocolate', 'unsalted butter', 'eggs', 'granulated sugar', 'unsweetened cocoa powder', 'vanilla extract', 'brewed espresso', 'kosher salt', 'all-purpose flour']"

In [98]:
# all_tags = recipes2['tags_list'].sum()

## Step 2: Data Cleaning and Exploratory Data Analysis

In [99]:
# TODO

## Step 3: Assessment of Missingness

In [100]:
# TODO

## Step 4: Hypothesis Testing

In [101]:
# TODO

## Step 5: Framing a Prediction Problem

In [102]:
# TODO

## Step 6: Baseline Model

In [103]:
# TODO

## Step 7: Final Model

In [104]:
# TODO

## Step 8: Fairness Analysis

In [105]:
# TODO