# Recipes and Ratings: Exploratory Analysis of Average Recipe Ratings

**Name(s)**: Colin Czarnik and Arthur Yang

**Website Link**: (your website link)

In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * # Feel free to uncomment and use this. It'll make your plotly graphs look like ours in lecture!

## Step 1: Introduction

In [2]:
# Read the recipe table and the user-interaction (rating/review) table from the working directory.
recipes, interactions = (
    pd.read_csv(csv_name)
    for csv_name in ('RAW_recipes.csv', 'RAW_interactions.csv')
)


In [3]:
# Count recipes whose `submitted` value compares greater than the string '2008'
# (the comparison is exactly the one `>` performs on the column).
recipes['submitted'].gt('2008').sum()

83782

In [4]:
# Preview the first rows of the interactions table to see which columns it provides.
interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [27]:
# Left-join interactions onto recipes by recipe id: every recipe row is kept and
# repeated once per matching interaction.
merged = pd.merge(
    recipes,
    interactions,
    how='left',
    left_on='id',
    right_on='recipe_id',
)
merged.shape

(1132367, 17)

In [6]:
# List the columns of the merged table (recipe columns followed by interaction columns).
merged.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients', 'user_id', 'recipe_id', 'date', 'rating', 'review'],
      dtype='object')

In [7]:
# Example review attached to a rating of 0: it shows that a 0 means the reviewer
# did not give a star rating, not that they hated the recipe.
merged.loc[merged['rating'] == 0, 'review'].iloc[10]

"Skipped the onion, salt, and croutons. Topped with a little bit of the cheese instead. Squeezed out the spinach VERY well. Very easy to throw together. The texture was good, but I am sorry to say that this was a flop. I thought it was edible (only edible) and my DH absolutely hated it.  I won't give a star rating because of the ingredients I skipped, but I also won't make this again."

In [8]:
# Treat a rating of 0 as "no rating given" by turning it into a missing value,
# so it is ignored when ratings are averaged.
merged = merged.assign(rating=merged['rating'].replace(0, np.nan))
merged.head()

Unnamed: 0,name,id,minutes,contributor_id,...,recipe_id,date,rating,review
0,arriba baked winter squash mexican style,137739,55,47892,...,137739,2006-02-18,5.0,I used an acorn squash and recipe#137681 Swee...
1,arriba baked winter squash mexican style,137739,55,47892,...,137739,2010-08-21,5.0,This was a nice change. I used butternut squas...
2,arriba baked winter squash mexican style,137739,55,47892,...,137739,2011-12-05,5.0,Excellent recipe! I used butternut squash and ...
3,a bit different breakfast pizza,31490,30,26278,...,31490,2002-08-19,,"Have not tried this, but it sounds delicious. ..."
4,a bit different breakfast pizza,31490,30,26278,...,31490,2006-08-27,5.0,This recipe was wonderful. Instead of using t...


In [9]:
# Mean rating per recipe (missing ratings are skipped by `mean`), sorted ascending,
# as a one-column frame named `avg_rating` indexed by `recipe_id`.
avg_ratings = (
    merged
    .groupby('recipe_id')['rating']
    .mean()
    .sort_values()
    .to_frame(name='avg_rating')
)
avg_ratings.head()

Unnamed: 0_level_0,avg_rating
recipe_id,Unnamed: 1_level_1
433731,1.0
33235,1.0
145547,1.0
180823,1.0
132995,1.0


In [10]:
# Attach each recipe's average rating to every one of its rows.
# Any `avg_rating` column left over from a previous run of this cell is dropped first;
# otherwise re-running would produce `avg_rating_x` / `avg_rating_y` instead of `avg_rating`.
# The merge uses the default inner join, so rows whose `recipe_id` has no entry in
# `avg_ratings` are not kept.
merged = (
    merged
    .drop(columns='avg_rating', errors='ignore')
    .merge(avg_ratings, on='recipe_id')
)
merged.head()

Unnamed: 0,name,id,minutes,contributor_id,...,date,rating,review,avg_rating
0,arriba baked winter squash mexican style,137739,55,47892,...,2006-02-18,5.0,I used an acorn squash and recipe#137681 Swee...,5.0
1,arriba baked winter squash mexican style,137739,55,47892,...,2010-08-21,5.0,This was a nice change. I used butternut squas...,5.0
2,arriba baked winter squash mexican style,137739,55,47892,...,2011-12-05,5.0,Excellent recipe! I used butternut squash and ...,5.0
3,a bit different breakfast pizza,31490,30,26278,...,2002-08-19,,"Have not tried this, but it sounds delicious. ...",4.67
4,a bit different breakfast pizza,31490,30,26278,...,2006-08-27,5.0,This recipe was wonderful. Instead of using t...,4.67


## Step 2: Data Cleaning and Exploratory Data Analysis

In [11]:
# Inspect one raw `steps` value: it is stored as a single string that looks like a Python list of strings.
merged['steps'].iloc[0]

"['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt']"

In [12]:
import re
# convert a string resembling a list of floats into an actual list of floats (for nutrition column)
def string_to_float_list(s):
    """Extract every number written in ``s`` and return them as a list of floats.

    Parameters
    ----------
    s : str
        Text such as ``'[138.4, 10.0, 50.0]'``.

    Returns
    -------
    list of float
        The numbers in the order they appear; an empty list if there are none.

    Notes
    -----
    Matches integers, any number of decimal digits, an optional sign and an
    optional exponent, so values like ``51.55`` or ``0`` are not truncated or
    skipped.
    """
    # Raw string so the backslashes reach the regex engine unchanged.
    number_pattern = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
    return [float(token) for token in re.findall(number_pattern, s)]

# convert a string resembling a list of strings into a list of strings (for tags and steps columns)
def string_to_string_list(s):
    """Parse the text of a Python list literal of strings into a real list.

    Parameters
    ----------
    s : str
        Text such as ``"['preheat oven', 'bake']"``.

    Returns
    -------
    list
        The items of the literal, in order.

    Raises
    ------
    ValueError, SyntaxError
        If ``s`` is not a valid Python literal.

    Notes
    -----
    Parsing the literal (rather than searching for single-quoted runs) keeps
    items that contain an apostrophe -- which are written with double quotes --
    as well as empty strings intact.
    """
    # Imported here so the function is self-contained.
    import ast

    return list(ast.literal_eval(s))


In [13]:
# Turn each `nutrition` string into a list of floats.
# This cell expects the column to still hold strings, so run it once per fresh `merged`.
merged = merged.assign(nutrition=merged['nutrition'].apply(string_to_float_list))

In [14]:
# Turn the `steps` and `tags` strings into lists of strings.
merged = merged.assign(
    steps=merged['steps'].apply(string_to_string_list),
    tags=merged['tags'].apply(string_to_string_list),
)

In [15]:
# Show the rows whose `minutes` equals 2147483647 (2**31 - 1, the largest signed 32-bit integer).
merged.loc[merged['minutes'].eq(2147483647)]

Unnamed: 0,name,id,minutes,contributor_id,...,date,rating,review,avg_rating
713702,no bake granola balls,261647,2147483647,464080,...,2008-03-18,5.0,"A great treat for begetarians, as my son & his...",5.0
713703,no bake granola balls,261647,2147483647,464080,...,2008-05-23,5.0,It is so nice to have a homemade snack recipe ...,5.0
713704,no bake granola balls,261647,2147483647,464080,...,2008-08-18,5.0,Made for Aus/NZ - Make My Recipe Edition 5. T...,5.0
...,...,...,...,...,...,...,...,...,...
713718,no bake granola balls,261647,2147483647,464080,...,2014-05-20,5.0,"Really good, and healthy too!",5.0
713719,no bake granola balls,261647,2147483647,464080,...,2017-03-06,5.0,Guys don't try this recipe! When I was making ...,5.0
713720,no bake granola balls,261647,2147483647,464080,...,2017-06-28,5.0,AMAZING!!! like I love it so much imam have it...,5.0


In [16]:
# remove outlier recipe with an extremely high minutes value
# (2147483647 == 2**31 - 1; presumably a placeholder rather than a real cooking time).
# Rows are selected by condition instead of hard-coded row labels, so the cell keeps
# working if the row order changes and is a no-op when re-run.
merged = merged.drop(index=merged.index[merged['minutes'] == 2147483647])
# Sanity check: this should now display an empty frame.
merged[merged['minutes'] == 2147483647]

Unnamed: 0,name,id,minutes,contributor_id,...,date,rating,review,avg_rating


In [17]:
# Show the rows whose `minutes` equals 1051200 (730 days' worth of minutes).
merged.loc[merged['minutes'].eq(1051200)]

Unnamed: 0,name,id,minutes,contributor_id,...,date,rating,review,avg_rating
538592,how to preserve a husband,447963,1051200,576273,...,2011-03-10,5.0,I'd thought that I would like to keep mine in ...,5.0
538593,how to preserve a husband,447963,1051200,576273,...,2011-04-08,5.0,"No matter if you've got the basic, no-frills m...",5.0


In [18]:
# Remove the outlier recipe listed with 1051200 minutes (730 days).
# Rows are selected by condition instead of hard-coded row labels, so the cell keeps
# working if the row order changes and is a no-op when re-run.
merged = merged.drop(index=merged.index[merged['minutes'] == 1051200])
# Sanity check: this should now display an empty frame.
merged[merged['minutes'] == 1051200]

Unnamed: 0,name,id,minutes,contributor_id,...,date,rating,review,avg_rating


In [19]:
# Add a `calories` column holding the first entry of each `nutrition` list
# (presumably the calorie count -- confirm against the dataset description).
merged = merged.assign(calories=merged['nutrition'].str.get(0))

In [20]:
# Per-recipe mean of avg_rating and calories, ordered by calories.
# NOTE: this result is computed but never displayed or stored, because a cell only
# shows its last expression.
(
    merged
    .groupby('recipe_id')[['avg_rating', 'calories']]
    .mean()
    .sort_values(by='calories')
)
# Review text at row label 1013367 (hard-coded label; it must exist in the index).
merged.loc[1013367, 'review']

"That's ridiculous, this doesn't qualify as a recipe! It's cryptic, & assumes the reader can fill in the blanks. If you can't take the heat..... step away from the still!!!"

In [21]:
# Histogram of the per-recipe average rating (one value per recipe_id), using nbins=20.
# The figure is only built here; call fig.show() to render it.
fig = px.histogram(
    data_frame=merged.groupby('recipe_id')['avg_rating'].mean(),
    x='avg_rating',
    nbins=20,
)

In [22]:
# Scatter of per-recipe average rating against calories, with the x-axis limited to 0-50000.
# The figure is only built here; call fig1.show() to render it.
fig1 = px.scatter(
    data_frame=merged.groupby('recipe_id')[['avg_rating', 'calories']].mean(),
    x='calories',
    y='avg_rating',
    range_x=[0, 50000],
    title='Average Rating vs. Calories',
)

In [23]:
# Scatter of per-recipe average rating against preparation minutes.
# NOTE(review): this rebinds `fig1`, replacing any figure previously stored under that name.
# The figure is only built here; call fig1.show() to render it.
fig1 = px.scatter(
    data_frame=merged.groupby('recipe_id')[['avg_rating', 'minutes']].mean(),
    x='minutes',
    y='avg_rating',
    title='Average Rating vs. Minutes',
)

## Step 3: Framing a Prediction Problem

In [24]:
# TODO

In [25]:
# TODO

## Step 5: Final Model

In [26]:
# TODO