In [1]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')
import numpy as np
import re

In [2]:
# pull data
df_recipes = pd.read_json ('full_format_recipes.json')

In [3]:
# see all records in datafrae
# df_recipes.head()

In [4]:
# drop nan records
df = df_recipes.dropna()
# df.shape[0]

In [5]:
# let's change the categories header to tags
df.rename(columns = {'categories': 'tags'}, inplace=True)

In [6]:
# let's add a column for count of ingredients
df['ingredients_count'] = df['ingredients'].apply(lambda x: len(x))

In [7]:
# let's add a column for count of tags
df['tags_count'] = df['tags'].apply(lambda x: len(x))

In [8]:
#take a peak
# df.head()

In [9]:
# describe data
# df.describe()

In [10]:
# some calorie values are out of control! no way it's real
# so let's find the outliers and then remove them by using the IQR range

q1  = df['calories'].quantile(.25)
q3  = df['calories'].quantile(.75)
iqr = q3 - q1

df = df.loc[(df['calories'] > q1-(iqr*3)) & (df['calories'] < q3+(iqr*3))]
# df.shape

In [11]:
# remove anything with zero calories
df.drop(df[df['calories']==0].index, inplace=True)
# df.shape[0]

In [12]:
# function to get normalized rating/define and bins
def get_normalized_rating(rating):
    if rating <1:
        return 0
    elif rating <2:
        return 1
    elif rating <3:
        return 2
    elif rating <4:
        return 3
    else:
        return 4

In [13]:
# create a normalized rating column
df['rating_normalized'] = df['rating'].apply(get_normalized_rating)

In [14]:
# epifactor calculation 
def get_epifactor(min_val, max_val, mean_val):
    return ((mean_val - min_val) / (max_val - min_val)) * 100

In [15]:
# create calorie epifactor dataframe
df_calorie_epifactor = df.groupby('rating_normalized')['calories'].agg(['min', 'max', 'mean'])
df_calorie_epifactor['epifactor'] = np.vectorize(get_epifactor)(df_calorie_epifactor['min'], df_calorie_epifactor['max'], df_calorie_epifactor['mean'])
df_calorie_epifactor

Unnamed: 0_level_0,min,max,mean,epifactor
rating_normalized,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.0,1822.0,310.892857,17.01773
1,19.0,1591.0,353.134921,21.255402
2,14.0,1496.0,384.475472,24.998345
3,1.0,1821.0,429.288802,23.532352
4,2.0,1829.0,460.248154,25.082001


In [16]:
# create fat epifactor dataframe

# np.vectorize (allows you to execute your function with numpy?)

df_fat_epifactor = df.groupby('rating_normalized')['fat'].agg(['min', 'max', 'mean'])
df_fat_epifactor['epifactor'] = np.vectorize(get_epifactor)(df_fat_epifactor['min'], df_fat_epifactor['max'], df_fat_epifactor['mean'])
# df_fat_epifactor

In [17]:
#create protein epifactor dataframe
df_protein_epifactor = df.groupby('rating_normalized')['protein'].agg(['min', 'max', 'mean'])
df_protein_epifactor['epifactor'] = np.vectorize(get_epifactor)(df_protein_epifactor['min'], df_protein_epifactor['max'], df_protein_epifactor['mean'])
# df_protein_epifactor

In [18]:
#create ingredients epifactor dataframe
df_ingredients_epifactor = df.groupby('rating_normalized')['ingredients_count'].agg(['min', 'max', 'mean'])
df_ingredients_epifactor['epifactor'] = np.vectorize(get_epifactor)(df_ingredients_epifactor['min'], df_ingredients_epifactor['max'], df_ingredients_epifactor['mean'])
# df_ingredients_epifactor

In [19]:
# create lists of all epifactors
calorie = df_calorie_epifactor['epifactor'].round(2)
fat = df_fat_epifactor['epifactor'].round(2)
protein = df_fat_epifactor['epifactor'].round(2)
ingredients = df_ingredients_epifactor['epifactor'].round(2)

# add lists into one finale epifactor dataframe
df_epifactor = pd.DataFrame(list(zip(calorie, fat, protein, ingredients)), 
               columns =['Calories', 'Fat', 'Protein', 'Ingredients']) 

# df_epifactor.rename(columns={'index':'names'}, inplace=True)
df_epifactor

Unnamed: 0,Calories,Fat,Protein,Ingredients
0,17.02,10.67,10.67,22.84
1,21.26,21.68,21.68,35.71
2,25.0,17.6,17.6,28.36
3,23.53,15.88,15.88,30.36
4,25.08,15.85,15.85,19.63


In [20]:
radar_dict_test = [{'name': 'Calories', 'value': df_epifactor['Calories'][0]},
                   {'name': 'Fat', 'value': df_epifactor['Fat'][0]},
                   {'name': 'Protein', 'value': df_epifactor['Protein'][0]},
                   {'name': 'Ingredients', 'value': df_epifactor['Ingredients'][0]},
                   {'name': 'Nothing', 'value': 0.2}]  

# i guess "nothing" is in here because the math only works with five values?

radar_dict_test

[{'name': 'Calories', 'value': 17.02},
 {'name': 'Fat', 'value': 10.67},
 {'name': 'Protein', 'value': 10.67},
 {'name': 'Ingredients', 'value': 22.84},
 {'name': 'Nothing', 'value': 0.2}]

In [34]:
# create string for word cloud
word_list = []
df[df.tags.notnull()].tags.map(lambda x: word_list.extend(x))
len(word_list)

127883

In [32]:
#save to a dataframe
word_list = pd.DataFrame(word_list)

# export data to csv
word_list.to_csv('word_list.csv', index=False)
word_list.head()

Unnamed: 0,0
0,Food Processor
1,Onion
2,Pork
3,Bake
4,Bastille Day


In [None]:
def parse_trend_title_tag(title, tags, trend):
    title = title.lower()
    tags = [x.lower() for x in tags]
    
    if re.search(trend, title):
        return 1;
    else:
        if any(trend in t for t in tags):
            return 1;
        else:
            return 0;


In [None]:
# test the function
# parse_trend_tag("spinach Pie", ['one', 'pie', 'Vegetarian'], 'Vegan/Vegetarian')

# tags = ['One two', 'TWO']
# tags = [x.lower() for x in tags]
# print(tags)

# parse_trend_title_tag('title', tags, 'three')

In [None]:
df['vegan'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'vegan'), axis=1)
df['vegetarian'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'vegetarian'), axis=1)
df['organic'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'organic'), axis=1)
df['healthy'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'healthy'), axis=1)
df['quick'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'quick'), axis=1)
df['low_carb'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'low carb'), axis=1)
df['low_fat'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'low fat'), axis=1)
df['pescatarian'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'pescatarian'), axis=1)
df['fat_free'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'fat free'), axis=1)
df['high_fiber'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'high fiber'), axis=1)
df['kid_friendly'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'kid-friendly'), axis=1)
df['kosher'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'kosher'), axis=1)
df['low_cal'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'low cal'), axis=1)
df['no_cook'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'no-cook'), axis=1)
df['paleo'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'paleo'), axis=1)
df['picnic'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'picnic'), axis=1)
df['easy'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'easy'), axis=1)
df['slow_cooker'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'slow cooker'), axis=1)
df['gluten_free'] = df.apply(lambda row : parse_trend_title_tag(row['title'], row['tags'], 'gluten-free'), axis=1)

In [None]:
# export data to csv
df.to_csv('epicuriousity.csv', index=False)

In [None]:
# let's see how those tags value up
df.aggregate({'vegan':['sum'], 'vegetarian': ['sum'], 'organic':['sum'], 'healthy':['sum'], 'quick': ['sum'],
             'low_carb': ['sum'], 'low_fat': ['sum'], 'pescatarian': ['sum'], 'fat_free': ['sum'], 
             'high_fiber': ['sum'], 'kid_friendly': ['sum'], 'kosher': ['sum'], 'low_cal': ['sum'],
             'no_cook': ['sum'], 'paleo': ['sum'], 'picnic': ['sum'], 'easy': ['sum'], 'slow_cooker': ['sum'],
             'gluten_free': ['sum']})