In [10]:
import json

import numpy as np
import pandas as pd
import pymongo
import scipy
from bson import ObjectId, json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient as Connection
from scipy import sparse, spatial
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

# scientific notation disabled form smaller numbers
pd.options.display.float_format = '{:.5f}'.format

# alles resultate anzeigen und nicht nur das letzte
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [11]:
# display multiple outputs in one row
import pandas as pd
import numpy as np
from IPython.display import display, HTML

CSS = """
.output {
    flex-direction: row;
}
"""

HTML('<style>{}</style>'.format(CSS))

In [12]:
connection = Connection()
db = connection.recipe_db
input_data = db.recipes_test_copy

data = json.loads(json_util.dumps(input_data.find()))

In [13]:
ing = pd.DataFrame(json_normalize(data, record_path='ingredients',
                             meta='id', record_prefix='ingredients_', errors='ignore'))


nutritions = pd.DataFrame(json_normalize(data, record_path='nutritions',
                            meta=['id', 'prep_time', 'rating', 'rating_count', 'ready_in_time', 'review_count']))


In [14]:
#  ------  erstellung und data cleansing - Ingredients

# schmeiss alle zutaten raus, die weniger als 5 mal verwendet werden
# setze multiinde auf 'id' und 'ingredients_id'
ingredients = ing.set_index(['id', 'ingredients_id'])

# filtere alle Zutaten samit ihrer rezepte id, die weniger gleich 5 mal vorkommen
ingredients_eqles_5_ing = ingredients.groupby(
    'ingredients_id').filter(lambda x: len(x) <= 5)

# droppe alle rezepte, die eine Zutate besitzen, die weniger gleich 5 Mal vorkommen
ingredients_filt = ingredients.drop(ingredients_eqles_5_ing.index.get_level_values('id').values, level=0)

# drop alls rows with ingredients_id == 0
ingredients_eqal_zero = ingredients_filt[ingredients_filt.index.get_level_values('ingredients_id') == 0]
ingredients_filt = ingredients_filt.drop(ingredients_eqal_zero.index.get_level_values('id').values, level=0)



In [15]:
ingredients_filt[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,ingredients_name,ingredients_grams,ingredients_type
id,ingredients_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
59661,16157,10 g butter,11.36,Normal
59661,4405,40 g sliced green onions,41.8,Normal
59661,4342,"1-1/2 cloves garlic, minced",4.8,Normal
59661,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.2,Normal
59661,16243,180 g ricotta cheese,182.40001,Normal
59661,16261,90 g sour cream,92.0,Normal
59661,16231,180 g shredded Monterey Jack cheese,180.8,Normal
59661,2351,8 (6 inch) corn tortillas,208.0,Normal
59661,7431,3/4 (19 ounce) can enchilada sauce,425.6,Normal
16330,3103,305 g ground beef,302.6667,Normal


In [16]:
#  ------ Erstellung und cleansing des Nutrition Dataframes

# erstelle neue liste auf basis der bereits gefilterten rezepte aus ingredients_filt
id_overlap_mask = nutritions['id'].isin(ingredients_filt.index.get_level_values('id').values)

# erstelle datenframe auf basis der overlapliste
nutritions_filt = nutritions.loc[id_overlap_mask]

nutrition_db = nutritions_filt.pivot_table(
    index=['id'],
    columns=['name'],
    values=['amount'],
).reset_index()

nutrition_db.set_index('id', inplace=True)

# remove multiindex 'amount'
nutrition_db.columns = nutrition_db.columns.droplevel(0)

# entferne alle NA
nutrition_db = nutrition_db.dropna()

# gleiche nochmals die ids der beiden dataframe nutrition und ingredients ab, da der nutrition dataframe noch NA Werte hatt
id_overlap_mask = ingredients_filt.index.get_level_values('id').isin(nutrition_db.index)
ingredients_db = ingredients_filt[id_overlap_mask]

# abgleich ob anzahl der indizes von nutrition und zutaten dataframe gleich sind
nutrition_db.index.nunique()
ingredients_db.index.get_level_values('id').nunique()

4062

4062

In [17]:
id_list = [215014, 8669, 16700, 16354, 12720, 8652, 8887, 51283, 45954, 213742, 14595,
           219164, 16348, 143082, 8665, 11758, 223042, 236609, 8778, 65896, 24264, 11679, 141678, 9023]

ingredients_db.reset_index(inplace=True)

recipe_db = pd.get_dummies(ingredients_db['ingredients_id']).groupby(
    ingredients_db['id']).apply(max)

In [28]:
ingredients_db

Unnamed: 0,id,ingredients_id,ingredients_name,ingredients_grams,ingredients_type
0,59661,16157,10 g butter,11.36000,Normal
1,59661,4405,40 g sliced green onions,41.80000,Normal
2,59661,4342,"1-1/2 cloves garlic, minced",4.80000,Normal
3,59661,4520,3/4 (10 ounce) package frozen chopped spinach ...,227.20000,Normal
4,59661,16243,180 g ricotta cheese,182.40001,Normal
...,...,...,...,...,...
36691,229659,16423,"1-3/4 sprigs fresh thyme, divided",0.66667,Normal
36692,229659,20270,"3/8 white onion, chopped - divided",36.66667,Normal
36693,229659,16157,"40 g butter, sliced into pats",37.83334,Normal
36694,229659,4292,80 g chopped fresh celery leaves,80.00000,Normal


In [19]:
test = ingredients_db.copy()

In [20]:
# Funktion um das Gewicht der einzelnen Zutaten von vier portionen auf 1 portionen zu erhöhen
def portion_to_1(x):
    a = x / 4
#     a = a * 2
#     x = x + a
    return a


#rechne 
test['ingredients_grams'] = test['ingredients_grams'].apply(
    portion_to_1)


In [21]:
recipe_summed_weight = test.groupby(['id']).sum()
# asd2
recipe_summed_weight = recipe_summed_weight.drop(columns='ingredients_id')

# recipe_summed_weight
# recipe_summed_weight

recipe_summed_weight = recipe_summed_weight.sort_values(by='ingredients_grams')

recipe_summed_weight

Unnamed: 0_level_0,ingredients_grams
id,Unnamed: 1_level_1
20881,5.30556
79255,18.32336
106030,19.78482
34531,22.82192
12063,26.77722
...,...
55946,1177.84896
229874,1182.75000
51849,1195.10222
232865,1444.82829


In [22]:
nutrition_db = nutrition_db.join(recipe_summed_weight)

In [23]:
# rechne nährwerte auf 1000g pro rezept hoch
def nutrition_to_1000(x):
    a = x / x.ingredients_grams
    b = a * 1000
    return b
#     print(x.ingredients_grams)


In [24]:
nutrition_db_1000 = nutrition_db.apply(nutrition_to_1000, axis = 1)


In [25]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print df

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(df)? (<ipython-input-25-8d856d6756d2>, line 3)

In [None]:
df2 = pd.DataFrame(data=None, columns=nutrition_db_1000.columns, index=None)
df2 = df2.drop(columns='ingredients_grams')
df2.T

In [None]:

df2.loc['index'] = [
    # "Calcium":[1],
    1,
    # "Calories":[1],
    800,
    # "Calories from Fat":[1],
    1,
    # "Carbohydrates":[1],
    1,
    # "Cholesterol":[1],
    1,
    # "Dietary Fiber	":[1],
    1,
    # "Fat":[1],
    1,
    # "Folate":[1],
    1,
    # "Iron":[1],
    1,
    # "Magnesium":[1],
    1,
    # "Niacin Equivalents":[1],
    1,
    # "Potassium":[1],
    1,
    # "Protein":[1],
    1,
    # "Saturated Fat":[1],
    1,
    # "Sodium":[1],
    1,
    # "Sugars":[1],
    1,
    # "Thiamin":[1],
    1,
    # "Vitamin A - IU":[1],
    1,
    # "Vitamin B6":[1],
    1,
    # "Vitamin C":[1]}
    2]
# df2 = df2.iloc[1:]
df2
