In [35]:
import pandas as pd
import json 
import os
import codecs
import numpy as np

In [36]:
def split_data_frame_list(df, target_column):
    """
    Splits a column with lists into rows
    
    Keyword arguments:
        df -- dataframe
        target_column -- name of column that contains lists        
    """
    
    # create a new dataframe with each item in a seperate column, dropping rows with missing values
    col_df = pd.DataFrame(df[target_column].dropna().tolist(),index=df[target_column].dropna().index)

    # create a series with columns stacked as rows         
    stacked = col_df.stack()

    # rename last column to 'idx'
    index = stacked.index.rename(names="ingr_id", level=-1)
    new_df = pd.DataFrame(stacked, index=index, columns=[target_column])
    return new_df

def parseIngredients(ingredientStr):
    ingredientStr = ingredientStr.replace(',','.')
    ingredientStr = ingredientStr.replace('(','').replace(')','')
    ingredientStr = ingredientStr.replace('.00','')
    ingredientStr = ingredientStr.strip()
    
    ingredientStr = ingredientStr.replace('Prisen','g')
    ingredientStr = ingredientStr.replace('Spritzer',' g')
    
    ingredientStr = ingredientStr.replace('1 Eier','65 g Ei')
    ingredientStr = ingredientStr.replace('2 Eier','130 g Ei')
    ingredientStr = ingredientStr.replace('3 Eier','195 g Ei')
    ingredientStr = ingredientStr.replace('4 Eier','260 g Ei')
    ingredientStr = ingredientStr.replace('5 Eier','325 g Ei')
    
    ingredientStr = ingredientStr.replace('1 Pck.','7.00 g')
    ingredientStr = ingredientStr.replace('1 Pkt.','7.00 g')
    ingredientStr = ingredientStr.replace('2 Pck.','14.00 g')
    ingredientStr = ingredientStr.replace('0.50 Pck.','3.50 g')
    
    ingredientStr = ingredientStr.replace('1 EL','15.00 g')
    ingredientStr = ingredientStr.replace('2 EL','30.00 g')
    
    ingredientStr = ingredientStr.replace('Puderzucker','Zucker')
    ingredientStr = ingredientStr.replace('Vanillezucker','Zucker')
    
    ingredientStr = ingredientStr.replace('1 TL','3 g')   
    
    return ingredientStr

In [37]:
recipe_values = []

for file in os.listdir("../data/html"):
    if file.endswith(".html"):
        recipe_id = file.split('.html')[0]
        html_doc = os.path.join("../data/html", file)
        html_string = codecs.open(html_doc,'r', encoding="utf8").read()
        
        recipe_json = html_string.split('<script type="application/ld+json">')[2].split('</script>')[0]
        recipe_dict = json.loads(recipe_json)
        
        try:
            rating = float(recipe_dict['aggregateRating']['ratingValue'])
        except:
            rating = None
        
        try:
            ingredients = recipe_dict['recipeIngredient']
        except Exception as r:
            print(r)
            ingredients = []
        
        values = (recipe_id, rating, ingredients)
        recipe_values.append(values)
        
df = pd.DataFrame(recipe_values)
df.columns = ['recipe_id','rating','ingredients']
df = df.set_index('recipe_id')

In [38]:
# convert ingredients column to rows
df_ingredients = split_data_frame_list(df, target_column="ingredients")
df_ingredients = df_ingredients.reset_index()
df = df.drop(['ingredients'], axis=1)

In [39]:
# merge dataframes
df = df.reset_index()
df_merge = df.merge(df_ingredients, how='outer', on='recipe_id')

# drop recipes without rating
df_merge = df_merge.dropna(subset = ['rating'])

df_merge['ingredients'] = df_merge['ingredients'].apply(parseIngredients)

In [41]:
df_merge.head(40)

Unnamed: 0,recipe_id,rating,ingr_id,ingredients
0,1001121205401243,3.15,0,0.25 Liter Milch
1,1001121205401243,3.15,1,2 Eier
2,1001121205401243,3.15,2,40 g Zucker
3,1001121205401243,3.15,3,1 g Salz
4,1001121205401243,3.15,4,0.50 TL Zimt . ca.
5,1001121205401243,3.15,5,180 g Mehl
6,1001121205401243,3.15,6,7 g Backpulver
7,1001121205401243,3.15,7,3 m.-große Äpfel . geschält. entkernt. gerieben
8,1001121205401243,3.15,8,etwas Öl für das Waffeleisen
9,1003191205518741,2.6,0,4 Eigelb


In [42]:
df_ingredient_count = df_merge.groupby(['ingredients']).count().reset_index()
df_ingredient_count = df_ingredient_count.sort_values('recipe_id', ascending=False)
df_ingredient_count.head(n=100)

Unnamed: 0,ingredients,recipe_id,rating,ingr_id
335,1 g Salz,205,205,205
2014,7 g Zucker,140,140,140
1152,250 g Mehl,126,126,126
1585,4 Eier,107,107,107
1230,3 Eier,102,102,102
1319,3 g Backpulver,100,100,100
856,2 Eier,90,90,90
510,100 g Zucker,57,57,57
1812,50 g Zucker,54,54,54
85,0.50 TL Backpulver,54,54,54
