In [94]:
import pandas as pd
import json 
import os
import codecs
import numpy as np

In [95]:
def split_data_frame_list(df, target_column):
    """
    Splits a column with lists into rows
    
    Keyword arguments:
        df -- dataframe
        target_column -- name of column that contains lists        
    """
    
    # create a new dataframe with each item in a seperate column, dropping rows with missing values
    col_df = pd.DataFrame(df[target_column].dropna().tolist(),index=df[target_column].dropna().index)

    # create a series with columns stacked as rows         
    stacked = col_df.stack()

    # rename last column to 'idx'
    index = stacked.index.rename(names="ingr_id", level=-1)
    new_df = pd.DataFrame(stacked, index=index, columns=[target_column])
    return new_df

def parseIngredients(ingredientStr):
    
    ingredientStr = ingredientStr.replace('Prise(n)','g')
    ingredientStr = ingredientStr.replace('1.00 Pck.','7.00 g')
    ingredientStr = ingredientStr.replace('1.00 Pkt.','7.00 g')
    ingredientStr = ingredientStr.replace('2.00 Pck.','14.00 g')
    ingredientStr = ingredientStr.replace('1.00 EL','15.00 g')
    ingredientStr = ingredientStr.replace('2.00 EL','30.00 g')
    ingredientStr = ingredientStr.replace('Spritzer',' g')
    
    ingredientStr = ingredientStr.replace('Puderzucker','Zucker')    
    ingredientStr = ingredientStr.replace('Ei(er)','Eier')

    return ingredientStr

In [96]:
recipe_values = []

for file in os.listdir("../data/html"):
    if file.endswith(".html"):
        recipe_id = file.split('.html')[0]
        html_doc = os.path.join("../data/html", file)
        html_string = codecs.open(html_doc,'r', encoding="utf8").read()
        
        recipe_json = html_string.split('<script type="application/ld+json">')[2].split('</script>')[0]
        recipe_dict = json.loads(recipe_json)
        
        try:
            rating = float(recipe_dict['aggregateRating']['ratingValue'])
        except:
            rating = None
        
        try:
            ingredients = recipe_dict['recipeIngredient']
            ingredients = [i.replace(',', '.') for i in ingredients]
        except Exception as r:
            print(r)
            ingredients = []
        
        values = (recipe_id, rating, ingredients)
        recipe_values.append(values)
        
df = pd.DataFrame(recipe_values)
df.columns = ['recipe_id','rating','ingredients']
df = df.set_index('recipe_id')

In [100]:
# convert ingredients column to rows
df_ingredients = split_data_frame_list(df, target_column="ingredients")
df_ingredients = df_ingredients.reset_index()
df = df.drop(['ingredients'], axis=1)

KeyError: 'ingredients'

In [101]:
# merge dataframes
df = df.reset_index()
df_merge = df.merge(df_ingredients, how='outer', on='recipe_id')

# drop recipes without rating
df_merge = df_merge.dropna(subset = ['rating'])

df_merge['ingredients'] = df_merge['ingredients'].apply(parseIngredients)

df_merge.head()

Unnamed: 0,index,recipe_id,rating,ingr_id,ingredients
0,0,1001121205401243,3.15,0,0.25 Liter Milch
1,0,1001121205401243,3.15,1,2.00 Eier
2,0,1001121205401243,3.15,2,40.00 g Zucker
3,0,1001121205401243,3.15,3,1.00 g Salz
4,0,1001121205401243,3.15,4,0.50 TL Zimt . ca.


In [102]:
df_ingredient_count = df_merge.groupby(['ingredients']).count().reset_index()
df_ingredient_count = df_ingredient_count.sort_values('index', ascending=False)
df_ingredient_count.head(n=100)

Unnamed: 0,ingredients,index,recipe_id,rating,ingr_id
783,1.00 g Salz,205,205,205,205
2375,7.00 g Vanillezucker,140,140,140,140
1605,250.00 g Mehl,126,126,126,126
1951,4.00 Eier,107,107,107,107
1683,3.00 Eier,102,102,102,102
657,1.00 TL Backpulver,100,100,100,100
1306,2.00 Eier,90,90,90,90
958,100.00 g Zucker,57,57,57,57
446,0.50 TL Backpulver,54,54,54,54
2181,50.00 g Zucker,54,54,54,54
