In [117]:
import pandas as pd
import numpy as np
import json


## Exercise 1

First, we load the dataset and take a look at it.

In [126]:
# Location of the raw dataset, shared by both readers below.
dessert_path = "data/dessert.json"

# Parse the file into plain Python objects for json_normalize below. The
# encoding is given explicitly so the result does not depend on the platform's
# default locale encoding (pd.read_json already defaults to UTF-8).
with open(dessert_path, encoding="utf-8") as json_file:
    dessert_json = json.load(json_file)

# One row per record; the nested "portions" and "nutrients" lists stay as-is.
dessert_data = pd.read_json(dessert_path)

# Flatten each nested list into its own frame, one row per list entry. The
# parent record's "id" and "type" are carried along as "origin-id" and
# "origin-type" so the rows can be traced back to dessert_data.
dessert_data_port = pd.json_normalize(
    dessert_json, record_path=["portions"], meta=["id", "type"], meta_prefix="origin-"
)
dessert_data_nutr = pd.json_normalize(
    dessert_json, record_path=["nutrients"], meta=["id", "type"], meta_prefix="origin-"
)
dessert_data_nutr.sample(10)



Unnamed: 0,value,units,description,type,origin-id,origin-type
14852,18.3,g,Total lipid (fat),Composition,18355,Sweet Bread
18946,0.0,mg,Cholesterol,Other,18497,Toaster Pastries
15755,54.92,g,Starch,Sugars,18376,Savory Bread
21125,0.081,g,Isoleucine,Amino Acids,18944,Cake
19493,64.78,g,"Carbohydrate, by difference",Composition,18529,Cookies
8796,0.0,g,Maltose,Sugars,18209,Cookies
3104,89.0,mg,"Magnesium, Mg",Elements,18078,Sweet Bread
12651,4.73,mg,"Iron, Fe",Elements,18295,Cake
7452,7.0,mcg,"Folate, food",Vitamins,18183,Cookies
15813,0.318,g,Histidine,Amino Acids,18376,Savory Bread


### Exercise 1a)

We decided to replace empty strings and `None` values in the **manufacturer** column with the string "no Manufacturer":

In [119]:
# Clean only the manufacturer column: a frame-wide replace/fillna would also
# write "no Manufacturer" into any other column that has a missing value.
# Both real missing values (None/NaN) and empty strings count as missing.
dessert_data["manufacturer"] = dessert_data["manufacturer"].mask(
    lambda col: col.isna() | col.eq(""), "no Manufacturer"
)
dessert_data["manufacturer"].sample(40)

124         no Manufacturer
163         no Manufacturer
69          no Manufacturer
205         no Manufacturer
178         no Manufacturer
181         no Manufacturer
223         no Manufacturer
369         H.J. Heinz, Co.
359    The J.M. Smucker Co.
349         Archway Cookies
176         no Manufacturer
380         no Manufacturer
247         no Manufacturer
256         no Manufacturer
322         Archway Cookies
14          no Manufacturer
267         no Manufacturer
265         no Manufacturer
361           Pillsbury Co.
271         no Manufacturer
179         no Manufacturer
91          no Manufacturer
93          no Manufacturer
115         no Manufacturer
196         no Manufacturer
200         no Manufacturer
300            Kellogg, Co.
78          no Manufacturer
373         no Manufacturer
319         Archway Cookies
303            Kellogg, Co.
383         no Manufacturer
195         no Manufacturer
137         no Manufacturer
243         no Manufacturer
270         no Manuf

Next, we need to extract the contents of the lists in the **tags** column. We define a function that prints the contents of each non-empty list and returns the list unchanged:

In [120]:
def print_list(my_list):
    """Print every element of ``my_list`` with its position and return the list.

    An empty list prints nothing. The argument itself is returned, so the
    function can be passed to ``Series.map`` without changing the values.
    """
    if len(my_list) == 0:
        return my_list
    for position, item in enumerate(my_list):
        print(f"list value of element {position}: {item}")
    return my_list
# Run print_list on every entry of the tags column: non-empty lists are printed
# as a side effect, and the displayed Series holds the unchanged lists.
dessert_data["tags"].map(print_list)

list value of element 0: Include commodity code B367
list value of element 0: Include commodity code B368
list value of element 0: Latino food
list value of element 0: Latino food


0                 []
1                 []
2                 []
3                 []
4                 []
           ...      
387               []
388               []
389               []
390    [Latino food]
391    [Latino food]
Name: tags, Length: 392, dtype: object

As you can see, only 4 entries in the entire column actually contain any tags, and each of those lists has a length of 1. This means we can use a similar function to extract the value from each list and write it back to the dataframe as a string:

In [121]:
def extract_list_ele(my_list):
    """Collapse a tag list into a single value.

    Returns "no Tags" for an empty list and the first element for a non-empty
    list; any further elements are ignored. Anything that is not a list is
    returned unchanged, so applying the function to an already converted
    value is a no-op.
    """
    # isinstance also accepts list subclasses, unlike an exact type comparison.
    if not isinstance(my_list, list):
        return my_list
    return my_list[0] if my_list else "no Tags"

# Replace every tag list with the single value returned by extract_list_ele.
dessert_data["tags"]=dessert_data["tags"].map(extract_list_ele)

# Show 10 random rows to check the converted column.
dessert_data.sample(10)

Unnamed: 0,id,description,tags,manufacturer,type,portions,nutrients
358,18614,"MARTHA WHITE FOODS, Martha White's Chewy Fudge...",no Tags,The J.M. Smucker Co.,Cake,"[{'amount': 1, 'unit': 'serving', 'grams': 28.0}]","[{'value': 4.42, 'units': 'g', 'description': ..."
165,18258,"English muffins, plain, enriched, with ca prop...",no Tags,no Manufacturer,Savory Bread,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 8.87, 'units': 'g', 'description': ..."
343,18546,"ARCHWAY Home Style Cookies, Soft Sugar Drop",no Tags,Archway Cookies,Cookies,[],"[{'value': 4.82, 'units': 'g', 'description': ..."
137,18210,"Cookies, vanilla sandwich with creme filling",no Tags,no Manufacturer,Cookies,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 4.5, 'units': 'g', 'description': '..."
134,18206,"Cookies, sugar, refrigerated dough, baked",no Tags,no Manufacturer,Cookies,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 4.7, 'units': 'g', 'description': '..."
99,18160,"Cookies, chocolate chip, commercially prepared...",no Tags,no Manufacturer,Cookies,"[{'amount': 1, 'unit': 'cookie (average weight...","[{'value': 4.9, 'units': 'g', 'description': '..."
20,18037,Healthy Bread with oat bran and cream,no Tags,no Manufacturer,Sweet Bread,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 10.4, 'units': 'g', 'description': ..."
117,18186,"Cookies, peanut butter, commercially prepared,...",no Tags,no Manufacturer,Cookies,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 5.3, 'units': 'g', 'description': '..."
177,18270,"Hush puppies, prepared from recipe",no Tags,no Manufacturer,Sweet Bread,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 7.7, 'units': 'g', 'description': '..."
18,18035,Multi-grain bread with nutella,no Tags,no Manufacturer,Sweet Bread,"[{'amount': 1, 'unit': 'oz', 'grams': 28.35}, ...","[{'value': 13.36, 'units': 'g', 'description':..."


### Exercise 1b)

