## Exercise
This exercise analyzes the nutrient content of the items on the McDonald's menu. It may help identify menu items that are potentially harmful to health, and help customers make better food choices when eating at McDonald's.

In [2]:
import pandas as pd
import altair as alt
from umap import UMAP
import umap

## Dataset
The dataset has been obtained from https://www.kaggle.com/mcdonalds/nutrition-facts

In [3]:
# Read the menu nutrition CSV into the `data` frame used by every later cell.
input_file = "../../data/datasets_910_1662_menu.csv"
data = pd.read_csv(filepath_or_buffer=input_file)

# Report (rows, columns), then preview the first five records.
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.head(n=5)

(260, 24)


Unnamed: 0,Category,Item,Serving Size,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,...,Carbohydrates,Carbohydrates (% Daily Value),Dietary Fiber,Dietary Fiber (% Daily Value),Sugars,Protein,Vitamin A (% Daily Value),Vitamin C (% Daily Value),Calcium (% Daily Value),Iron (% Daily Value)
0,Breakfast,Egg McMuffin,4.8 oz (136 g),300,120,13.0,20,5.0,25,0.0,...,31,10,4,17,3,17,10,0,25,15
1,Breakfast,Egg White Delight,4.8 oz (135 g),250,70,8.0,12,3.0,15,0.0,...,30,10,4,17,3,18,6,0,25,8
2,Breakfast,Sausage McMuffin,3.9 oz (111 g),370,200,23.0,35,8.0,42,0.0,...,29,10,4,17,2,14,8,0,25,10
3,Breakfast,Sausage McMuffin with Egg,5.7 oz (161 g),450,250,28.0,43,10.0,52,0.0,...,30,10,4,17,2,21,15,0,30,15
4,Breakfast,Sausage McMuffin with Egg Whites,5.7 oz (161 g),400,210,23.0,35,8.0,42,0.0,...,30,10,4,17,2,21,6,0,25,10


In [10]:
# Show every column name so the available nutrient fields are visible at a glance.
list(data.columns)

['Category',
 'Item',
 'Serving Size',
 'Calories',
 'Calories from Fat',
 'Total Fat',
 'Total Fat (% Daily Value)',
 'Saturated Fat',
 'Saturated Fat (% Daily Value)',
 'Trans Fat',
 'Cholesterol',
 'Cholesterol (% Daily Value)',
 'Sodium',
 'Sodium (% Daily Value)',
 'Carbohydrates',
 'Carbohydrates (% Daily Value)',
 'Dietary Fiber',
 'Dietary Fiber (% Daily Value)',
 'Sugars',
 'Protein',
 'Vitamin A (% Daily Value)',
 'Vitamin C (% Daily Value)',
 'Calcium (% Daily Value)',
 'Iron (% Daily Value)']

## Calorie distribution

In [24]:
# Histogram of calories across the whole menu, using 50-unit-wide bins.
dist_col, bin_len = "Calories", 50
height, width = 360, 640

alt_bin = alt.Bin(step=bin_len)
x_alt = alt.X("%s:Q" % dist_col, bin=alt_bin)
chart = (
    alt.Chart(data)
    .mark_bar()
    .encode(x=x_alt, y="count()")
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

In [25]:
# Same calorie histogram, drawn as step-interpolated areas coloured by menu category.
dist_col, color_col = "Calories", "Category"
bin_len = 50
height, width = 360, 640

alt_bin = alt.Bin(step=bin_len)
x_alt = alt.X("%s:Q" % dist_col, bin=alt_bin)
color = alt.Color("%s:N" % color_col)
chart = (
    alt.Chart(data)
    .mark_area(opacity=0.7, interpolate="step")
    .encode(x=x_alt, y="count()", color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

## Daily Calorie intake note
The recommended daily calorie intake for a healthy average individual is 2000 calories.
https://health.gov/our-work/food-nutrition/2015-2020-dietary-guidelines/guidelines/appendix-2/


In [26]:
# Median calories per category, plus that median expressed as a percentage
# of a 2000-calorie daily intake; rows ordered from highest median to lowest.
category_df = (
    data.groupby("Category")["Calories"]
    .median()
    .reset_index(name="Median_Calories")
    .assign(Daily_Intake_Proportion=lambda df: df["Median_Calories"] * 100 / 2000)
    .sort_values("Median_Calories", ascending=False)
)

# Horizontal bar chart: one bar per category, bar length and colour both driven by the median.
height, width = 360, 640
x_col, x_label = "Median_Calories", "Calories (Median in each category)"
y_col = "Category"
color_col = x_col

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:N" % y_col, sort="-x")
color = alt.Color(color_col)
chart = (
    alt.Chart(category_df)
    .mark_bar()
    .encode(x=x_alt, y=y_alt, color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

In [27]:
# Re-order the per-category summary by its share of the daily calorie intake.
category_df = category_df.sort_values(by="Daily_Intake_Proportion", ascending=False)

# Horizontal bar chart of that share, one bar per category.
height, width = 360, 640
x_col, x_label = "Daily_Intake_Proportion", "Proportion of Recommended Daily Intake Calories (in %)"
y_col = "Category"
color_col = x_col

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:N" % y_col, sort="-x")
color = alt.Color(color_col)
chart = (
    alt.Chart(category_df)
    .mark_bar()
    .encode(x=x_alt, y=y_alt, color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

## Sugar distribution

In [28]:
# Histogram of sugar content across the whole menu, using 5-unit-wide bins.
dist_col, bin_len = "Sugars", 5
height, width = 360, 640

alt_bin = alt.Bin(step=bin_len)
x_alt = alt.X("%s:Q" % dist_col, bin=alt_bin)
chart = (
    alt.Chart(data)
    .mark_bar()
    .encode(x=x_alt, y="count()")
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

In [29]:
# Sugar histogram drawn as step-interpolated areas coloured by menu category.
dist_col, color_col = "Sugars", "Category"
bin_len = 5
height, width = 360, 640

alt_bin = alt.Bin(step=bin_len)
color = alt.Color("%s:N" % color_col)
x_alt = alt.X("%s:Q" % dist_col, bin=alt_bin, title="Qty of Sugar(in grams)")
chart = (
    alt.Chart(data)
    .mark_area(opacity=0.7, interpolate="step")
    .encode(x=x_alt, y="count()", color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

## Sugar recommendation note
The recommended maximum daily intake of sugars for a healthy average individual is 30 g.
https://www.nhs.uk/live-well/eat-well/how-does-sugar-in-our-diet-affect-our-health/


In [32]:
# Median sugar quantity per category, plus that median expressed as a percentage
# of a 30 g daily maximum; rows ordered from highest median to lowest.
category_df = (
    data.groupby("Category")["Sugars"]
    .median()
    .reset_index(name="Median_Sugar_Qty")
    .assign(Daily_Max_Intake_Prop=lambda df: df["Median_Sugar_Qty"] * 100 / 30)
    .sort_values("Median_Sugar_Qty", ascending=False)
)

# Horizontal bar chart: one bar per category, bar length and colour both driven by the median.
height, width = 360, 640
x_col, x_label = "Median_Sugar_Qty", "Qty of Sugar (in grams)"
y_col = "Category"
color_col = x_col

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:N" % y_col, sort="-x")
color = alt.Color(color_col)
chart = (
    alt.Chart(category_df)
    .mark_bar()
    .encode(x=x_alt, y=y_alt, color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

In [33]:
# Re-order the per-category summary by its share of the daily sugar maximum.
category_df = category_df.sort_values(by="Daily_Max_Intake_Prop", ascending=False)

# Horizontal bar chart of that share, one bar per category.
height, width = 360, 640
x_col, x_label = "Daily_Max_Intake_Prop", "Proportion of Max Daily Intake Sugar (in %)"
y_col = "Category"
color_col = x_col

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:N" % y_col, sort="-x")
color = alt.Color(color_col)
chart = (
    alt.Chart(category_df)
    .mark_bar()
    .encode(x=x_alt, y=y_alt, color=color)
    .properties(height=height, width=width)
    .interactive()
)

chart.display()

## Mapping of nutrient distribution

In [5]:
# Keep the item name plus the seven nutrient measurements fed to UMAP below.
selected_cols = [
    "Item",
    "Total Fat",
    "Cholesterol",
    "Sodium",
    "Carbohydrates",
    "Dietary Fiber",
    "Sugars",
    "Protein",
]
nutrient_df = data[selected_cols]
nutrient_df.shape


(260, 8)

In [20]:
# Project the numeric nutrient columns onto two dimensions with UMAP (cosine metric)
# and attach the coordinates to the original nutrient rows.
n_neighbors = 15
min_dist = 0.001
metric = "cosine"
n_components = 2
# UMAP is stochastic: without a fixed seed the embedding changes on every run.
# Pinning it keeps the coordinates and the saved chart stable across re-runs.
random_state = 42

# Numeric features only; "Item" is a label, not an input to the embedding.
umap_features = nutrient_df.drop(["Item"], axis=1)

fit = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    n_components=n_components,
    random_state=random_state,
)
umap_fit = fit.fit_transform(umap_features.to_numpy())

# Reuse nutrient_df's index so the column-wise concat aligns row-for-row even
# if nutrient_df does not carry a default 0..n-1 index.
umap_df = pd.DataFrame(umap_fit, columns=["X", "Y"], index=nutrient_df.index)
umap_df = pd.concat([umap_df, nutrient_df], axis=1)
umap_df.head()

Unnamed: 0,X,Y,Item,Total Fat,Cholesterol,Sodium,Carbohydrates,Dietary Fiber,Sugars,Protein
0,7.827578,1.033717,Egg McMuffin,13.0,260,750,31,4,3,17
1,0.527937,-3.570032,Egg White Delight,8.0,25,770,30,4,3,18
2,1.82361,-3.383734,Sausage McMuffin,23.0,45,780,29,4,2,14
3,7.867712,1.015148,Sausage McMuffin with Egg,28.0,285,860,30,4,2,21
4,1.857081,-3.478949,Sausage McMuffin with Egg Whites,23.0,50,880,30,4,2,21


In [21]:
# Scatter plot of the 2-D UMAP coordinates; hovering a point shows the item and its nutrients.
height, width = 360, 640
title = "Dimension Reduction with UMAP"
x_col, x_label = "X", "Dimension 1"
y_col, y_label = "Y", "Dimension 2"

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:Q" % y_col, title=y_label)
# Every column except the two embedding coordinates goes into the tooltip.
tooltip_cols = [col for col in umap_df.columns if col not in ("X", "Y")]
chart = (
    alt.Chart(umap_df)
    .mark_point()
    .encode(x=x_alt, y=y_alt, tooltip=tooltip_cols)
    .properties(height=height, width=width, title=title)
    .interactive()
)

# Write a standalone HTML copy of the chart, then render it inline.
output_file = "../../img/menu_umap_scatter.html"
chart.save(output_file)
chart.display()

In [22]:
# Standardize each nutrient column (z-score), then project onto two dimensions
# with UMAP using the euclidean metric, and attach the coordinates to the
# original (un-normalized) nutrient rows.
n_neighbors = 15
min_dist = 0.001
metric = "euclidean"
n_components = 2
# UMAP is stochastic: without a fixed seed the embedding changes on every run.
# Pinning it keeps the coordinates and the saved chart stable across re-runs.
random_state = 42

# Numeric features only; "Item" is a label, not an input to the embedding.
umap_features = nutrient_df.drop(["Item"], axis=1)
# Column-wise z-score: (value - column mean) / column sample std.
# NOTE(review): a constant column would have std 0 and produce NaN here.
umap_features = (umap_features - umap_features.mean()) / umap_features.std()

fit = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=metric,
    n_components=n_components,
    random_state=random_state,
)
umap_fit = fit.fit_transform(umap_features.to_numpy())

# Reuse nutrient_df's index so the column-wise concat aligns row-for-row even
# if nutrient_df does not carry a default 0..n-1 index.
umap_df = pd.DataFrame(umap_fit, columns=["X", "Y"], index=nutrient_df.index)
umap_df = pd.concat([umap_df, nutrient_df], axis=1)
umap_df.head()

Unnamed: 0,X,Y,Item,Total Fat,Cholesterol,Sodium,Carbohydrates,Dietary Fiber,Sugars,Protein
0,11.041071,10.777412,Egg McMuffin,13.0,260,750,31,4,3,17
1,7.801731,5.380869,Egg White Delight,8.0,25,770,30,4,3,18
2,7.729157,5.230566,Sausage McMuffin,23.0,45,780,29,4,2,14
3,11.047947,10.739624,Sausage McMuffin with Egg,28.0,285,860,30,4,2,21
4,7.844788,5.287067,Sausage McMuffin with Egg Whites,23.0,50,880,30,4,2,21


In [23]:
# Scatter plot of the UMAP coordinates computed from the standardized nutrients;
# hovering a point shows the item and its nutrient values.
height, width = 360, 640
title = "Normalized Dimension Reduction with UMAP"
x_col, x_label = "X", "Dimension 1"
y_col, y_label = "Y", "Dimension 2"

x_alt = alt.X("%s:Q" % x_col, title=x_label)
y_alt = alt.Y("%s:Q" % y_col, title=y_label)
# Every column except the two embedding coordinates goes into the tooltip.
tooltip_cols = [col for col in umap_df.columns if col not in ("X", "Y")]
chart = (
    alt.Chart(umap_df)
    .mark_point()
    .encode(x=x_alt, y=y_alt, tooltip=tooltip_cols)
    .properties(height=height, width=width, title=title)
    .interactive()
)

# Write a standalone HTML copy of the chart, then render it inline.
output_file = "../../img/norm_menu_euclid_umap.html"
chart.save(output_file)
chart.display()