# ETL Process for McDonald's Nutrient Data
This is an ETL process that scrapes the McDonald's website and extracts the food nutrient data.


# Extract (E) Step
For this step we are going to use web scraping because the data is located on a website.

In [160]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine
import os

In [103]:
BASE_URL = "https://www.mcdonalds.com"
MENU_URL = "https://www.mcdonalds.com/us/en-us/full-menu.html"

# Download the full-menu page. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops us from parsing an HTTP error page.
response = requests.get(MENU_URL, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")

# Element with id "maincategorycontent"; the category listings are searched
# inside it.
foods_content = soup.find(id='maincategorycontent')
if foods_content is None:
    raise RuntimeError(
        "Element with id 'maincategorycontent' was not found in the menu page"
    )

# get categories of food
categories_divs = foods_content.find_all("div", class_="productListing")

# Removing combos categories
# NOTE(review): the deletions run one after the other, so once index 5 is
# removed the second deletion (index 7) removes the element that was
# originally at index 8. Behaviour is kept as-is; confirm it is intended.
for index in [5, 7]:
    del categories_divs[index]


In [36]:
# Exploratory cell: walk through one category by hand to understand the
# structure of the category and food item markup before writing the helpers.

test_category_div = categories_divs[1] # Second category div (index 1)
# print(test_category_div)

# Getting the category title from its sub-heading <h2>
test_category_title = test_category_div.find("h2", class_="mcd-category-page__sub-heading").text
#print(test_category_title)

# Getting the food list items of the category
test_food_items = test_category_div.find_all('li', class_="mcd-category-page__item")
#print(test_food_items)

test_food_item = test_food_items[20] # Food list item at index 20 (the 21st item)
#print(test_food_item)

# Getting the data-at attribute of the item link; it contains the id of the food
test_data_attr = test_food_item.find('a', class_="mcd-category-page__item-link")['data-at']

test_food_id = test_data_attr.split(":")[-2] # The id is the second-to-last ":"-separated piece
print(test_food_id)

203057


In [104]:
# This block of code is for getting the food ids

def get_category_title(category_div):
    """Return the text of the category's sub-heading ``<h2>``."""
    heading = category_div.find("h2", class_="mcd-category-page__sub-heading")
    return heading.text


def get_category_foods(category_div):
    """Return the ``<li>`` food items found inside ``category_div``.

    Only ``li`` tags carrying the ``mcd-category-page__item`` class are
    searched for.
    """
    return category_div.find_all('li', class_="mcd-category-page__item")


def get_food_id(food_li):
    """Extract the food id from a menu list item.

    The id is read from the ``data-at`` attribute of the item's
    ``mcd-category-page__item-link`` anchor: the attribute is split on ":"
    and the second-to-last piece is returned.

    Returns ``None`` when the anchor is missing, when it has no ``data-at``
    attribute, or when the attribute contains no ":" separator, so callers
    can simply skip such items.
    """
    link = food_li.find('a', class_="mcd-category-page__item-link")
    if link is None:
        return None

    # .get() avoids a KeyError for anchors without the attribute.
    data_attr = link.get('data-at')
    if not data_attr or ":" not in data_attr:
        return None

    # Cleaning the string to get the id of the food
    return data_attr.split(":")[-2]

# Collect the id of every food in every category.
# A dict is used as an insertion-ordered set: it drops repeated ids while
# keeping the order in which they were first seen, so the resulting list is
# the same on every run (a plain set would reorder the ids between runs).
unique_food_ids = {}

for category_div in categories_divs:
    category_foods = get_category_foods(category_div)

    for food_li in category_foods:
        food_id = get_food_id(food_li)
        # get_food_id returns None when the item has no usable id; skip those.
        if food_id is not None:
            unique_food_ids[food_id] = None

food_ids_list = list(unique_food_ids)

In [None]:
# Show the collected food ids.
print(food_ids_list)

## Getting Food Details data
The nutrient data on the details page is generated with JavaScript. We could use a framework like Selenium to execute the JavaScript, but there is an easier possible solution: checking whether the page uses an API to fetch that data. In this case the data is fetched in JSON format from a URL of the form shown below. The `item` parameter is where we put the id of each product to fetch the detail data for that food.

https://www.mcdonalds.com/wws/json/getItemDetails.htm?country=US&language=en&showLiveData=true&item=200301

In [105]:
# Getting all foods json data
def fetch_food_json_data(food_id, timeout=30):
    """Fetch the item-details JSON for one food from the McDonald's API.

    Args:
        food_id: Id of the food, sent as the ``item`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = "https://www.mcdonalds.com/wws/json/getItemDetails.htm"
    # Let requests build and encode the query string.
    params = {
        "country": "US",
        "language": "en",
        "showLiveData": "true",
        "item": food_id,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Do not try to decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Download the details JSON of every collected food id, one request per id.
food_json_list = []

for item_id in food_ids_list:
    food_json_list.append(fetch_food_json_data(item_id))

In [84]:
# Inspect the first nutrient entry of the first fetched item to see its fields.
food_json_list[0]['item']['nutrient_facts']['nutrient'][0]

{'adult_dv': '',
 'child_dv': {},
 'hundred_g_per_product': '86.2',
 'id': 2,
 'name': 'Calories',
 'nutrient_name_id': 'calories',
 'uom': 'Cal.',
 'uom_description': 'Cal.',
 'value': '210',
 'woman_dv': {}}

# Transform (T) Step
Right now we have a lot of data for each food or product, but for this ETL we only need a few fields about nutrients. In this step we are going to extract the necessary data from the JSON and give it a corresponding structure.

The data that we are going to collect are:
* Calories (Cal.)
* Total Fat (g)
* Total Carbohydrates (g)
* Protein (g)
* Saturated Fat (g)
* Dietary Fiber (g)
* Calcium (mg)
* Trans Fat (g)
* Total Sugars (g)
* Iron (mg)
* Cholesterol (mg)
* Vitamin D (mcg)
* Potassium (mg)
* Sodium (mg)
* phosphorus (mg)


In [172]:
def get_nuntrient_value(nutrient_facts_json, nutrient_id):
    """Return the value of one nutrient as a float, or ``None``.

    Args:
        nutrient_facts_json: Mapping whose ``'nutrient'`` entry is a list of
            nutrient dicts, each with ``'nutrient_name_id'`` and ``'value'``.
        nutrient_id: The ``nutrient_name_id`` to look for.

    Returns:
        The value of the first matching nutrient converted to ``float``.
        ``None`` when no nutrient matches or when the matching value cannot
        be converted (for example an empty string).
    """
    for nutrient in nutrient_facts_json['nutrient']:
        if nutrient.get('nutrient_name_id') != nutrient_id:
            continue
        try:
            return float(nutrient.get('value'))  # Converting value to float
        except (TypeError, ValueError):
            # Values arrive as strings; an empty or non-numeric one must not
            # abort the whole transform, so it is reported as missing.
            return None
    return None
        

def structure_food_data(json_data):
    """Flatten one item's JSON into a row of nutrient values.

    Args:
        json_data: The ``'item'`` mapping of a food details response.

    Returns:
        A dict with ``food_name``, ``category`` and one entry per nutrient
        (values come from ``get_nuntrient_value``), or ``None`` when the
        item has no nutrient facts or no default category.
    """
    # A few items offer no nutrient_facts data and at least one offers no
    # category; the key may be empty or missing altogether, so .get() is
    # used and both cases are skipped.
    nutrient_facts = json_data.get('nutrient_facts')
    default_category = json_data.get('default_category')
    if not nutrient_facts or not default_category:
        return None

    # (output column, nutrient_name_id used in the JSON payload)
    nutrient_columns = (
        ("calories_cal", 'calories'),
        ("total_fat_g", 'fat'),
        ("total_carbohydrates_g", 'carbohydrate'),
        ("protein_g", 'protein'),
        ("saturated_fat_g", 'saturated_fat'),
        ("dietary_fiber_g", 'fibre'),
        ("calcium", 'calcium'),
        ("trans_fat", 'trans_fat'),
        ("total_sugars", 'sugars'),
        ("iron", 'iron'),
        ("cholesterol", 'cholesterol'),
        ("vitaminD", 'vitaminD'),
        ("potassium", 'potassium'),
        ("sodium", 'sodium'),
        ("phosphorus", 'phosphorus'),
    )

    data = {
        "food_name": json_data['item_name'],
        "category": default_category['category']['name'],
    }
    # Keys are inserted in table order so the column order stays the same.
    for column, nutrient_id in nutrient_columns:
        data[column] = get_nuntrient_value(nutrient_facts, nutrient_id)

    return data



In [173]:
# Build one structured row per fetched food, skipping foods without usable data.
transformed_jsons_list = []

for food_json in food_json_list:
    item = food_json.get('item')
    # A response without an item payload has nothing to transform.
    if not item:
        continue

    data = structure_food_data(item)
    # structure_food_data returns None for items it cannot structure.
    if data is not None:
        transformed_jsons_list.append(data)

In [None]:
# Display the structured rows produced by the transform step.
transformed_jsons_list

# Load (L) Step
In this step we are going to load the data into a PostgreSQL database hosted on Heroku.

In [157]:
from dotenv import load_dotenv
load_dotenv()

True

In [176]:
# Tabular view of the transformed rows; this is what gets loaded.
df = pd.DataFrame(transformed_jsons_list)

# Connection string of the target database, read from the environment.
DB_URL = os.environ.get('DATABASE_URL')
if not DB_URL:
    # Fail here with a clear message instead of later inside create_engine.
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in the .env file"
    )

# Heroku hands out URLs with the "postgres://" scheme, which SQLAlchemy 1.4+
# no longer accepts; "postgresql://" works on every version.
if DB_URL.startswith("postgres://"):
    DB_URL = "postgresql://" + DB_URL[len("postgres://"):]

In [None]:
# Display the DataFrame that will be loaded into the database.
df

In [179]:
# Open the database engine and write the frame to the "macdonald_nutrients"
# table, replacing the table if it already exists.
engine = create_engine(DB_URL, echo=False)
df.to_sql(name='macdonald_nutrients', con=engine, if_exists="replace")

In [None]:
# Example: read back every row that was loaded.
# Engine.execute() with a raw string was removed in SQLAlchemy 2.0, so the
# query runs through a connection and is wrapped in text(); the connection
# is closed as soon as the rows are fetched.
from sqlalchemy import text

sql = "SELECT * FROM macdonald_nutrients"
with engine.connect() as connection:
    rows = connection.execute(text(sql)).fetchall()
rows

In [188]:
# Example of a data analysis: foods with more than 36 g of total sugars.
# Engine.execute() with a raw string was removed in SQLAlchemy 2.0, so the
# query runs through a connection and is wrapped in text(); the connection
# is closed as soon as the rows are fetched.
from sqlalchemy import text

sql = "SELECT food_name FROM macdonald_nutrients WHERE total_sugars > 36"
with engine.connect() as connection:
    rows = connection.execute(text(sql)).fetchall()
rows

[('Big Breakfast with Hotcakes',),
 ('Vanilla Shake (Small)',),
 ('Caramel Macchiato (Small)',),
 ('Hotcakes and Sausage',),
 ('Hot Caramel Sundae',),
 ('Mango Pineapple Smoothie (Small)',),
 ('Hotcakes',),
 ('Frappe Mocha (Small)',),
 ('McFlurry with Oreo Cookies',),
 ("McFlurry with M&M'S Chocolate Candies",),
 ('Chocolate Shake (Small)',),
 ('Hot Fudge Sundae',),
 ('Mocha Caramel (Small)',),
 ('Strawberry Shake (Small)',),
 ('Strawberry Banana Smoothie (Small)',),
 ('Hot Chocolate (Small)',),
 ('Fanta® Orange (Small)',),
 ('Frappe Caramel (Small)',),
 ('Dr Pepper (Small)',),
 ('Cinnamon Roll',),
 ('Coca-Cola (Small)',)]

The results above are the foods that, on their own, exceed the recommended daily quantity of sugar.
The American Heart Association recommends that people eat no more than 36 grams of sugar per day for most men and 24 grams per day for most women.