In [1]:
# Import libraries
import html
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Step 1: Create a soup object from the home page

In [2]:
# Create a url variable for the home page that links to every restaurant page
url = 'https://pages.git.generalassemb.ly/rldaggie/for-scraping/'
# Use the requests library to get the html from the home page.
# The timeout (in seconds) makes the call raise instead of hanging the notebook
# forever when the server never answers.
res = requests.get(url, timeout=30)

In [3]:
# Check status code is good: raise_for_status() raises on a 4xx/5xx answer so a
# "Run All" stops here instead of scraping an error page further down
res.raise_for_status()
# Display the status code of the home page request
res.status_code

200

### Step 2: Scrape the home page soup for every restaurant

Note: Your best bet is to create a list of dictionaries, one for each restaurant. Each dictionary contains the restaurant's name and path from the `href`. The result of your scrape should look something like this:

```python
restaurants = [
    {'name': 'A&W Restaurants', 'href': 'restaurants/1.html'}, 
    {'name': "Applebee's", 'href': 'restaurants/2.html'},
    ...
]
```

In [4]:
# Parse the bytes of the home page response into a soup object with the lxml parser
soup = bs(markup=res.content, features='lxml')

In [5]:
# Grab every anchor tag that carries an href attribute and skip the first match
# (the 'header' link), leaving only the restaurant links
element_list = soup.find_all('a', href=True)[1:]

# Build one record per restaurant link: its visible text and its relative url
restaurants = [
    {'name': anchor.get_text(), 'href': anchor['href']}
    for anchor in element_list
]

# Display results
restaurants

[{'name': 'A&W Restaurants', 'href': 'restaurants/1.html'},
 {'name': "Applebee's", 'href': 'restaurants/2.html'},
 {'name': "Arby's", 'href': 'restaurants/3.html'},
 {'name': 'Atlanta Bread Company', 'href': 'restaurants/4.html'},
 {'name': "Bojangle's Famous Chicken 'n Biscuits",
  'href': 'restaurants/5.html'},
 {'name': 'Buffalo Wild Wings', 'href': 'restaurants/6.html'},
 {'name': 'Burger King', 'href': 'restaurants/7.html'},
 {'name': "Captain D's", 'href': 'restaurants/8.html'},
 {'name': "Carl's Jr.", 'href': 'restaurants/9.html'},
 {'name': "Charley's Grilled Subs", 'href': 'restaurants/10.html'},
 {'name': 'Chick-fil-A', 'href': 'restaurants/11.html'},
 {'name': "Chili's", 'href': 'restaurants/12.html'},
 {'name': 'Chipotle Mexican Grill', 'href': 'restaurants/13.html'},
 {'name': "Church's", 'href': 'restaurants/14.html'},
 {'name': 'Corner Bakery Cafe', 'href': 'restaurants/15.html'},
 {'name': 'Dairy Queen', 'href': 'restaurants/16.html'},
 {'name': "Denny's", 'href': 'res

### Step 3: Using the `href`, scrape each restaurant's page and create a single list of food dictionaries.

Your list of foods should look something like this:
```python
foods = [
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    {
        'calories': '0',
        'carbs': '0',
        'category': 'Drinks',
        'fat': '0',
        'name': 'A&W® Diet Root Beer',
        'restaurant': 'A&W Restaurants'
    },
    ...
]
```

**Note**: Remove extra white space from each category

In [6]:
# Keys used for one menu row, in the order the cells appear in each restaurant's table.
# The previous version searched every page for these exact strings, which can only
# ever produce these same five labels (or crash when one is missing).
FIELD_NAMES = ('Name', 'Category', 'Calories', 'Fat', 'Carbs')
# Seconds to wait for a restaurant page before raising instead of hanging the notebook
REQUEST_TIMEOUT = 30

# Create empty list to store our desired result: one dictionary per menu item
foods = []
# Remember every restaurant whose page could not be fetched, so nothing is dropped silently
failed_restaurants = []

# Iterate through each restaurant link scraped from the home page
for element in element_list:
    # Assign the name of the restaurant to restaurant_name
    restaurant_name = element.get_text()
    # Resolve the restaurant's relative href against the home page url
    restaurant_url = urljoin(url, element['href'])
    # Use a dedicated variable so the home page response stored in `res` is not overwritten
    restaurant_res = requests.get(restaurant_url, timeout=REQUEST_TIMEOUT)
    # Only a 200 status means the page was delivered; record anything else and move on
    if restaurant_res.status_code != 200:
        failed_restaurants.append((restaurant_name, restaurant_res.status_code))
        continue

    # Create a beautiful soup object for this restaurant's page
    soup_restaurant = bs(restaurant_res.content, 'lxml')
    # Text of every table cell on the page, with surrounding white space removed
    cell_texts = [cell.get_text().strip() for cell in soup_restaurant.find_all('td')]
    # Walk the flat list of cells in groups of len(FIELD_NAMES): each group is one menu
    # item with its nutritional information (assumes every table row has exactly that
    # many cells, as the scraped output suggests)
    for start in range(0, len(cell_texts), len(FIELD_NAMES)):
        food_item = cell_texts[start:start + len(FIELD_NAMES)]
        # Pair each label with its cell value and tag the record with the restaurant name
        foods.append(dict(zip(FIELD_NAMES, food_item), restaurant=restaurant_name))

# Make skipped pages visible instead of silently producing an incomplete data set
if failed_restaurants:
    print('Skipped restaurant pages (name, status code):', failed_restaurants)

    

In [7]:
# Preview the first five scraped food records
foods[0:5]

[{'Name': 'Original Bacon Double Cheeseburger',
  'Category': 'Burgers',
  'Calories': '760',
  'Fat': '45',
  'Carbs': '45',
  'restaurant': 'A&W Restaurants'},
 {'Name': 'Coney (Chili) Dog',
  'Category': 'Entrees',
  'Calories': '340',
  'Fat': '20',
  'Carbs': '26',
  'restaurant': 'A&W Restaurants'},
 {'Name': 'Chili Fries',
  'Category': 'French Fries',
  'Calories': '370',
  'Fat': '15',
  'Carbs': '49',
  'restaurant': 'A&W Restaurants'},
 {'Name': 'Strawberry Milkshake (small)',
  'Category': 'Shakes',
  'Calories': '670',
  'Fat': '29',
  'Carbs': '90',
  'restaurant': 'A&W Restaurants'},
 {'Name': 'A&W® Root Beer Freeze (large)',
  'Category': 'Shakes',
  'Calories': '820',
  'Fat': '18',
  'Carbs': '150',
  'restaurant': 'A&W Restaurants'}]

### Step 4: Create a pandas DataFrame from your list of foods

**Note**: Your DataFrame should have 5,131 rows

In [8]:
# Build the dataframe directly from the list of food records (one row per dictionary)
df = pd.DataFrame(foods)
# Show the resulting dataframe
df

Unnamed: 0,Name,Category,Calories,Fat,Carbs,restaurant
0,Original Bacon Double Cheeseburger,Burgers,760,45,45,A&W Restaurants
1,Coney (Chili) Dog,Entrees,340,20,26,A&W Restaurants
2,Chili Fries,French Fries,370,15,49,A&W Restaurants
3,Strawberry Milkshake (small),Shakes,670,29,90,A&W Restaurants
4,A&W® Root Beer Freeze (large),Shakes,820,18,150,A&W Restaurants
...,...,...,...,...,...,...
5126,Jr. Original Chocolate Frosty™,Shakes,200,5,32,Wendy's
5127,Grilled Chicken Go Wrap,Wraps,260,10,25,Wendy's
5128,Asiago Ranch Chicken Club,Sandwiches,670,32,57,Wendy's
5129,Spicy Chicken Go Wrap,Wraps,330,16,30,Wendy's


In [9]:
# Show every row that exactly repeats an earlier row; checking for duplicates is an important sanity check
df.loc[df.duplicated()]

Unnamed: 0,Name,Category,Calories,Fat,Carbs,restaurant
36,A&W® Diet Root Beer,Drinks,0,0,0,A&W Restaurants
72,A&W® Diet Root Beer,Drinks,0,0,0,A&W Restaurants
84,A&W® Diet Root Beer,Drinks,0,0,0,A&W Restaurants
96,A&W® Diet Root Beer,Drinks,0,0,0,A&W Restaurants
109,A&W® Diet Root Beer,Drinks,0,0,0,A&W Restaurants
...,...,...,...,...,...,...
4983,Beefy 5-Layer Burrito,Burritos,550,22,68,Taco Bell
4988,Express Taco Salad w/ Chips,Salads,580,29,59,Taco Bell
4989,Crispy Potato Soft Taco,Tacos,270,13,31,Taco Bell
5111,Value French Fries,French Fries,220,11,28,Wendy's


Huh, the expected answer contains duplicated rows. It looks like the answer key was never checked for duplicates.

In [10]:
# Keep only the first occurrence of each row and rebind df to the de-duplicated frame
df = df.drop_duplicates()
# Show the dataframe without the repeated rows
df

Unnamed: 0,Name,Category,Calories,Fat,Carbs,restaurant
0,Original Bacon Double Cheeseburger,Burgers,760,45,45,A&W Restaurants
1,Coney (Chili) Dog,Entrees,340,20,26,A&W Restaurants
2,Chili Fries,French Fries,370,15,49,A&W Restaurants
3,Strawberry Milkshake (small),Shakes,670,29,90,A&W Restaurants
4,A&W® Root Beer Freeze (large),Shakes,820,18,150,A&W Restaurants
...,...,...,...,...,...,...
5125,Spicy Chicken Sandwich,Sandwiches,510,20,55,Wendy's
5127,Grilled Chicken Go Wrap,Wraps,260,10,25,Wendy's
5128,Asiago Ranch Chicken Club,Sandwiches,670,32,57,Wendy's
5129,Spicy Chicken Go Wrap,Wraps,330,16,30,Wendy's


I have decided to remove the duplicated rows, so the exported DataFrame has fewer rows than the raw scrape.

### Step 5: Export to csv

**Note:** Don't export the index column from your DataFrame

In [11]:
# Write the dataframe to df.csv, leaving the index column out of the file
df.to_csv(path_or_buf="df.csv", index=False)