# <u>Data Mining</u>



#### Imports

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import json
import time
import pandas as pd

After checking multiple websites and their recipe formats, we decided to scrape allrecipes.com.

Function that scrapes a single recipe page.<br>
The function receives the URL of a single recipe page and a Selenium Chrome driver.<br>
It returns a list of: recipe, date (list: [year, month]), rating, raters, time (overall time in minutes), categories (list), servings, ingredients (list), instructions (list), calories, fat, carbs, proteins. If the page cannot be loaded or parsed, it returns None.


In [2]:
def _iso_duration_to_minutes(duration):
    """Convert an ISO 8601 duration such as "PT90M" or "PT1H30M" to minutes.

    Returns None when the value is not a recognisable duration. Seconds are
    truncated to whole minutes.
    """
    match = re.fullmatch(
        r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?', duration)
    if match is None or not any(match.groups()):
        return None
    days, hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return days * 1440 + hours * 60 + minutes + seconds // 60


def SingleRecipePageScrape(url, driver):
    """Scrape a single recipe page.

    Most fields are read from the JSON schema embedded in the page's
    ``allrecipes-schema_1-0`` script tag; ingredients and nutrition values
    are read from the HTML itself.

    Args:
        url: Address of the recipe page.
        driver: A selenium web driver. It is owned by the caller and is
            never closed here, so it stays usable for the next page even
            when this page fails.

    Returns:
        A list ``[recipe, date_list, rating, raters, time_cook,
        category_list, servings, ingredients_list, instructions_list,
        calories, fat, carbs, proteins]`` where ``date_list`` is
        ``[year, month]`` and ``time_cook`` is in minutes. A field that
        cannot be extracted is None. Returns None when the page cannot be
        loaded or its schema cannot be parsed.
    """
    # Errors that a missing or unexpectedly shaped field can raise.
    field_errors = (KeyError, IndexError, TypeError, ValueError, AttributeError)

    # Setting up the driver
    try:
        driver.get(url)
        driver.maximize_window()
    except Exception:
        print("Failed to load the page: " + url)
        return None

    driver.implicitly_wait(5)

    # Scraping the data from a script containing scheme with the recipe's info
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        schema_tag = soup.find("script", {"id": "allrecipes-schema_1-0"})
        data_dict = json.loads(schema_tag.text)[0]
    except Exception:
        # Do not close the driver: the caller keeps using it for later pages.
        print("Failed to parse the recipe schema: " + url)
        return None

    # Name
    try:
        recipe = data_dict["name"]
    except field_errors:
        recipe = None

    # Date as [year, month], sliced from the start of the published date
    try:
        date_str = data_dict["datePublished"]
        date_list = [int(date_str[:4]), int(date_str[5:7])]
    except field_errors:
        date_list = None

    # Rating
    try:
        rating = float(data_dict["aggregateRating"]["ratingValue"])
    except field_errors:
        rating = None

    # Raters
    try:
        raters = int(data_dict["aggregateRating"]["ratingCount"])
    except field_errors:
        raters = None

    # Time in minutes
    try:
        time_cook = _iso_duration_to_minutes(data_dict["totalTime"])
    except field_errors:
        time_cook = None

    # Category
    try:
        category_list = data_dict["recipeCategory"]
    except field_errors:
        category_list = None

    # Servings: the yield may be a list (first entry is used) or a bare
    # value; indexing a bare string would only pick up its first character.
    try:
        recipe_yield = data_dict["recipeYield"]
        if isinstance(recipe_yield, (list, tuple)):
            recipe_yield = recipe_yield[0]
        servings = int(recipe_yield)
    except field_errors:
        servings = None

    # Ingredients, using the html file itself
    ingredients_list = [
        span.text
        for span in soup.find_all("span", {"data-ingredient-name": "true"})
    ]

    # Instructions
    try:
        instructions_list = [
            step["text"] for step in data_dict["recipeInstructions"]
        ]
    except field_errors:
        instructions_list = None

    # Nutrition, using the html file itself; all four values or none
    try:
        nutrition_cells = soup.find_all(
            "td",
            {"class": "mntl-nutrition-facts-summary__table-cell type--dog-bold"},
        )
        numbers = [re.sub(r'[^0-9.]', '', cell.text) for cell in nutrition_cells]
        calories, fat, carbs, proteins = (float(n) for n in numbers[:4])
    except field_errors:
        calories = fat = carbs = proteins = None

    return [recipe, date_list, rating, raters, time_cook, category_list,
            servings, ingredients_list, instructions_list,
            calories, fat, carbs, proteins]



Function that scrapes a topic page containing multiple recipes.<br>
The function receives the URL of a topic page and a Selenium Chrome driver.<br>
It returns the list of recipe URLs found on the topic page.

In [3]:
def MultipleRecipesPageScrape(url, driver):
    """Collect the recipe URLs listed on a topic page.

    The links are read from the ``itemListElement`` entries of the JSON
    schema embedded in the page's ``allrecipes-schema_1-0`` script tag.

    Args:
        url: Address of the topic page.
        driver: A selenium web driver. It is owned by the caller and is
            never closed here, so it stays usable for the next topic even
            when this one fails.

    Returns:
        A list of recipe URLs, or None when the page cannot be loaded or
        its schema cannot be parsed.
    """
    # Setting up the driver
    try:
        driver.get(url)
        driver.maximize_window()
    except Exception:
        print("Failed to load the multiple recipes page: " + url)
        return None

    driver.implicitly_wait(5)

    # Scraping the recipes from topic page
    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        schema_tag = soup.find("script", {"id": "allrecipes-schema_1-0"})
        data_list = json.loads(schema_tag.text)[0]["itemListElement"]
    except Exception:
        # Do not close the driver: the caller keeps using it for later topics.
        print("Failed to parse the multiple recipes page: " + url)
        return None

    recipes_links = []
    for item in data_list:
        # Entries without a usable "url" are skipped rather than aborting.
        try:
            recipes_links.append(item["url"])
        except (KeyError, IndexError, TypeError):
            continue
    return recipes_links
    

Function that scrapes the allrecipes website index.<br>
The function receives a Selenium Chrome driver.<br>
It returns the list of all topic links on the index page.

In [4]:
def TopicPageScrape(driver):
    """Collect the topic links from the allrecipes A-Z index page.

    Args:
        driver: A selenium web driver, owned by the caller.

    Returns:
        A list with the ``href`` of every matching topic anchor, or None
        when the index page cannot be loaded.
    """
    # Setting up the driver
    try:
        driver.get("https://www.allrecipes.com/recipes-a-z-6735880")
        driver.maximize_window()
    except Exception:
        print("Failed to load the topics page")
        return None

    # Getting the topics from the index page
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    topic_anchors = soup.find_all(
        "a", {"class": "link-list__link type--dog-bold type--dog-link"})
    # Anchors without an href carry no link to follow, so they are skipped.
    return [anchor["href"] for anchor in topic_anchors if anchor.has_attr("href")]
    



Running our functions.<br>
Scrape the index page for topics and collect them in a list (the output of TopicPageScrape),<br>
then scrape each topic's page and add its list of recipes (the output of MultipleRecipesPageScrape) to a new list.

In [5]:
# Start one headless Chrome session that is shared by every topic page.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Raw string: '\S' is not a valid escape sequence in a normal string literal.
# NOTE(review): this path looks like a folder, not the chromedriver file - confirm.
service = Service(r'C:\SeleniumDrivers')
driver = webdriver.Chrome(service=service, options=options)

recipe_list_of_lists = []
try:
    # TopicPageScrape returns None when the index page fails to load.
    topic_list = TopicPageScrape(driver) or []
    for topic in topic_list:
        topic_recipes = MultipleRecipesPageScrape(topic, driver)
        # None means the topic page could not be scraped; storing it would
        # break the later loop over each topic's recipes.
        if topic_recipes is not None:
            recipe_list_of_lists.append(topic_recipes)
finally:
    # quit() ends the whole session, including the driver process, and runs
    # even when a topic page raises.
    driver.quit()

For each list of recipes in recipe_list_of_lists, run SingleRecipePageScrape on every recipe and add the result to the list of row values,<br>then create a DataFrame from it.

In [6]:
# One row (or None on failure) per scraped recipe page.
data_list = []
for recipe_list in recipe_list_of_lists:
    # A topic that failed to scrape yields None; an empty topic has nothing
    # to fetch. Either way there is no reason to start a browser.
    if not recipe_list:
        continue

    # A fresh headless Chrome session per topic.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # Raw string: '\S' is not a valid escape sequence in a normal string literal.
    service = Service(r'C:\SeleniumDrivers')
    driver = webdriver.Chrome(service=service, options=options)
    try:
        for recipe in recipe_list:
            try:
                data_list.append(SingleRecipePageScrape(recipe, driver))
            except Exception:
                # Best effort: one broken recipe page must not stop the run.
                continue
    finally:
        # quit() ends the whole session, including the driver process, so
        # the per-topic drivers do not pile up.
        driver.quit()


Removing the None rows that we may get when scraping a page fails.

In [7]:
# Build a new list instead of removing while iterating: removing an element
# shifts the remaining ones, so the loop would skip the entry right after
# each removed None and leave consecutive None rows behind.
data_list = [row for row in data_list if row is not None]

In [8]:
# Passing the column names to the constructor also works when data_list is
# empty; assigning 13 names to a frame built from an empty list raises a
# length-mismatch ValueError.
recipe_df = pd.DataFrame(
    data_list,
    columns=["Recipe", "Date", "Rating", "Number of Raters", "Time",
             "Categories", "Servings", "Ingredients", "Instructions",
             "Calories", "Fat", "Carbs", "Proteins"],
)


In [10]:
# Save the scraped recipes to a CSV file in the working directory.
recipe_df.to_csv(path_or_buf='RecipeData.csv')