In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.expected_conditions import element_to_be_clickable
import time

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
from PIL import Image
import pandas as pd
import os
import requests

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
# Dari 1 URL akan mengambil semua data makanan.

def get_food_data_by_url(url, data, driver=None):
    response = urlopen(url)

    object = soup(response.read())

    # Reuse WebDriver if provided, otherwise initialize a new instance
    if driver is None:
        driver = webdriver.Chrome()
        manage_driver = True  # Flag to quit driver later
    else:
        manage_driver = False
    
    driver.get(url)

    # Helper function for safe element retrieval
    def get_element_text(by, value, parent=None):
        try:
            element = (parent or driver).find_element(by, value)
            return element.get_attribute('innerHTML').strip()
        except NoSuchElementException:
            return None

    def wait_for_element(by, value, timeout=10):
        try:
            return WebDriverWait(driver, timeout).until(EC.presence_of_element_located((by, value)))
        except TimeoutException:
            return None

    try:
        # Food name
        foodname = get_element_text(By.CLASS_NAME, 'recipe-title-container h1')

        # Serving size
        find_serving = driver.find_element(By.CLASS_NAME, 'recipe-cooking-info').find_elements(By.TAG_NAME, 'li')
        if len(find_serving) < 2:
            if "Serves" in find_serving[0].get_attribute('innerHTML').strip():
                serving = find_serving[0].find_element(By.TAG_NAME, 'span').get_attribute('innerHTML').strip()
            else:
                print("No serving size found")
                return data
        elif len(find_serving) < 3:
            if "Serves" in find_serving[1].get_attribute('innerHTML').strip():
                serving = find_serving[1].find_element(By.TAG_NAME, 'span').get_attribute('innerHTML').strip()
            else:
                print("No serving size found")
                return data    
        else:
            if "Serves" in find_serving[2].get_attribute('innerHTML').strip():
                serving = find_serving[2].find_element(By.TAG_NAME, 'span').get_attribute('innerHTML').strip()
            else:
                print("No serving size found")
                return data    

        # Image URL
        image = driver.find_element(By.CLASS_NAME, 'lead-image-block').find_element(By.TAG_NAME, 'img').get_attribute('src')

        # Ingredients
        try:
            ingredientlist = []

            for li in object.find('div',{'class':'col-sm-4 recipe-section recipe-ingredients-section'}).find('ul').findAll('li'):
                ingredient = li.find('div').get_text().strip()
                ingredientlist.append(ingredient)
        except Exception as e:
            print(f"Error retrieving ingredients: {e}")

        # Steps
        try:
            steps = []

            for li in object.find('div',{'class':'recipe-method-section'}).find('ul').findAll('li'):
                step = li.find('div',{'class':'recipe-method-step-content'}).get_text().strip()
                steps.append(step)
        except Exception as e:
            print(f"Error retrieving steps: {e}")
        


        # Check if there is nutritional button
        button = wait_for_element(By.CLASS_NAME, 'view-nutritional-info')
       
        if button is None:
            print(f"No nutritional button found for {url}")
            return data
        else:
            # Scroll into view and click
            driver.execute_script("arguments[0].scrollIntoView(true);", button)
            try:
                # Wait until button is clickable
                button = driver.find_element(By.CLASS_NAME,'show-details').find_element(By.CLASS_NAME,'view-nutritional-info')
                button.click()

                # clickable_button = WebDriverWait(driver, 10).until(element_to_be_clickable((By.CLASS_NAME, 'view-nutritional-info')))
                # # Use ActionChains for clicking
                # ActionChains(driver).move_to_element(clickable_button).click().perform()
                time.sleep(2)
            except Exception as e:
                print(f"Error clicking nutritional button: {e}")
                return data

        # Wait for nutritional sidebar to load
        wait_for_element(By.CLASS_NAME, 'nutrition-info-container')

        # Nutritional data
        nutrition_data = {}
        for key in ['energy', 'protein', 'fatTotal', 'carbTotal', 'sodium', 'dietaryFibre']:
            try:
                value_element = driver.find_element(By.CLASS_NAME, key).find_element(By.CLASS_NAME, 'value')
                nutrition_data[key] = value_element.get_attribute('innerHTML').strip()
            except NoSuchElementException:
                nutrition_data[key] = None  # Handle missing data gracefully

        # Build food dictionary
        food = {
            'url': url,
            'foodname': foodname,
            'serving': serving,
            'image': image,
            'ingredient': ingredientlist,
            'steps': steps,
            'calories': nutrition_data.get('energy'),
            'protein': nutrition_data.get('protein'),
            'fat': nutrition_data.get('fatTotal'),
            'carb': nutrition_data.get('carbTotal'),
            'sodium': nutrition_data.get('sodium'),
            'fibre': nutrition_data.get('dietaryFibre'),
        }

        data.append(food)

    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")
        return data
    finally:
        # Close the driver only if it was created in this function
        if manage_driver:
            driver.quit()
        return data

    return data


In [None]:
# Dari 1 URL akan crawling untuk mendapatkan link makanan lainnya.
def crawl_url(url, urls, driver=None, depth=1):
    # Reuse WebDriver if provided, otherwise initialize a new instance
    if driver is None:
        options = Options()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-blink-features=AutomationControlled")

        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        driver = webdriver.Chrome(options=options)
        manage_driver = True  # Flag to quit driver later
    else:
        manage_driver = False

    try:
        # Initialize the queue with the root URL
        queue = [url]
        print(f"Starting crawl with root URL: {url}")


        while queue and depth > 0:
            current_url = queue.pop(0)  # Get the next URL from the queue
            print(f"Accessing URL: {current_url}")
            
            try:
                driver.get(current_url)
                wait = WebDriverWait(driver, 10)  # 10-second timeout

                # Locate the recommendations container
                print("Waiting for recommendations container...")
                section = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'recommendations-container')))
                print("Recommendations container located.")

                # Collect all links on the current page
                print("Locating carousel items...")
                links = section.find_element(By.CLASS_NAME, 'flex-carousel') \
                    .find_element(By.CLASS_NAME, 'carousel-container') \
                    .find_element(By.TAG_NAME, 'div') \
                    .find_elements(By.CLASS_NAME, 'carousel-item')

                if not links:
                    print(f"No carousel items found for: {current_url}")
                    continue

                for link in links:
                    try:
                        href = link.find_element(By.CLASS_NAME, 'card-body') \
                            .find_element(By.TAG_NAME, 'a').get_attribute('href')
                        if href and href not in urls:  # Avoid duplicates
                            print(f"Link found: {href}" + "Number of Links Collected: " + str(len(urls))) 
                            urls.append(href)
                            queue.append(href)  # Add new link to the queue
                            
                           

                    except Exception as e:
                        print(f"Error retrieving link from {current_url}: {e}")
                        continue

            except Exception as e:
                print(f"Error accessing {current_url}: {e}")

            
            while len(queue) > 0:
             current_queue = queue.pop(0)
             if depth > 1:
                # Recursively crawl the link
                time.sleep(3)
                crawl_url(current_queue, urls, driver, depth-1)

    except Exception as e:
        print(f"An error occurred during crawling: {e}")

    finally:
        # Close the driver only if it was created in this function
        if manage_driver:
            driver.quit()


In [None]:
# Nomor URL index link makanan dari file csv
url_no =604

In [None]:
# Menyimpan data makanan hasil scraping ke dalam list bernama data
data = []

In [None]:
# Membuat driver khusus untuk meminimalisir deteksi crawling bot
def create_driver():
    options = Options()
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    
    # Inject headers using Chrome DevTools Protocol
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Referer': 'https://www.google.com/',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': headers})
    return driver

In [None]:
# Menjalankan perintah untuk mencari data makanan berdasarkan link makanan yang ada pada csv urltaste.csv
# Membuat driver untuk inisialisasi crawling
driver = create_driver()

dataframe1 = pd.read_csv('E:/Big_Data_Analysis/urltaste.csv')

try:
    # Start processing URLs
    for url in dataframe1['urls'][url_no:]:
        try:
            print(f"Processing URL progress: {url_no+1}/{len(dataframe1['urls'])}")
            data = get_food_data_by_url(url, data, driver)
            time.sleep(2)
            url_no += 1
            
            # Refresh driver after every 40 iterations
            if url_no % 40 == 0:
                print("Refreshing driver after 40 iterations...")
                driver.quit()  # Close the current driver
                driver = create_driver()  # Create a new driver instance

        except Exception as e:
            print(f"An error occurred while processing {url}: {e}")
            break

finally:
    driver.quit()  # Ensure the driver is closed in all cases

Processing URL progress: 605/1095
Error retrieving ingredients: 'NoneType' object has no attribute 'get_text'
Processing URL progress: 606/1095
Processing URL progress: 607/1095
Processing URL progress: 608/1095
No serving size found
Processing URL progress: 609/1095
Processing URL progress: 610/1095
Processing URL progress: 611/1095
No serving size found
Processing URL progress: 612/1095
Processing URL progress: 613/1095
Processing URL progress: 614/1095
Processing URL progress: 615/1095
Processing URL progress: 616/1095
Processing URL progress: 617/1095
No nutritional button found for https://www.taste.com.au/recipes/muffin-pan-satay-chicken-roti-cups-recipe/tpxin1yu#coral_thread
Processing URL progress: 618/1095
Processing URL progress: 619/1095
Processing URL progress: 620/1095
Processing URL progress: 621/1095
Processing URL progress: 622/1095
No serving size found
Processing URL progress: 623/1095
Processing URL progress: 624/1095
Processing URL progress: 625/1095
Processing URL 

In [None]:
# Menyimpan data hasil scraping ke dalam dataframe
data_saved = data
len(data_saved)
dataframe2 = pd.DataFrame(data_saved)
# dataframe final 10
dataframe2.tail(10)

Unnamed: 0,url,foodname,serving,image,ingredient,steps,calories,protein,fat,carb,sodium,fibre
836,https://www.taste.com.au/recipes/one-pot-chick...,One-pot chicken spaghetti al limone,4,https://img.taste.com.au/WVMzwmV2/w720-h480-cf...,"[400g chicken breast fillets, 250g dried spagh...",[Fill a deep saucepan three-quarters of the wa...,3726 kj (891cal),37.4g,56.8g,54.0g,346.9mg,2.1g
837,https://www.taste.com.au/recipes/spinach-chees...,"Spinach, cheese &amp; tomato omelette",4,https://img.taste.com.au/b9svnxdS/w720-h480-cf...,"[2/3 cup cream, 20g butter, 4 small tomatoes, ...",[Preheat grill to high. Whisk together 2 eggs ...,1597 kj (382cal),19.8g,31.4g,5.5g,295.5mg,1.2g
838,https://www.taste.com.au/recipes/wholemeal-wra...,Wholemeal wraps with smoked ocean trout and eg...,4,https://img.taste.com.au/JuWsREQA/w720-h480-cf...,"[4 sheets wholemeal mountain bread, 100g smoke...",[Lay bread on a board and divide smoked ocean ...,1210 kj (289cal),,17g,17g,,
839,https://www.taste.com.au/recipes/roast-lamb-fo...,Roast lamb focaccia with tzatziki,4,https://img.taste.com.au/g4cIUm_Y/w720-h480-cf...,"[280g slices lamb roast, 4 slices focaccia, ha...",[Preheat grill to medium-high. Place focaccia ...,5444 kj (1301cal),76.9g,73.4g,83.9g,3.1g,12.7g
840,https://www.taste.com.au/recipes/pancake-shake...,Pancake shaker impossible zucchini slice,15,https://img.taste.com.au/3fAxuUhS/w720-h480-cf...,"[2 zucchini, coarsely grated, 4 eggs, 330ml (1...",[Preheat oven to 180C/160C fan forced. Grease ...,778 kj (186cal),6.4g,12.9g,11.1g,275.8mg,0.6g
841,https://www.taste.com.au/recipes/thai-beef-sal...,Thai beef salad,4,https://img.taste.com.au/N9zB_cHC/w720-h480-cf...,"[4 (about 200g each) scotch fillet steaks, 1 1...",[Preheat barbecue grill on high. Brush steaks ...,2716 kj (649cal),38.8g,47.8g,16.8g,606.4mg,2.6g
842,https://www.taste.com.au/recipes/carrot-cake-5...,Carrot cake,6,https://img.taste.com.au/3eWvBH1R/w720-h480-cf...,"[3 eggs, 175ml sunflower oil, 180g brown sugar...",[Preheat oven to 180°C. Grease and line the ba...,2974 kj (711cal),7.5g,35.1g,94.9g,430.2mg,3.1g
843,https://www.taste.com.au/recipes/gourmet-veget...,Gourmet vegetarian pizza,6,https://img.taste.com.au/Tck0p2Ae/w720-h480-cf...,"[1/4 cup (70g) basil pesto, 2 x 30cm pizza bas...",[Preheat oven to 230°C. Place two oven trays i...,2035 kj (486cal),17.3g,22.0g,55.3g,1.5g,2.5g
844,https://www.taste.com.au/recipes/macadamia-gol...,Macadamia &amp; golden syrup pies,6,https://img.taste.com.au/lHcHAJQe/w720-h480-cf...,"[100g unsalted butter, melted, 1/2 cup (125ml)...","[To make pastry, place zest, icing sugar, flou...",4597 kj (1099cal),13.6g,74.4g,101.0g,71mg,5.8g
845,https://www.taste.com.au/recipes/grilled-haris...,Grilled harissa zucchini on tabbouleh,4,https://img.taste.com.au/0xCrr0Bv/w720-h480-cf...,"[5 zucchini, halved lengthways, thickly sliced...",[Heat a lightly oiled chargrill or barbecue on...,1021 kj (244cal),9.9g,9.9g,33.8g,221.9mg,7.7g


In [None]:
# Menyimpan data hasil scraping ke dalam file csv
dataframe2.to_csv('tastescrapedata2.csv')