In [1]:
import time 
from tqdm.notebook import tqdm  
from selenium import webdriver
from selenium.webdriver.common.by import By
import json
import pymongo
from dotenv import load_dotenv
import os 

load_dotenv()

In [8]:
# Start a Firefox session with the disk, memory and offline cache preferences
# (and HTTP cache use) switched off, then open the recipes landing page.
options = webdriver.FirefoxOptions()
for _cache_preference in (
    "browser.cache.disk.enable",
    "browser.cache.memory.enable",
    "browser.cache.offline.enable",
    "network.http.use-cache",
):
    options.set_preference(_cache_preference, False)

browser = webdriver.Firefox(options=options)
browser.get('https://www.monin.com/us/recipes/drinks')

In [15]:
def find_drinks_on_page(browser, page_url, page_number, scroll_pause=0.0):
    """Collect product links from one listing page of a collection.

    Loads ``{page_url}?page={page_number}``, scrolls down twice, then scans
    every ``<li>`` whose ``class`` attribute is exactly the product-card class
    string and gathers the ``href`` of each anchor inside it that contains the
    product URL prefix.

    Args:
        browser: Selenium WebDriver (or compatible object) used for navigation.
        page_url: Base URL of the collection listing.
        page_number: Page index appended as the ``page`` query parameter.
        scroll_pause: Optional number of seconds to sleep after each scroll so
            that lazily loaded cards can appear. Defaults to 0 (no pause).

    Returns:
        list[str]: Matching hrefs in document order. No de-duplication is done.
    """
    card_class = 'grid__item scroll-trigger animate--slide-in'
    product_marker = 'https://monin.us/products/'

    browser.get(f'{page_url}?page={page_number}')
    # implicitly_wait() is not a pause: it only sets the session-wide timeout
    # used by element lookups, so a single call is sufficient.
    browser.implicitly_wait(1)

    for _ in range(2):
        browser.execute_script("window.scrollBy(0, document.body.scrollHeight);")
        if scroll_pause > 0:
            time.sleep(scroll_pause)

    drink_urls = []
    for element in browser.find_elements(By.TAG_NAME, 'li'):
        if element.get_attribute('class') != card_class:
            continue

        for link in element.find_elements(By.TAG_NAME, 'a'):
            href = link.get_attribute('href')
            # Anchors without an href yield None; skip them instead of
            # failing on the substring test.
            if href and product_marker in href:
                drink_urls.append(href)

    return drink_urls

def parse_pages(browser, pages_base_url, pages_number):
    """Gather drink URLs from pages 1..pages_number of one collection.

    Each page is handed to ``find_drinks_on_page``; a page that yields no
    URLs is reported on stdout and the crawl continues with the next page.

    Args:
        browser: Driver object forwarded to ``find_drinks_on_page``.
        pages_base_url: Collection listing URL without the page parameter.
        pages_number: Number of listing pages to visit (1-based, inclusive).

    Returns:
        list: URLs of all visited pages, concatenated in page order.
    """
    collected = []

    for page_number in tqdm(range(1, pages_number + 1)):
        found = find_drinks_on_page(browser, pages_base_url, page_number)

        if not found:
            print('Error occured in:', f'\n\t{pages_base_url}', f'\n\ton page: {page_number}')

        collected.extend(found)

    return collected

In [18]:
# Crawl plan: category label -> list of (collection listing URL, page count).
# Each tuple is unpacked by the crawl loop and passed to parse_pages(); the
# category label is later stored in the 'category' field of every drink record.
# Page counts are hard-coded; presumably they mirror the site's pagination at
# the time of writing -- TODO confirm before re-crawling.
pages = {
    'lemonade': [
        # NOTE(review): this punches collection is also listed under 'punches'
        # below, so its URLs are collected for both categories.
        ('https://monin.us/collections/punches', 2),
        ('https://monin.us/collections/sodas', 9),
        ('https://monin.us/collections/batch-non-alcoholic-drinks', 2), 
        ('https://monin.us/collections/kid-friendly', 5),
    ], 
    'smoothies': [
        ('https://monin.us/collections/smoothies', 4),
        ('https://monin.us/collections/fruit-smoothies', 3)
    ],
     'cocoas_and_steamers': [
        ('https://monin.us/collections/cocoas-steamers', 3),
    ],
    'sugar_free': [
        ('https://monin.us/collections/sugar-free-drinks', 4),
    ],
    'punches': [
        ('https://monin.us/collections/punches', 2),
    ],
    # NOTE(review): 'flawored_waters' looks like a typo for 'flavored_waters';
    # the key is used as the stored category value, so renaming it would make
    # new records differ from any already saved -- confirm before changing.
    'flawored_waters': [
        ('https://monin.us/collections/flavored-waters', 2),
    ],
    'teas': [
        ('https://monin.us/collections/tea-recipes', 7),
    ],
    'bubble_teas': [
        ('https://monin.us/collections/bubble-teas', 1),
    ],
    'sangrias': [
        ('https://monin.us/collections/sangrias', 4),
    ],
    'frappes_and_shakes': [
        ('https://monin.us/collections/frappes-shakes', 3),
    ],
    'cocktails': [
        ('https://monin.us/collections/low-abv-cocktail-recipes', 1),
        ('https://monin.us/collections/holiday-cocktails', 1),  
        ('https://monin.us/collections/beer-cocktails', 2),
        ('https://monin.us/collections/mocktail-recipes', 44),
        ('https://monin.us/collections/cocktails', 48),
        ('https://monin.us/collections/keto', 1),
        ('https://monin.us/collections/batch-cocktails', 1),
        ('https://monin.us/collections/trending-cocktail-recipes', 2)
    ],
    'coffee': [
        ('https://monin.us/collections/coffee-recipes', 24),
    ],
    'snow_cones': [
        ('https://monin.us/collections/snow-cones', 1),
    ]
}

In [None]:
# Crawl every collection of every category and accumulate the product URLs
# found there under the category's label.
drink_urls = {}

for category in tqdm(pages):
    for category_url, pages_number in pages[category]:
        found_urls = parse_pages(browser, category_url, pages_number)
        drink_urls.setdefault(category, []).extend(found_urls)

In [4]:
# with open('drink_urls.json', 'w') as file:
#     json.dump(drink_urls, file)

In [9]:
# Reload the category -> URL list mapping from disk. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open('drink_urls.json', 'r') as file:
    drink_urls = dict(json.load(file))

In [21]:
# Connect to MongoDB with the connection string taken from the environment
# and pick the "drinks" collection of the "monin" database.
client = pymongo.MongoClient(os.environ.get('MONGO_CONNECTION_STRING'))
db = client.monin
collection = db.drinks

In [None]:
# Visit every collected recipe URL that is not stored in MongoDB yet, scrape
# the recipe data from the page and insert it into the collection.
drinks = []

# Only drink_url is needed to know which pages were scraped already, so fetch
# just that field; the filter is an explicit empty document (match everything).
parsed_urls = {
    stored['drink_url']
    for stored in collection.find({}, {'drink_url': 1, '_id': 0})
    if 'drink_url' in stored
}

# implicitly_wait() sets the session-wide element-lookup timeout rather than
# pausing, so one call before the loop is enough.
browser.implicitly_wait(0.5)

for category, urls in drink_urls.items():
    for url in tqdm(urls):
        if url in parsed_urls:
            continue
        try:
            browser.get(url)
            time.sleep(0.3)

            for drink in browser.find_elements(By.CLASS_NAME, 'section-recipe'):
                drink_image_url = drink.find_element(By.TAG_NAME, 'img').get_attribute('src')

                # The second 'grid__item' inside the recipe section is the one
                # the title and the step list are read from.
                drink_data = drink.find_elements(By.CLASS_NAME, 'grid__item')[1]
                drink_title = drink_data.find_element(By.CLASS_NAME, 'product__title').text.strip()
                drink_recipie_steps = [el.text for el in drink_data.find_elements(By.TAG_NAME, 'li')]

                drink_parsed = {
                    'name': drink_title,
                    'recipie': drink_recipie_steps,
                    'category': category,
                    'image_url': drink_image_url,
                    'drink_url': url,
                }

                drinks.append(drink_parsed)
                collection.insert_one(drink_parsed)
                # Remember the URL so that a repeat of it later in this run
                # (e.g. the same collection listed under two categories) is
                # skipped instead of being inserted a second time.
                parsed_urls.add(url)

        except Exception as e:
            # Best effort: report the failing page and carry on with the rest.
            print('error', e)
            print('error', url)