In [None]:
import asyncio
import base64
import os
import time

import nest_asyncio
import requests
from bs4 import BeautifulSoup, Comment
from playwright.async_api import async_playwright
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Patch asyncio so that asyncio.run() below can be called even when an event
# loop is already running (as is the case inside a notebook kernel).
nest_asyncio.apply()

# Gemini API key used by extract_reviews_with_selenium(). It is read from the
# GOOGLE_API_KEY environment variable so the real key does not have to live in
# the source; the placeholder is kept as the fallback value.
google_api_key = os.environ.get("GOOGLE_API_KEY", "your_google_api_key_here")  # Replace with your actual API key

async def scrape_website(url, next_button_class, element_to_check):
    """Walk a paginated page with Playwright and collect reviews from each page.

    Args:
        url: Address of the first page to open.
        next_button_class: CSS class name (without the leading dot) of the
            "next page" control. Pagination stops when no element with this
            class is present.
        element_to_check: Selector that is waited for after each click on the
            "next page" control, before the page content is read again.

    Returns:
        A list with one entry per page that yielded reviews, in page order.
        Each entry is whatever ``extract_reviews_with_selenium`` returned for
        that page.
    """
    all_reviews = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            while True:
                await page.wait_for_selector('body')
                page_source = await page.content()

                # Extract reviews from the current page
                reviews = extract_reviews_with_selenium(page_source)
                if not reviews:
                    print("No reviews found on this page.")
                elif isinstance(reviews, str):
                    # The extractor hands back the model's answer as a single
                    # string; extend() would split it into characters.
                    all_reviews.append(reviews)
                else:
                    all_reviews.extend(reviews)

                try:
                    next_button = page.locator(f'.{next_button_class}')
                    if await next_button.count() == 0:
                        print("No more pages.")
                        break

                    # .first avoids a strict-mode error when the class matches
                    # more than one element (e.g. pagers above and below).
                    await next_button.first.click()
                    await page.wait_for_load_state('networkidle')  # Wait until the next page is loaded
                    await page.wait_for_selector(element_to_check)  # Wait for specific element to be visible
                except Exception as e:
                    # Best effort: keep what was collected so far.
                    print(f"An error occurred: {e}")
                    break
        finally:
            # Close the browser even if navigation or extraction raised.
            await browser.close()

    return all_reviews


def extract_reviews_with_selenium(html_content):
    """Render HTML in headless Chrome, strip it down, and ask Gemini for the reviews.

    The HTML is loaded into a Selenium-driven Chrome through a ``data:`` URL,
    the resulting page source is cleaned with BeautifulSoup (script, style,
    meta and link tags and HTML comments are removed), and the cleaned markup
    is sent to the Gemini ``generateContent`` endpoint together with an
    extraction prompt.

    Args:
        html_content: HTML source of the page, as a string.

    Returns:
        The text of the first candidate returned by the API (a ``str``), or an
        empty list when the model answers ``NULL``, the API responds with a
        non-200 status, or any error occurs while rendering or calling the API.
    """
    # Set up Selenium WebDriver with WebDriver Manager
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Base64-encode the payload: raw HTML in a data: URL is cut off at the
        # first '#' and mangled by '%' sequences.
        encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("ascii")
        driver.get("data:text/html;charset=utf-8;base64," + encoded_html)
        time.sleep(2)  # Ensure the content loads
        rendered_html = driver.page_source
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
    finally:
        # Single cleanup point; also frees Chrome before the slow API call.
        driver.quit()

    try:
        # Extract and clean HTML with BeautifulSoup
        soup = BeautifulSoup(rendered_html, 'html.parser')
        for tag in soup(['script', 'style', 'meta', 'link']):
            tag.decompose()

        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        cleaned_html = str(soup)

        # Use the Google API to extract reviews. The key travels in a header
        # rather than the query string so it cannot end up in the exception
        # text printed below.
        response = requests.post(
            url="https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent",
            headers={
                "Content-Type": "application/json",
                "x-goog-api-key": google_api_key,
            },
            json={
                "contents": [
                    {
                        "parts": [
                            {
                                'text': "extract all the reviews in the page and return in json format, if no reviews are found, return NULL " + cleaned_html
                            }
                        ]
                    }
                ]
            },
            timeout=120,  # never hang forever on a stalled connection
        )

        if response.status_code != 200:
            print("Error with Google API:", response.status_code, response.text)
            return []

        data = response.json()
        reviews = data['candidates'][0]['content']['parts'][0]["text"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    # The model may pad its answer with whitespace or a trailing newline.
    answer = reviews.strip()
    if not answer or answer == "NULL":
        return []
    return reviews


async def main():
    """Scrape the configured product page and save the collected reviews.

    Calls ``scrape_website`` for the recovery-cream product page, writes the
    collected entries to ``all_reviews.json`` separated by newlines, and
    prints them.
    """
    all_reviews = await scrape_website(
        url='https://2717recovery.com/products/recovery-cream',
        next_button_class='jdgm-paginate__next-page',
        element_to_check='body'
    )

    # Save all reviews to a file. The encoding is explicit because review text
    # can contain characters the platform default encoding cannot represent.
    # NOTE(review): entries are joined with newlines, so the file as a whole is
    # not guaranteed to be a single valid JSON document despite its name.
    with open("all_reviews.json", "w", encoding="utf-8") as f:
        f.write("\n".join(all_reviews))

    print("Reviews extracted:", all_reviews)


# Script entry point: run the whole scrape-and-save pipeline to completion.
asyncio.run(main())