In [39]:
# !pip install requests bs4 selenium time random

In [40]:
import requests
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.action_chains import ActionChains
import time
import random

In [41]:
def fetch_css_selectors(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    css_selectors = []

    def extract_selectors(element):
        tag = element.name
        classes = element.get("class")
        class_selector = f".{'.'.join(classes)}" if classes else None
        id_selector = f"#{element.get('id')}" if element.get("id") else None

        if class_selector:
            css_selectors.append(class_selector)
        if id_selector:
            css_selectors.append(id_selector)
        if tag:
            css_selectors.append(tag)

        for child in element.children:
            if isinstance(child, BeautifulSoup) or isinstance(child, str):
                continue
            extract_selectors(child)

    extract_selectors(soup)
    return list(set(css_selectors))

In [42]:
def query_ollama(prompt):
    """Send a query to local Ollama server"""
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": "llama3.2",
        "prompt": prompt,
        "stream": False
    }
    
    try:
        response = requests.post(url, json=payload)
        response.raise_for_status()
        return response.json()['response']
    except requests.exceptions.RequestException as e:
        raise Exception(f"Error communicating with Ollama: {str(e)}")
    
def extract_json_from_text(text):
    """Extract the first JSON object from text"""
    try:
        # First try to parse the entire text as JSON
        return json.loads(text)
    except json.JSONDecodeError:
        # If that fails, try to find JSON object in the text
        try:
            # Find the first occurrence of a JSON-like structure
            start_idx = text.find('{')
            end_idx = text.rfind('}') + 1
            if start_idx != -1 and end_idx != -1:
                json_str = text[start_idx:end_idx]
                return json.loads(json_str)
        except (json.JSONDecodeError, ValueError):
            # If still no valid JSON, try cleaning the text
            cleaned_text = re.sub(r'```json|```', '', text)  # Remove markdown code blocks
            cleaned_text = cleaned_text.strip()
            try:
                return json.loads(cleaned_text)
            except json.JSONDecodeError:
                raise Exception("Could not extract valid JSON from response")

def get_tag_suggestions(css_selectors):
    # Create a prompt that asks the model to identify relevant selectors
    prompt = f"""Given these CSS selectors from a website:
{json.dumps(css_selectors, indent=2)}

Analyze them and identify the most likely selectors for:
1. Review text container
2. Author name
3. Rating element
4. Next page button

Common patterns to look for:
- Review containers often have classes/IDs with words like 'review', 'comment', 'text', 'content'
- Author elements often contain 'author', 'user', 'name', 'by'
- Rating elements usually have 'rating', 'stars', 'score'
- Next page buttons typically include 'next', 'pagination', 'nav'

Return ONLY a JSON object in this exact format (no other text):
{{
    "review_tag": "selector_for_review_container",
    "author_tag": "selector_for_author_name",
    "rating_tag": "selector_for_rating",
    "next_page_button_tag": "selector_for_next_button"
}}"""

    # Get response from Ollama
    response = query_ollama(prompt)
    
    return extract_json_from_text(response)

In [43]:
def handle_dynamic_popups(driver):
    try:
        potential_popups = driver.find_elements(By.CSS_SELECTOR, "*")
        for element in potential_popups:
            try:
                style = element.get_attribute("style")
                if "z-index" in style and "visibility: visible" in style:
                    close_buttons = element.find_elements(By.CSS_SELECTOR, "button, a, span")
                    for button in close_buttons:
                        if button.is_displayed() and button.is_enabled():
                            button.click()
                            time.sleep(1)
                            return True
                    action = ActionChains(driver)
                    random_x = random.randint(0, driver.execute_script("return window.innerWidth;"))
                    random_y = random.randint(0, driver.execute_script("return window.innerHeight;"))
                    action.move_by_offset(random_x, random_y).click().perform()
                    time.sleep(1)
                    return True
            except StaleElementReferenceException:
                continue
            except Exception:
                continue
    except Exception:
        pass
    return False

In [44]:
def fetch_all_reviews(url, review_tag, author_tag, rating_tag, next_page_button_tag):
    # Set up Selenium WebDriver
    service = Service("chromedriver.exe")  # Update this path
    driver = webdriver.Chrome(service=service)
    driver.maximize_window()  # Maximize window for better visibility

    try:
        # Open the webpage
        driver.get(url)
        wait = WebDriverWait(driver, 10)  # Wait for elements to load

        all_reviews = []
        page_number = 1  # Track the current page number for debugging

        while True:
            try:
                # Wait for the reviews section to load
                wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, review_tag)))

                # Get the current page source and parse it with BeautifulSoup
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Find all review elements using the passed review_tag, author_tag, and rating_tag
                review_elements = soup.select(review_tag)
                author_elements = soup.select(author_tag)
                rating_elements = soup.select(rating_tag)

                # Extract reviews, authors, and ratings
                for review, author, rating in zip(review_elements, author_elements, rating_elements):
                    review_text = review.get_text(strip=True)
                    author_name = author.get_text(strip=True)
                    rating_score = rating.get("data-score", "N/A")  # Extract 'data-score' or use 'N/A' if missing
                    all_reviews.append({
                        "review": review_text,
                        "author": author_name,
                        "rating": rating_score
                    })

                print(f"Page {page_number} scraped successfully with {len(review_elements)} reviews.")
                page_number += 1

                # Scroll the "Next Page" button into view and click it
                try:
                    next_page_button = driver.find_element(By.CSS_SELECTOR, next_page_button_tag)
                    driver.execute_script("arguments[0].scrollIntoView(true);", next_page_button)
                    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, next_page_button_tag))).click()
                    time.sleep(2)  # Small pause to ensure the page loads
                except (NoSuchElementException, TimeoutException):
                    print("No more pages to scrape or 'Next Page' button not found.")
                    break
                except ElementClickInterceptedException as e:
                    print(f"Click intercepted on page {page_number}: {str(e)}")
                    ActionChains(driver).move_to_element_with_offset(next_page_button, 0, 0).click().perform()
            except Exception as e:
                print(f"Error scraping page {page_number}: {str(e)}")
                break

    finally:
        # Ensure the browser is closed properly
        driver.quit()

    return all_reviews

In [45]:
def main(url):
    # Check if Ollama server is running
    try:
        requests.get("http://localhost:11434/api/version")
    except requests.exceptions.ConnectionError:
        print("Error: Cannot connect to Ollama server. Please ensure it's running on localhost:11434")
        return

    # First, fetch all CSS selectors from the page
    print("Fetching CSS selectors from the page...")
    css_selectors = fetch_css_selectors(url)
    print(f"Found {len(css_selectors)} unique CSS selectors")
    
    # Get tag suggestions from Ollama
    print("\nAnalyzing selectors with Ollama...")
    try:
        tag_suggestions = get_tag_suggestions(css_selectors)
        print("\nOllama suggested these selectors:")
        print(json.dumps(tag_suggestions, indent=2))
        
        # Allow user to modify suggestions if needed
        print("\nWould you like to:")
        print("1. Proceed with these selectors")
        print("2. Modify the selectors")
        print("3. Abort")
        choice = input("Enter your choice (1/2/3): ")
        
        if choice == "2":
            print("\nEnter new selectors (press Enter to keep existing one):")
            for key, value in tag_suggestions.items():
                new_value = input(f"{key} [{value}]: ").strip()
                if new_value:
                    tag_suggestions[key] = new_value
        elif choice == "3":
            print("Aborting scraping process.")
            return
        elif choice != "1":
            print("Invalid choice. Aborting.")
            return
        
        # Fetch reviews using the suggested tags
        print("\nStarting review collection...")
        reviews = fetch_all_reviews(
            url,
            tag_suggestions['review_tag'],
            tag_suggestions['author_tag'],
            tag_suggestions['rating_tag'],
            tag_suggestions['next_page_button_tag']
        )
        
        # Print reviews
        print(f"\nCollected {len(reviews)} reviews:")
        for idx, review in enumerate(reviews, 1):
            print(f"\nReview {idx}:")
            print(f"Author: {review['author']}")
            print(f"Rating: {review['rating']} stars")
            print(f"Review: {review['review']}")
            print("-" * 50)
        
        # Offer to save reviews to file
        save = input("\nWould you like to save the reviews to a file? (y/n): ")
        if save.lower() == 'y':
            filename = input("Enter filename (default: reviews.json): ").strip() or "reviews.json"
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(reviews, f, indent=2, ensure_ascii=False)
            print(f"Reviews saved to {filename}")
            
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Please check the selectors and try again.")

if __name__ == "__main__":
    # Instructions for setting up Ollama
    print("Before running this script, make sure:")
    print("1. Ollama is installed (https://ollama.ai/download)")
    print("2. You've pulled the llama3.2 model: Run 'ollama pull llama3.2'")
    print("3. Ollama server is running: Run 'ollama serve' in a terminal")
    print("\nPress Enter when ready, or Ctrl+C to exit")
    input()
    
    url = input("Enter the URL to scrape: ")
    main(url)

Before running this script, make sure:
1. Ollama is installed (https://ollama.ai/download)
2. You've pulled the llama3.2 model: Run 'ollama pull llama3.2'
3. Ollama server is running: Run 'ollama serve' in a terminal

Press Enter when ready, or Ctrl+C to exit
Fetching CSS selectors from the page...
Found 315 unique CSS selectors

Analyzing selectors with Ollama...

Ollama suggested these selectors:
{
  "review_tag": ".jdgm-rev-widg__content",
  "author_tag": ".jdgm-rev__author",
  "rating_tag": ".jdgm-rev__rating",
  "next_page_button_tag": ".pagination"
}

Would you like to:
1. Proceed with these selectors
2. Modify the selectors
3. Abort

Enter new selectors (press Enter to keep existing one):

Starting review collection...
Error scraping page 1: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=131.0.6778.265)
Stacktrace:
	GetHandleVerifier [0x00007FF6CC2680D5+2992373]
	(No symbol) [0x00007FF6CBEFBFD0]
	(No symbol) [