In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import *
import json
import re
import time

In [2]:
def extract_text_from_url(url, timeout=10):
    """Download a page and return its text, one non-blank line per row.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on an unresponsive host.

    Returns:
        The page text with every line stripped and blank lines removed, or
        ``None`` when the request fails (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Treat HTTP 4xx/5xx as failures
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Drop <script>/<style> bodies here: once the markup is flattened to text
    # there are no tags left for later stages to filter on.
    for tag in soup(["script", "style"]):
        tag.decompose()

    page_text = soup.get_text()

    # Collapse the text to stripped, non-empty lines
    return "\n".join(line.strip() for line in page_text.splitlines() if line.strip())

In [3]:
def extract_json_from_text(text):
    """Extract JSON from a text response, trying progressively looser methods.

    The strategies are tried in order and the first one that succeeds wins:

    1. Parse the whole text as JSON.
    2. Parse the span from the first ``[`` to the last ``]``.
    3. Collect every ``{"review": ...}`` object that parses on its own.

    Args:
        text: The response text. ``bytes`` are decoded as UTF-8; ``None``,
            other non-string values and blank strings yield ``[]``.

    Returns:
        Whatever JSON value strategy 1 or 2 decodes (normally a list), or the
        list of objects found by strategy 3. ``[]`` when nothing parses.
    """
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8", errors="replace")
    # The regex fallbacks below need a real string; bail out early otherwise
    # instead of letting json/re raise TypeError.
    if not isinstance(text, str) or not text.strip():
        return []

    # First try: direct JSON parsing
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Second try: the outermost bracketed span (greedy, so it covers nested arrays)
    array_match = re.search(r'\[.*\]', text, re.DOTALL)
    if array_match:
        try:
            return json.loads(array_match.group())
        except json.JSONDecodeError:
            pass

    # Third try: salvage individual review objects, skipping any that are malformed
    reviews = []
    for match in re.finditer(r'\{\s*"review":[^}]+\}', text, re.DOTALL):
        try:
            reviews.append(json.loads(match.group()))
        except json.JSONDecodeError:
            continue
    return reviews

In [4]:
def get_page_content(url, driver_path="chromedriver.exe", timeout=10):
    """
    Scrapes content from a single page with a Chrome WebDriver session.

    Args:
        url: Address of the page to load.
        driver_path: Location of the chromedriver executable.
        timeout: Seconds to wait for the <body> element to be present.

    Returns:
        The HTML source of the page, or an empty string if anything fails
        after the browser has started (the error is printed, not raised).
    """
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    page_content = ""

    try:
        # Everything that touches the live driver stays inside the try so the
        # finally clause always shuts the browser down, even if this call fails.
        driver.maximize_window()
        driver.get(url)

        # Wait for content to load
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )

        # Get page content
        page_content = driver.page_source
        print("Scraped page content successfully")

    except Exception as e:
        print(f"Error scraping page: {str(e)}")

    finally:
        driver.quit()

    print("Page Content:", page_content[:500])

    return page_content

In [5]:
def query_ollama(prompt, model="llama3.2", timeout=(10, 600)):
    """Send a prompt to the local Ollama server and return the generated text.

    Args:
        prompt: Text sent as the ``prompt`` field of the request.
        model: Name of the model to query.
        timeout: ``requests`` timeout, either a number or a
            ``(connect, read)`` tuple in seconds. The read limit is generous
            because the request is sent with ``"stream": False``.

    Returns:
        The value stored under the ``'response'`` key of the JSON reply.

    Raises:
        ValueError: If the reply is not a JSON object with a ``'response'`` key.
        Exception: Any request or decoding error is printed and re-raised.
    """
    url = "http://localhost:11434/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()

        # Debug: print the response text
        print("Debug - Ollama raw response:", response.text)

        result = response.json()
        # Guard the membership test: a non-dict payload (list, string, null)
        # would otherwise pass or fail it for the wrong reason.
        if not isinstance(result, dict) or 'response' not in result:
            print("Debug - Unexpected Ollama response format:", result)
            raise ValueError("Unexpected response format from Ollama")

        return result['response']

    except Exception as e:
        print(f"Debug - Ollama API error: {str(e)}")
        raise

In [6]:
def _split_long_line(line, max_chunk_size):
    """Break one line into pieces of at most max_chunk_size characters, preferring spaces."""
    pieces = []
    while len(line) > max_chunk_size:
        # Look for the last space that still leaves the piece within the limit
        cut = line.rfind(" ", 0, max_chunk_size + 1)
        if cut <= 0:
            # No usable space: fall back to a hard cut so progress is guaranteed
            cut = max_chunk_size
        pieces.append(line[:cut])
        line = line[cut:].lstrip(" ")
    pieces.append(line)
    return pieces


def safe_split_text(text, max_chunk_size):
    """
    Splits text into chunks of at most max_chunk_size characters.

    Whole lines are packed together (joined by newlines) while they fit. A
    line that is longer than the limit on its own is broken up, at a space
    where possible. The split is purely length-based; it knows nothing about
    the structure of the content.

    Args:
        text: The text to split. None, empty or whitespace-only text gives [].
        max_chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of non-blank chunks in their original order.

    Raises:
        ValueError: If max_chunk_size is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text or not text.strip():
        return []

    chunks = []
    current_chunk = ""
    for line in text.splitlines():
        for piece in _split_long_line(line, max_chunk_size):
            if not current_chunk:
                # Start a chunk without a leading newline
                current_chunk = piece
            elif len(current_chunk) + 1 + len(piece) > max_chunk_size:
                # Never emit blank chunks; they only waste a model call
                if current_chunk.strip():
                    chunks.append(current_chunk)
                current_chunk = piece
            else:
                current_chunk += "\n" + piece
    if current_chunk.strip():
        chunks.append(current_chunk)
    return chunks

In [7]:
def find_pagination_selector(url, timeout=10):
    """
    Asks Ollama to pick the 'Next Page' selector among candidates on the page.

    Candidates are <a>, <button> and <div> elements with a class containing
    'next' or 'pagination' (case-insensitive). Each contributes a class
    selector and, when it has an id, an id selector.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        The selector chosen by the model (surrounding quotes removed), the
        selector typed by the user if the model call fails, or None when the
        page has no candidates or the model returns nothing.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            or the server answers with an HTTP error status.
    """
    page = requests.get(url, timeout=timeout)
    # An error page would only yield misleading candidates
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # Get all elements that might be pagination buttons
    potential_elements = soup.find_all(
        ['a', 'button', 'div'],
        class_=lambda x: x and ('next' in x.lower() or 'pagination' in x.lower()),
    )
    selectors = []
    for element in potential_elements:
        classes = element.get('class')
        if classes:
            selectors.append(f".{'.'.join(classes)}")
        element_id = element.get('id')
        if element_id:
            selectors.append(f"#{element_id}")

    # Repeated markup produces the same selector many times; keep first occurrences only
    selectors = list(dict.fromkeys(selectors))

    if not selectors:
        return None

    prompt = f"""Analyze these potential pagination selectors and identify which one is most likely the 'Next Page' button:
{json.dumps(selectors, indent=2)}



Return only the most likely selector as a string, no other text."""
    try:
        answer = query_ollama(prompt)
        selector = answer.strip().strip('"').strip("'")
        return selector or None
    except Exception:
        print("Debug - Ollama failed to identify the selector. Manual input required.")
        return input("Enter the pagination selector manually (CSS format): ")

In [8]:
def extract_reviews_from_content(page_content, max_chunk_size=4000):
    """
    Asks Ollama to extract reviews from the page content.

    The content is stripped of <script>/<style> elements, reduced to its
    non-blank text lines, split into chunks and sent to the model chunk by
    chunk together with an extraction prompt.

    Args:
        page_content: HTML (or plain text) of the page.
        max_chunk_size: Chunk size passed to safe_split_text.

    Returns:
        A list with the dict objects parsed from the model's replies. Chunks
        whose request fails or whose reply holds no objects are skipped after
        printing a message.
    """
    # Clean the HTML content first
    soup = BeautifulSoup(page_content, "html.parser")

    # Remove script and style elements
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Get text content
    text = soup.get_text()
    if not text.strip():
        print("Debug - Empty text content after cleaning HTML.")
        return []

    # Clean up whitespace but keep the line breaks: safe_split_text groups
    # whole lines into chunks, so the breaks give it boundaries to cut on.
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    print("Debug - Extracted text:", text[:500])  # Print first 500 characters

    # Prepare prompt
    prompt = """Extract all reviews from this webpage content. For each review, identify:
    1. The review text
    2. The author/username
    3. The rating (if present)

    Return the results as a JSON array of objects, each with 'review', 'author', and 'rating' fields. Example format:
    [
        {
            "review": "review text here",
            "author": "username here",
            "rating": "5"
        }
    ]

    Webpage content:
    """
    print("Prepared Prompt for Ollama:", prompt[:500])  # Check the first 500 characters of the prompt

    all_reviews = []
    for chunk in safe_split_text(text, max_chunk_size):
        if not chunk.strip():
            # Nothing to analyse; skip the model call
            continue
        try:
            response = query_ollama(prompt + chunk)
            # Models often wrap the JSON in prose or code fences, so use the
            # tolerant parser instead of a bare json.loads.
            parsed = extract_json_from_text(response)
        except Exception as e:
            print(f"Error processing chunk: {str(e)}")
            continue

        # A single object is one review; extending a list with a dict would
        # add its keys instead.
        if isinstance(parsed, dict):
            parsed = [parsed]
        if not isinstance(parsed, list):
            parsed = []
        chunk_reviews = [item for item in parsed if isinstance(item, dict)]
        if not chunk_reviews:
            print("Debug - No review objects parsed from chunk response.")
            continue
        all_reviews.extend(chunk_reviews)

    return all_reviews

In [9]:
def save_reviews(reviews, filename="reviews.json"):
    """Write the reviews to ``filename`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped. An existing file is overwritten.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(reviews, out_file, ensure_ascii=False, indent=2)

In [10]:
# def main():
#     print("Before running this script, make sure:")
#     print("1. Ollama is installed (https://ollama.ai/download)")
#     print("2. You've pulled the llama3.2 model: Run 'ollama pull llama3.2'")
#     print("3. Ollama server is running: Run 'ollama serve' in a terminal")
#     print("\nPress Enter when ready, or Ctrl+C to exit")
#     input()

#     url = input("Enter the URL to scrape: ")

#     try:
#         # Scrape the page content
#         print("\nScraping page content...")
#         page_content = get_page_content(url)

#         # Extract reviews from the page
#         print("\nExtracting reviews from content...")
#         reviews = extract_reviews_from_content(page_content)

#         # Print results
#         print(f"\nFound {len(reviews)} reviews:")
#         for idx, review in enumerate(reviews, 1):
#             print(f"\nReview {idx}:")
#             print(f"Author: {review['author']}")
#             print(f"Rating: {review.get('rating', 'N/A')} stars")
#             print(f"Review: {review['review']}")
#             print("-" * 50)

#         # Save results
#         save = input("\nWould you like to save the reviews to a file? (y/n): ")
#         if save.lower() == 'y':
#             filename = input("Enter filename (default: reviews.json): ").strip() or "reviews.json"
#             save_reviews(reviews, filename)
#             print(f"Reviews saved to {filename}")

#     except Exception as e:
#         print(f"Error: {str(e)}")
#         print("Please check the URL and try again.")

# if __name__ == "__main__":
#     main()

In [None]:
def main():
    """Interactive entry point: fetch a URL, extract its reviews, optionally save them.

    Prompts for the URL, prints every review found and then offers to write
    the results to a JSON file.
    """
    url = input("Enter the URL to scrape: ").strip()

    # Extract page text from the URL
    page_content = extract_text_from_url(url)
    if not page_content:
        print("Failed to extract page content.")
        return
    print("Page content extracted successfully.")

    # Extract reviews from the page content
    reviews = extract_reviews_from_content(page_content)
    if not reviews:
        print("No reviews found.")
        return

    print(f"\nFound {len(reviews)} reviews:")
    for idx, review in enumerate(reviews, 1):
        # The model's output is not schema-checked, so a field (or the whole
        # object shape) may be missing; fall back instead of raising KeyError.
        fields = review if isinstance(review, dict) else {"review": review}
        print(f"\nReview {idx}:")
        print(f"Author: {fields.get('author', 'N/A')}")
        print(f"Rating: {fields.get('rating', 'N/A')} stars")
        print(f"Review: {fields.get('review', 'N/A')}")
        print("-" * 50)

    # Ask user if they want to save the reviews
    save = input("\nWould you like to save the reviews to a file? (y/n): ")
    if save.strip().lower() == 'y':
        filename = input("Enter filename (default: reviews.json): ").strip() or "reviews.json"
        save_reviews(reviews, filename)
        print(f"Reviews saved to {filename}")

if __name__ == "__main__":
    main()

Page content extracted successfully.
Debug - Extracted text: 27:17 | Relief & Recovery Cream Skip to content 0 Close Back Home Shop Now Our Mission Ambassador Application Wholesale Partnerships Close 0 products in your cart 0 Total: $0.00 Shipping & taxes calculated at checkout View Cart Update cart Checkout          One or more of the items in your cart is a recurring or deferred purchase. By continuing, I agree to the cancellation policy and authorize you to charge my payment method at the prices, frequency and dates listed on this page until my order i
Prepared Prompt for Ollama: Extract all reviews from this webpage content. For each review, identify:
    1. The review text
    2. The author/username
    3. The rating (if present)

    Return the results as a JSON array of objects, each with 'review', 'author', and 'rating' fields. Example format:
    [
        {
            "review": "review text here",
            "author": "username here",
            "rating": "5"
        }
   